// // bonds_stress_strain.cu // input bonds, apply stress, output strain // (c) MIT CBA Neil Gershenfeld 6/30/20 // reads from stdin, writes to stdout // bonds_source | bonds_stress_strain grid block spring mass dissipation dt nloop fraction step bond // // includes // #include <iostream> #include <string> #include <chrono> using namespace std; // // variables // class vars { public: float3 *hp,*hf; // host position, force float3 *dp,*dv,*df; // device position, velocity, force float *s,*ds; // strain float d,dmax,sd; // particle pitch, max, sort float fractionTop; // fixed top particle fraction float fractionBottom; // fixed bottom particle fraction float step; // strain step size float bond; // bond break multiplier float stress,strain; // stress, strain accumulators int lpp; // links per particle float xmin,xmax,ymin,ymax,zmin,zmax,top,bot; // limits float spring,mass,diss,dt; // integration int nloop,grid,block; // update loop, GPU threads int ms = 10; // maxsort int np,nx,ny,nz; // number of particles, sort int *dsort,*dnsort; // sort int *links,*nlinks,*dlinks,*dnlinks; // links }; vars v; // // sort particles // __global__ void sort_particles(vars v) { int index = blockIdx.x*blockDim.x+threadIdx.x; int nindex = gridDim.x*blockDim.x; int start = (index/float(nindex))*v.np; int stop = ((index+1)/float(nindex))*v.np; for (int i = start; i < stop; ++i) { int ix = (v.nx-1)*(v.dp[i].x-v.xmin)/(v.xmax-v.xmin); if ((ix < 0) || (ix > v.nx-1)) continue; int iy = (v.ny-1)*(v.dp[i].y-v.ymin)/(v.ymax-v.ymin); if ((iy < 0) || (iy > v.ny-1)) continue; int iz = (v.nz-1)*(v.dp[i].z-v.zmin)/(v.zmax-v.zmin); if ((iz < 0) || (iz > v.nz-1)) continue; int isort = v.nz*v.ny*ix+v.nz*iy+iz; v.dsort[v.ms*isort+atomicAdd(&v.dnsort[isort],1)] = i; } } // // test for link // __device__ bool linked(int i,int j,vars v) { for (int l = 0; l < v.dnlinks[i]; ++l) { if (j == v.dlinks[v.lpp*i+l]) return true; } return false; } // // find force // __device__ void find_force(int i,int j,vars v) { float dx = v.dp[i].x-v.dp[j].x; float dy = v.dp[i].y-v.dp[j].y; float dz = v.dp[i].z-v.dp[j].z; if (linked(i,j,v)) { float d = sqrt(dx*dx+dy*dy+dz*dz); float nx = dx/d; float ny = dy/d; float nz = dz/d; float f = -v.spring*(d-v.d)/v.d; float dvx = v.dv[i].x-v.dv[j].x; float dvy = v.dv[i].y-v.dv[j].y; float dvz = v.dv[i].z-v.dv[j].z; f -= v.diss*(dvx*dx+dvy*dy+dvz*dz)/d; atomicAdd(&(v.df[i].x),f*nx); atomicAdd(&(v.df[i].y),f*ny); atomicAdd(&(v.df[i].z),f*nz); atomicAdd(&(v.df[j].x),-f*nx); atomicAdd(&(v.df[j].y),-f*ny); atomicAdd(&(v.df[j].z),-f*nz); } else { float d = sqrt(dx*dx+dy*dy+dz*dz); if (d < v.d) { float nx = dx/d; float ny = dy/d; float nz = dz/d; float f = -v.spring*(d-v.d)/v.d; float dvx = v.dv[i].x-v.dv[j].x; float dvy = v.dv[i].y-v.dv[j].y; float dvz = v.dv[i].z-v.dv[j].z; f -= v.diss*(dvx*dx+dvy*dy+dvz*dz)/d; atomicAdd(&(v.df[i].x),f*nx); atomicAdd(&(v.df[i].y),f*ny); atomicAdd(&(v.df[i].z),f*nz); atomicAdd(&(v.df[j].x),-f*nx); atomicAdd(&(v.df[j].y),-f*ny); atomicAdd(&(v.df[j].z),-f*nz); } } } // // calculate forces // __global__ void calculate_forces(vars v) { int index = blockIdx.x*blockDim.x+threadIdx.x; int nindex = gridDim.x*blockDim.x; int start = (index/float(nindex))*v.np; int stop = ((index+1)/float(nindex))*v.np; for (int i = start; i < stop; ++i) { float x = v.dp[i].x; float y = v.dp[i].y; float z = v.dp[i].z; int ix = (v.nx-1)*(v.dp[i].x-v.xmin)/(v.xmax-v.xmin); int iy = (v.ny-1)*(v.dp[i].y-v.ymin)/(v.ymax-v.ymin); int iz = (v.nz-1)*(v.dp[i].z-v.zmin)/(v.zmax-v.zmin); // // loop over links to find strain // v.ds[i] = 0; int norm = 0; for (int l = 0; l < v.dnlinks[i]; ++l) { int j = v.dlinks[v.lpp*i+l]; if (j == -1) // // ignore deleted link // continue; float dx = x-v.dp[j].x; float dy = y-v.dp[j].y; float dz = z-v.dp[j].z; float d = sqrtf(dx*dx+dy*dy+dz*dz); // // accumulate // v.ds[i] += abs(d-v.d)/v.d; norm += 1; // // check for broken links // if (d > v.dmax) // // yes, mark for deletion // v.dlinks[v.lpp*i+l] = -1; } v.ds[i] = v.ds[i]/norm; // // loop over sort to find forces // for (int dx = -1; dx <= 1; ++dx) { if (((ix+dx) < 0) || ((ix+dx) > (v.nx-1))) continue; for (int dy = -1; dy <= 1; ++dy) { if (((iy+dy) < 0) || ((iy+dy) > (v.ny-1))) continue; for (int dz = -1; dz <= 1; ++dz) { if (((iz+dz) < 0) || ((iz+dz) > (v.nz-1))) continue; int isort = v.nz*v.ny*(ix+dx)+v.nz*(iy+dy)+(iz+dz); for (int s = 0; s < v.dnsort[isort]; ++s) { int j = v.dsort[v.ms*isort+s]; if (i < j) find_force(i,j,v); } } } } } } // // integrate positions // __global__ void integrate_positions(vars v) { int index = blockIdx.x*blockDim.x+threadIdx.x; int nindex = gridDim.x*blockDim.x; int start = (index/float(nindex))*v.np; int stop = ((index+1)/float(nindex))*v.np; float ax,ay,az; for (int i = start; i < stop; ++i) { if (i < v.np*v.fractionTop) v.dp[i].z -= v.step; else if (i > v.np*(1-v.fractionBottom)) v.dp[i].z += v.step; else { ax = v.df[i].x/v.mass; ay = v.df[i].y/v.mass; az = v.df[i].z/v.mass; v.dv[i].x += v.dt*ax; v.dv[i].y += v.dt*ay; v.dv[i].z += v.dt*az; v.dp[i].x += v.dt*v.dv[i].x; v.dp[i].y += v.dt*v.dv[i].y; v.dp[i].z += v.dt*v.dv[i].z; } } } // // CUDA error check // void cudaCheck(string msg) { cudaError err; err = cudaGetLastError(); if (cudaSuccess != err) cerr << msg << ": " << cudaGetErrorString(err) << endl; } // // main // int main(int argc, char** argv) { if (argc != 11) { cerr << "command line: bonds_source | bonds_stress_strain grid block spring mass dissipation dt nloop fractionTop fractionBottom step bond"; return 1; } v.grid = stoi(argv[1]); v.block = stoi(argv[2]); v.spring = stof(argv[3]); v.mass = stof(argv[4]); v.diss = stof(argv[5]); v.dt = stof(argv[6]); v.nloop = stoi(argv[7]); v.fractionTop = stof(argv[8]); v.fractionBottom = stof(argv[9]); v.step = stof(argv[10]); v.bond = stof(argv[11]); // // read and parse inputs // string s; while (1) { getline(cin,s); if (s.compare("xmin:") == 0) { getline(cin,s); v.xmin = stof(s); } else if (s.compare("xmax:") == 0) { getline(cin,s); v.xmax = stof(s); } else if (s.compare("ymin:") == 0) { getline(cin,s); v.ymin = stof(s); } else if (s.compare("ymax:") == 0) { getline(cin,s); v.ymax = stof(s); } else if (s.compare("zmin:") == 0) { getline(cin,s); v.zmin = stof(s); } else if (s.compare("zmax:") == 0) { getline(cin,s); v.zmax = stof(s); } else if (s.compare("pitch:") == 0) { getline(cin,s); v.d = stof(s); v.dmax = v.bond*v.d; // bond breaking v.sd = 1.01*v.d; // sort bins bigger to avoid roundoff at edges float xmid = (v.xmax+v.xmin)/2.0; float ymid = (v.ymax+v.ymin)/2.0; float zmid = (v.zmax+v.zmin)/2.0; v.xmin = xmid+1.25*(v.xmin-xmid); // enlarge sort volume v.xmax = xmid+1.25*(v.xmax-xmid); v.ymin = ymid+1.25*(v.ymin-ymid); v.ymax = ymid+1.25*(v.ymax-ymid); v.zmin = zmid+1.25*(v.zmin-zmid); v.zmax = zmid+1.25*(v.zmax-zmid); v.nx = (v.xmax-v.xmin)/v.sd; v.ny = (v.ymax-v.ymin)/v.sd; v.nz = (v.zmax-v.zmin)/v.sd; cudaMalloc(&v.dsort,v.nz*v.ny*v.nx*v.ms*sizeof(int)); cudaMalloc(&v.dnsort,v.nz*v.ny*v.nx*sizeof(int)); cudaMemset(v.dnsort,0,v.nz*v.ny*v.nx*sizeof(int)); cudaCheck("malloc pitch"); } else if (s.compare("links per particle:") == 0) { getline(cin,s); v.lpp = stoi(s); } else if (s.compare("number:") == 0) { getline(cin,s); v.np = stoi(s); // number of particles cudaMallocHost(&v.hp,v.np*sizeof(float3)); // host arrays cudaMallocHost(&v.hf,v.np*sizeof(float3)); cudaMallocHost(&v.s,v.np*sizeof(float)); v.links = new int[v.lpp*v.np]; v.nlinks = new int[v.np]; cudaMalloc(&v.dp,v.np*sizeof(float3)); // device arrays cudaMalloc(&v.dv,v.np*sizeof(float3)); cudaMalloc(&v.df,v.np*sizeof(float3)); cudaMalloc(&v.ds,v.np*sizeof(float)); cudaMalloc(&v.dlinks,v.lpp*v.np*sizeof(int)); cudaMalloc(&v.dnlinks,v.np*sizeof(int)); cudaMemset(v.dnlinks,0,v.np*sizeof(int)); cudaCheck("malloc number"); } else if (s.compare("particles:") == 0) { for (int i = 0; i < v.np ; ++i) { cin.read((char *)&v.hp[i].x,4); cin.read((char *)&v.hp[i].y,4); cin.read((char *)&v.hp[i].z,4); } } else if (s.compare("nlinks:") == 0) { for (int i = 0; i < v.np ; ++i) cin.read((char *)&v.nlinks[i],4); } else if (s.compare("links:") == 0) { for (int i = 0; i < v.lpp*v.np ; ++i) cin.read((char *)&v.links[i],4); } else if (s.compare("end:") == 0) break; } cerr << "bonds_stress_strain:" << endl; cerr << " input " << v.np << " particles" << endl; // // copy data to GPU // cudaMemcpy(v.dp,v.hp,v.np*sizeof(float3), cudaMemcpyHostToDevice); cudaMemcpy(v.dnlinks,v.nlinks,v.np*sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(v.dlinks,v.links,v.lpp*v.np*sizeof(int), cudaMemcpyHostToDevice); cudaCheck("copy to GPU"); // // start output // cout << "type:" << endl; cout << "bonds" << endl; cout << "format:" << endl; cout << "xyz float32" << endl; cout << "pitch:" << endl; cout << v.d << endl; cout << "links per particle:" << endl; cout << v.lpp << endl; cout << "number:" << endl; cout << v.np << endl; cout << "fractionTop:" << endl; cout << v.fractionTop << endl; cout << "fractionBottom:" << endl; cout << v.fractionBottom << endl; cout << "xmin:" << endl; cout << v.xmin << endl; cout << "xmax:" << endl; cout << v.xmax << endl; cout << "ymin:" << endl; cout << v.ymin << endl; cout << "ymax:" << endl; cout << v.ymax << endl; cout << "zmin:" << endl; cout << v.zmin << endl; cout << "zmax:" << endl; cout << v.zmax << endl; cout << "particles:" << endl; for (int i = 0; i < v.np; ++i) { cout.write((char*)&v.hp[i].y,4); cout.write((char*)&v.hp[i].y,4); cout.write((char*)&v.hp[i].y,4); } cout << endl << "continue:" << endl; // // start main loop // while (1) { // // update loop // cudaDeviceSynchronize(); auto t0 = chrono::high_resolution_clock::now(); cout << "outer loop" << endl; for (int i = 0; i < v.nloop; ++i) { cout << "inner loop" << endl; // // sort // cudaMemset(v.dnsort,0,v.nz*v.ny*v.nx*sizeof(int)); sort_particles<<<v.grid,v.block>>>(v); cudaCheck("sort"); // // forces // cudaMemset(v.df,0,v.np*sizeof(float3)); calculate_forces<<<v.grid,v.block>>>(v); cudaCheck("forces"); // // integrate // integrate_positions<<<v.grid,v.block>>>(v); cudaCheck("integrate"); } cudaDeviceSynchronize(); auto t1 = chrono::high_resolution_clock::now(); float dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count(); fprintf(stderr,"\r mps: %.0f",(v.nloop*v.np)/dt); // // measure timing // t0 = chrono::high_resolution_clock::now(); cudaMemset(v.dnsort,0,v.nz*v.ny*v.nx*sizeof(int)); sort_particles<<<v.grid,v.block>>>(v); cudaCheck("time sort"); cudaDeviceSynchronize(); t1 = chrono::high_resolution_clock::now(); dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count(); cerr << " srt: " << dt; // t0 = chrono::high_resolution_clock::now(); cudaMemset(v.df,0,v.np*sizeof(float3)); calculate_forces<<<v.grid,v.block>>>(v); cudaCheck("time forces"); cudaDeviceSynchronize(); t1 = chrono::high_resolution_clock::now(); dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count(); cerr << " frc: " << dt; // t0 = chrono::high_resolution_clock::now(); integrate_positions<<<v.grid,v.block>>>(v); cudaCheck("time integrate"); cudaDeviceSynchronize(); t1 = chrono::high_resolution_clock::now(); dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count(); cerr << " int: " << dt; // // copy data to CPU // t0 = chrono::high_resolution_clock::now(); cudaMemcpy(v.hp,v.dp,v.np*sizeof(float3),cudaMemcpyDeviceToHost); cudaMemcpy(v.hf,v.df,v.np*sizeof(float3),cudaMemcpyDeviceToHost); cudaMemcpy(v.s,v.ds,v.np*sizeof(int),cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); cudaCheck("copy to CPU"); t1 = chrono::high_resolution_clock::now(); dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count(); cerr << " cpy: " << dt; // // stress // v.stress = 0; for (int i = 0; i < v.np; ++i) { if (i < v.np*v.fractionTop) v.stress -= v.hf[i].z; else if (i > v.np*(1-v.fractionBottom)) v.stress += v.hf[i].z; } //cerr << endl << v.stress << endl; // // output // t0 = chrono::high_resolution_clock::now(); cout << "particles:" << endl; for (int i = 0; i < v.np; ++i) { cout.write((char*)&v.hp[i].x,4); cout.write((char*)&v.hp[i].y,4); cout.write((char*)&v.hp[i].z,4); } cout << endl << "strain:" << endl; for (int i = 0; i < v.np; ++i) cout.write((char*)&v.s[i],4); cout << endl << "continue:" << endl; t1 = chrono::high_resolution_clock::now(); dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count(); fprintf(stderr," out: %.3g ",dt); // cerr << "(us)" << flush; } }