// // particles_stress_strain.cu // input particles, apply stress, output strain // (c) MIT CBA Neil Gershenfeld 7/18/20 // reads from stdin, writes to stdout // particles_source | particles_stress_strain grid block spring mass dissipation dt nloop fraction step // // includes // #include <iostream> #include <string> #include <chrono> using namespace std; // // variables // class vars { public: float *px,*py,*pz; // host particle positions float *dpx,*dpy,*dpz; // device particle positions float *dvx,*dvy,*dvz; // device velocities float *dfx,*dfy,*dfz; // device forces float *s,*ds; // strain float d,dmax,sd; // particle pitch, max, sort float fraction; // fixed top and bottom particle fraction float step; // strain step size float xmin,xmax,ymin,ymax,zmin,zmax,top,bot; // limits float spring,mass,diss,dt; // integration int nloop,grid,block; // update loop, GPU threads int ms = 10; // maxsort int np,nx,ny,nz; // number of particles, sort int *dsort,*dnsort; // sort }; vars v; // // sort particles // __global__ void sort_particles(vars v) { int index = blockIdx.x*blockDim.x+threadIdx.x; int nindex = gridDim.x*blockDim.x; int start = (index/float(nindex))*v.np; int stop = ((index+1)/float(nindex))*v.np; for (int i = start; i < stop; ++i) { int ix = (v.nx-1)*(v.dpx[i]-v.xmin)/(v.xmax-v.xmin); if ((ix < 0) || (ix > v.nx-1)) continue; int iy = (v.ny-1)*(v.dpy[i]-v.ymin)/(v.ymax-v.ymin); if ((iy < 0) || (iy > v.ny-1)) continue; int iz = (v.nz-1)*(v.dpz[i]-v.zmin)/(v.zmax-v.zmin); if ((iz < 0) || (iz > v.nz-1)) continue; int isort = v.nz*v.ny*ix+v.nz*iy+iz; v.dsort[v.ms*isort+atomicAdd(&v.dnsort[isort],1)] = i; } } // // find force // __device__ void find_force(int i,int j,float x,float y,float z,vars v) { float xl = v.dpx[j]; float yl = v.dpy[j]; float zl = v.dpz[j]; float dx = x-xl; float dy = y-yl; float dz = z-zl; float d = sqrt(dx*dx+dy*dy+dz*dz); float nx = dx/d; float ny = dy/d; float nz = dz/d; if (d < v.d) f = -v.spring*(d-v.d)/v.d; else if (d < 2*v.d) f = -0.01*v.spring*(d-v.d)/v.d; else if (d < 3*v.d) f = +0.01*v.spring*(d-3*v.d)/v.d; else if (d < 3*v.d) { float dvx = v.dvx[i]-v.dvx[j]; float dvy = v.dvy[i]-v.dvy[j]; float dvz = v.dvz[i]-v.dvz[j]; f -= v.diss*(dvx*dx+dvy*dy+dvz*dz)/d; } v.dfx[i] += f*nx; v.dfy[i] += f*ny; v.dfz[i] += f*nz; } // // calculate forces // __global__ void calculate_forces(vars v) { int index = blockIdx.x*blockDim.x+threadIdx.x; int nindex = gridDim.x*blockDim.x; int start = (index/float(nindex))*v.np; int stop = ((index+1)/float(nindex))*v.np; for (int i = start; i < stop; ++i) { float x = v.dpx[i]; float y = v.dpy[i]; float z = v.dpz[i]; int ix = (v.nx-1)*(v.dpx[i]-v.xmin)/(v.xmax-v.xmin); int iy = (v.ny-1)*(v.dpy[i]-v.ymin)/(v.ymax-v.ymin); int iz = (v.nz-1)*(v.dpz[i]-v.zmin)/(v.zmax-v.zmin); /* // // loop over links to find strain // v.ds[i] = 0; int norm = 0; for (int l = 0; l < v.dnlinks[i]; ++l) { int j = v.dlinks[v.lpp*i+l]; if (j == -1) // // ignore deleted link // continue; float dx = x-v.dpx[j]; float dy = y-v.dpy[j]; float dz = z-v.dpz[j]; float d = sqrtf(dx*dx+dy*dy+dz*dz); // // accumulate // v.ds[i] += abs(d-v.d)/v.d; norm += 1; // // check for broken links // if (d > v.dmax) // // yes, mark for deletion // v.dlinks[v.lpp*i+l] = -1; } v.ds[i] = v.ds[i]/norm; */ // // loop over sort to find forces // v.dfx[i] = v.dfy[i] = v.dfz[i] = 0; for (int dx = -1; dx <= 1; ++dx) { if (((ix+dx) < 0) || ((ix+dx) > (v.nx-1))) continue; for (int dy = -1; dy <= 1; ++dy) { if (((iy+dy) < 0) || ((iy+dy) > (v.ny-1))) continue; for (int dz = -1; dz <= 1; ++dz) { if (((iz+dz) < 0) || ((iz+dz) > (v.nz-1))) continue; int isort = v.nz*v.ny*(ix+dx)+v.nz*(iy+dy)+(iz+dz); for (int s = 0; s < v.dnsort[isort]; ++s) { int j = v.dsort[v.ms*isort+s]; if (i != j) find_force(i,j,x,y,z,v); } } } } } } // // integrate positions // __global__ void integrate_positions(vars v) { int index = blockIdx.x*blockDim.x+threadIdx.x; int nindex = gridDim.x*blockDim.x; int start = (index/float(nindex))*v.np; int stop = ((index+1)/float(nindex))*v.np; float ax,ay,az; for (int i = start; i < stop; ++i) { if (i < v.np*v.fraction) v.dpz[i] -= v.step; else if (i > v.np*(1-v.fraction)) v.dpz[i] += v.step; else { ax = v.dfx[i]/v.mass; ay = v.dfy[i]/v.mass; az = v.dfz[i]/v.mass; v.dvx[i] += v.dt*ax; v.dvy[i] += v.dt*ay; v.dvz[i] += v.dt*az; v.dpx[i] += v.dt*v.dvx[i]; v.dpy[i] += v.dt*v.dvy[i]; v.dpz[i] += v.dt*v.dvz[i]; } } } // // CUDA error check // void cudaCheck(string msg) { cudaError err; err = cudaGetLastError(); if (cudaSuccess != err) cerr << msg << ": " << cudaGetErrorString(err) << endl; } // // main // int main(int argc, char** argv) { if (argc != 11) { cerr << "command line: particles_source | particles_stress_strain grid block spring mass dissipation dt nloop fraction step"; return 1; } v.grid = stoi(argv[1]); v.block = stoi(argv[2]); v.spring = stof(argv[3]); v.mass = stof(argv[4]); v.diss = stof(argv[5]); v.dt = stof(argv[6]); v.nloop = stoi(argv[7]); v.fraction = stof(argv[8]); v.step = stof(argv[9]); // // read and parse inputs // string s; while (1) { getline(cin,s); if (s.compare("xmin:") == 0) { getline(cin,s); v.xmin = stof(s); } else if (s.compare("xmax:") == 0) { getline(cin,s); v.xmax = stof(s); } else if (s.compare("ymin:") == 0) { getline(cin,s); v.ymin = stof(s); } else if (s.compare("ymax:") == 0) { getline(cin,s); v.ymax = stof(s); } else if (s.compare("zmin:") == 0) { getline(cin,s); v.zmin = stof(s); } else if (s.compare("zmax:") == 0) { getline(cin,s); v.zmax = stof(s); } else if (s.compare("pitch:") == 0) { getline(cin,s); v.d = stof(s); v.sd = 1.1*v.d; // sort bins bigger to avoid roundoff at edges float xmid = (v.xmax+v.xmin)/2.0; float ymid = (v.ymax+v.ymin)/2.0; float zmid = (v.zmax+v.zmin)/2.0; v.xmin = xmid+1.25*(v.xmin-xmid); // enlarge sort volume v.xmax = xmid+1.25*(v.xmax-xmid); v.ymin = ymid+1.25*(v.ymin-ymid); v.ymax = ymid+1.25*(v.ymax-ymid); v.zmin = zmid+1.25*(v.zmin-zmid); v.zmax = zmid+1.25*(v.zmax-zmid); v.nx = (v.xmax-v.xmin)/v.sd; v.ny = (v.ymax-v.ymin)/v.sd; v.nz = (v.zmax-v.zmin)/v.sd; cudaMalloc(&v.dsort,v.nz*v.ny*v.nx*v.ms*sizeof(int)); cudaMalloc(&v.dnsort,v.nz*v.ny*v.nx*sizeof(int)); cudaMemset(v.dnsort,0,v.nz*v.ny*v.nx*sizeof(int)); cudaCheck("malloc pitch"); } else if (s.compare("number:") == 0) { getline(cin,s); v.np = stoi(s); // number of particles v.px = new float[v.np]; // CPU arrays v.py = new float[v.np]; v.pz = new float[v.np]; v.s = new float[v.np]; cudaMalloc(&v.dpx,v.np*sizeof(float)); // GPU arrays cudaMalloc(&v.dpy,v.np*sizeof(float)); cudaMalloc(&v.dpz,v.np*sizeof(float)); cudaMalloc(&v.dvx,v.np*sizeof(float)); cudaMalloc(&v.dvy,v.np*sizeof(float)); cudaMalloc(&v.dvz,v.np*sizeof(float)); cudaMalloc(&v.dfx,v.np*sizeof(float)); cudaMalloc(&v.dfy,v.np*sizeof(float)); cudaMalloc(&v.dfz,v.np*sizeof(float)); cudaMalloc(&v.ds,v.np*sizeof(float)); } else if (s.compare("particles:") == 0) { for (int i = 0; i < v.np ; ++i) { cin.read((char *)&v.px[i],4); cin.read((char *)&v.py[i],4); cin.read((char *)&v.pz[i],4); } } else if (s.compare("end:") == 0) break; } cerr << "particles_stress_strain:" << endl; cerr << " input " << v.np << " particles" << endl; // // copy data to GPU // cudaMemcpy(v.dpx,v.px,v.np*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(v.dpy,v.py,v.np*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(v.dpz,v.pz,v.np*sizeof(float), cudaMemcpyHostToDevice); cudaCheck("copy to GPU"); // // start output // cout << "type:" << endl; cout << "particles" << endl; cout << "format:" << endl; cout << "xyz float32" << endl; cout << "pitch:" << endl; cout << v.d << endl; cout << "number:" << endl; cout << v.np << endl; cout << "fraction:" << endl; cout << v.fraction << endl; cout << "xmin:" << endl; cout << v.xmin << endl; cout << "xmax:" << endl; cout << v.xmax << endl; cout << "ymin:" << endl; cout << v.ymin << endl; cout << "ymax:" << endl; cout << v.ymax << endl; cout << "zmin:" << endl; cout << v.zmin << endl; cout << "zmax:" << endl; cout << v.zmax << endl; cout << "particles:" << endl; for (int i = 0; i < v.np; ++i) { cout.write((char*)&v.px[i],4); cout.write((char*)&v.py[i],4); cout.write((char*)&v.pz[i],4); } cout << endl << "continue:" << endl; // // start main loop // while (1) { // // update loop // cudaDeviceSynchronize(); auto t0 = chrono::high_resolution_clock::now(); cout << "outer loop" << endl; for (int i = 0; i < v.nloop; ++i) { cout << "inner loop" << endl; // // sort // cudaMemset(v.dnsort,0,v.nz*v.ny*v.nx*sizeof(int)); sort_particles<<<v.grid,v.block>>>(v); cudaCheck("sort"); // // forces // calculate_forces<<<v.grid,v.block>>>(v); cudaCheck("forces"); // // integrate // integrate_positions<<<v.grid,v.block>>>(v); cudaCheck("integrate"); } cudaDeviceSynchronize(); auto t1 = chrono::high_resolution_clock::now(); float dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count(); fprintf(stderr,"\r mps: %.0f",(v.nloop*v.np)/dt); // // measure timing // t0 = chrono::high_resolution_clock::now(); cudaMemset(v.dnsort,0,v.nz*v.ny*v.nx*sizeof(int)); sort_particles<<<v.grid,v.block>>>(v); cudaCheck("time sort"); cudaDeviceSynchronize(); t1 = chrono::high_resolution_clock::now(); dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count(); cerr << " srt: " << dt; // t0 = chrono::high_resolution_clock::now(); calculate_forces<<<v.grid,v.block>>>(v); cudaCheck("time forces"); cudaDeviceSynchronize(); t1 = chrono::high_resolution_clock::now(); dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count(); cerr << " frc: " << dt; // t0 = chrono::high_resolution_clock::now(); integrate_positions<<<v.grid,v.block>>>(v); cudaCheck("time integrate"); cudaDeviceSynchronize(); t1 = chrono::high_resolution_clock::now(); dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count(); cerr << " int: " << dt; // // copy data to CPU // t0 = chrono::high_resolution_clock::now(); cudaMemcpy(v.px,v.dpx,v.np*sizeof(int),cudaMemcpyDeviceToHost); cudaMemcpy(v.py,v.dpy,v.np*sizeof(int),cudaMemcpyDeviceToHost); cudaMemcpy(v.pz,v.dpz,v.np*sizeof(int),cudaMemcpyDeviceToHost); cudaMemcpy(v.s,v.ds,v.np*sizeof(int),cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); cudaCheck("copy to CPU"); t1 = chrono::high_resolution_clock::now(); dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count(); cerr << " cpy: " << dt; // // output // t0 = chrono::high_resolution_clock::now(); cout << "particles:" << endl; for (int i = 0; i < v.np; ++i) { cout.write((char*)&v.px[i],4); cout.write((char*)&v.py[i],4); cout.write((char*)&v.pz[i],4); } cout << endl << "strain:" << endl; for (int i = 0; i < v.np; ++i) cout.write((char*)&v.s[i],4); cout << endl << "continue:" << endl; t1 = chrono::high_resolution_clock::now(); dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count(); fprintf(stderr," dpy: %.3g ",dt); // cerr << "(us)" << flush; } }