diff --git a/examples/fibre_2mode.cpp b/examples/fibre_2mode.cpp index 6efa9f7..a55558a 100644 --- a/examples/fibre_2mode.cpp +++ b/examples/fibre_2mode.cpp @@ -61,7 +61,7 @@ int main() double n2 = 2.3e-20; double omega0 = 1.2153e3; bool is_self_steepening = false; - bool is_nonlinear = false; + bool is_nonlinear = true; MultimodeNLSE ode(num_modes, num_time_points, tmin, tmax, beta_mat, n2, omega0, Sk, is_self_steepening, is_nonlinear); diff --git a/scripts/comparison_plot.py b/scripts/comparison_plot.py index fa5b164..bf50e90 100644 --- a/scripts/comparison_plot.py +++ b/scripts/comparison_plot.py @@ -1,10 +1,14 @@ +import os import numpy as np import matplotlib.pyplot as plt from matplotlib import ticker, cm import h5py +current_dir = os.path.dirname(__file__); +plt.rcParams["savefig.directory"] = current_dir; -f_mpa = h5py.File('GRIN_1550_linear_sample_single_gpu_mpa.mat','r') +# f_mpa = h5py.File('GRIN_1550_linear_sample_single_gpu_mpa.mat','r') +f_mpa = h5py.File('GRIN_1550_nofR_noSS_c0_sample_single_gpu_mpa.mat','r') # data = scipy.io.loadmat('GRIN_1550_linear_sample_single_gpu_mpa.mat') print(f_mpa.keys()) field_data = np.array(f_mpa.get('prop_output/fields')); @@ -12,20 +16,25 @@ print(np.shape(complex_field_data)) plt.figure(figsize=(30, 12)) +plt.rcParams.update({'font.size': 16}) zstep_mpa = 15000; #10mm zstep_cpp = 150; for mode in range(0,2): - cpp_data = np.genfromtxt('../build/release/intensity_mode' + str(mode) + '_rk4_tderivorder2_nz3e5_gpu.txt', delimiter=','); + # cpp_data = np.genfromtxt('../build/release/intensity_mode' + str(mode) + '_rk4_tderivorder2_nz3e5_gpu.txt', delimiter=','); + # cpp_data = np.genfromtxt('../build/release/intensity_mode' + str(mode) + '.txt', delimiter=','); + cpp_data = np.genfromtxt('solutions/intensity_mode' + str(mode) + '_kerr_noSS_tderivorder2_nz6e5.txt', delimiter=','); print(cpp_data.shape) abs_u_cpp = np.transpose(cpp_data[zstep_cpp,:]); + intensity_db_cpp = np.maximum(20*np.log10(abs_u_cpp), -50); print(abs_u_cpp.shape) u_mpa = np.transpose(complex_field_data[zstep_mpa,mode,:]); print(u_mpa.shape) abs_u_mpa = np.abs(u_mpa); + intensity_db_mpa = np.maximum(20*np.log10(abs_u_mpa), -50); Nt = u_mpa.shape[0]; tvec = np.linspace(-40.0, 40.0, Nt); @@ -43,7 +52,10 @@ # print('max_diff: ', np.max(np.abs(abs_u_cpp - abs_u_mpa))) plt.subplot(1, 2, mode + 1); - plt.plot(tvec, abs_u_mpa, 'b-', tvec2, abs_u_cpp, 'r-'); + # plt.plot(tvec, abs_u_mpa, 'b-', tvec2, abs_u_cpp, 'r-'); + plt.plot(tvec, intensity_db_mpa, 'b-', tvec2, intensity_db_cpp, 'r-'); plt.legend(['MPA','Finite difference with RK4'], loc='upper left') + plt.xlabel('Time [ps]') + plt.ylabel('Intensity (dB)') plt.show() diff --git a/scripts/plot.py b/scripts/plot.py index 49f7001..b32d316 100644 --- a/scripts/plot.py +++ b/scripts/plot.py @@ -1,7 +1,11 @@ +import os import numpy as np import matplotlib.pyplot as plt from matplotlib import ticker, cm +current_dir = os.path.dirname(__file__); +plt.rcParams["savefig.directory"] = current_dir; + plt.figure(figsize=(30, 12)) for mode in range(0,2): @@ -15,11 +19,12 @@ tmat, zmat = np.meshgrid(tvec, zvec); # log_data = np.log10(np.clip(data, 1e-16, None)); + log_data = np.clip(20*np.log10(data), -50, None); print(np.max(np.abs(data[:]))) plt.subplot(2, 2, mode + 1); - cs = plt.contourf(tmat, zmat, data, 100); #, cmap ="bone") + cs = plt.contourf(tmat, zmat, log_data, 100); #, cmap ="bone") cbar = plt.colorbar(cs) plt.title('Mode ' + str(mode) + ' intensity') diff --git a/src/ode/GPUMultimodeNLSE.hpp b/src/ode/GPUMultimodeNLSE.hpp index a6dc787..6affbc3 100644 --- a/src/ode/GPUMultimodeNLSE.hpp +++ b/src/ode/GPUMultimodeNLSE.hpp @@ -18,12 +18,11 @@ namespace autodiffeq namespace gpu { - template __global__ -void AddDispersionRHS(const int num_modes, const int num_time_pts, - const DeviceArray2D& beta_mat, const double dt, - const DeviceArray1D& sol, DeviceArray1D& rhs) +void AddDispersionStencil5(const int num_modes, const int num_time_pts, + const DeviceArray2D& beta_mat, const double dt, + const DeviceArray1D& sol, DeviceArray1D& rhs) { const int t = blockIdx.x*blockDim.x + threadIdx.x; //time-point index const int p = threadIdx.y; //mode index (blockIdx.y is always 0 since only 1 block is launched in this dimension) @@ -104,9 +103,82 @@ void AddDispersionRHS(const int num_modes, const int num_time_pts, template __global__ -void AddKerrRHS(const int num_modes, const int num_time_pts, const bool use_shared_mem_for_Sk, - const DeviceArray4D& Sk_tensor, const DeviceArray1D& sol, - complex j_n_omega0_invc, DeviceArray1D& rhs) +void AddDispersionStencil7(const int num_modes, const int num_time_pts, + const DeviceArray2D& beta_mat, const double dt, + const DeviceArray1D& sol, DeviceArray1D& rhs) +{ + const int t = blockIdx.x*blockDim.x + threadIdx.x; //time-point index + const int p = threadIdx.y; //mode index (blockIdx.y is always 0 since only 1 block is launched in this dimension) + const int offset = num_modes*t + p; + int soldim = sol.size(); + + const int num_deriv_rows = beta_mat.GetNumRows(); + const int beta_size = num_deriv_rows * num_modes; + constexpr int stencil_size = 7; + + extern __shared__ double array[]; + double* beta = array; // num_deriv_rows x num_modes doubles + T* sol_stencil_shared = (T*) &beta[beta_size]; // (num_threads * stencil_size * num_modes) variables of type T + + if (threadIdx.x < num_deriv_rows) + beta[threadIdx.x*num_modes+p] = beta_mat(threadIdx.x,p); + + int shared_offset = threadIdx.x * stencil_size * num_modes; + T* sol_stencil = sol_stencil_shared + shared_offset; + if (t < num_time_pts) + { + for (int i = 0; i < stencil_size; ++i) + { + int t_offset = t + i - 3; + if (t_offset < 0) + t_offset = -t_offset; //Mirror data at left boundary + if (t_offset >= num_time_pts) + t_offset = 2*num_time_pts - 2 - t_offset; //Mirror data at right boundary + + sol_stencil[num_modes*i + p] = sol[num_modes*t_offset + p]; + } + } + + __syncthreads(); + + if (offset >= soldim) + return; + + constexpr double inv6 = 1.0/6.0; + constexpr double inv24 = 1.0/24.0; + constexpr T imag(0.0, 1.0); + const double inv_12dt = 1.0/(12.0*dt); + const double inv_12dt2 = inv_12dt / dt; + const double inv_8dt3 = 1.0 / (8.0*dt*dt*dt); + const double inv_6dt4 = 1.0 / (6.0*dt*dt*dt*dt); + + T sol_im3 = sol_stencil[ p]; + T sol_im2 = sol_stencil[num_modes + p]; + T sol_im1 = sol_stencil[num_modes*2 + p]; + T sol_i = sol_stencil[num_modes*3 + p]; + T sol_ip1 = sol_stencil[num_modes*4 + p]; + T sol_ip2 = sol_stencil[num_modes*5 + p]; + T sol_ip3 = sol_stencil[num_modes*6 + p]; + + //Calculate solution time-derivatives using stencil data + T sol_tderiv1 = (sol_im2 - 8.0*(sol_im1 - sol_ip1) - sol_ip2) * inv_12dt; + T sol_tderiv2 = (-sol_im2 + 16.0*(sol_im1 + sol_ip1) - 30.0*sol_i - sol_ip2) * inv_12dt2; + T sol_tderiv3 = (sol_im3 - 8.0*(sol_im2 - sol_ip2) + 13.0*(sol_im1 - sol_ip1) - sol_ip3) * inv_8dt3; + T sol_tderiv4 = (-(sol_im3 + sol_ip3) + 12.0*(sol_im2 + sol_ip2) - 39.0*(sol_im1 + sol_ip1) + 56.0*sol_i) * inv_6dt4; + + T rhs_val = imag*(beta[ p] - beta[ 0])*sol_i //(beta0p - beta00) + -(beta[ num_modes+p] - beta[num_modes])*sol_tderiv1 + - imag* beta[2*num_modes+p]*0.5 *sol_tderiv2 + + beta[3*num_modes+p]*inv6 *sol_tderiv3 + + imag* beta[4*num_modes+p]*inv24 *sol_tderiv4; + rhs[offset] = rhs_val; +} + +template +__global__ +void AddKerrNonlinearity(const int num_modes, const int num_time_pts, + const DeviceArray4D& Sk_tensor, const DeviceArray1D& sol, + complex j_n_omega0_invc, DeviceArray1D& rhs) { const int t = blockIdx.x*blockDim.x + threadIdx.x; //time-point index const int p = threadIdx.y; //mode index (blockIdx.y is always 0 since only 1 block is launched in this dimension) @@ -146,19 +218,18 @@ void AddKerrRHS(const int num_modes, const int num_time_pts, const bool use_shar if (use_shared_mem_for_Sk) { const int num_modes_sq = num_modes * num_modes; + const int num_modes_cu = num_modes * num_modes * num_modes; for (int q = 0; q < num_modes; ++q) { T Aq = sol_modes[q]; - int Sk_offset = num_modes_sq * (num_modes*p + q); //linear index for Sk(p,q,r,s) for (int r = 0; r < num_modes; ++r) { T Ar = sol_modes[r]; - Sk_offset += (num_modes*r); for (int s = 0; s < num_modes; ++s) { T As = sol_modes[s]; - Sk_offset += s; - kerr += Sk[Sk_offset]*Aq*Ar*conj(As); + int Sk_ind = num_modes_cu*p + num_modes_sq*q + num_modes*r + s; //linear index for Sk(p,q,r,s) + kerr += Sk[Sk_ind]*Aq*Ar*conj(As); } } } @@ -218,55 +289,60 @@ class GPUMultimodeNLSE : public ODE if (is_nonlinear_) for (int d = 0; d < 4; ++d) assert(Sk.GetDim(d) == num_modes); - - // const int max_Ap_tderiv = beta_mat_.GetNumRows()-1; - // sol_tderiv_.resize(max_Ap_tderiv, num_time_points_); //Stores the time-derivatives (e.g. d/dt, d^2/dt^2 ...) of a particular solution mode - - // if (is_nonlinear_) - // { - // kerr_.resize(num_time_points_); - // if (is_self_steepening_) - // kerr_tderiv_.resize(num_time_points_); - // } } int GetSolutionSize() const { return num_modes_*num_time_points_; } void EvalRHS(const GPUArray1D& sol, int step, double z, GPUArray1D& rhs) { - constexpr int threads_per_block = 256; + constexpr int threads_per_block = 128; dim3 block_dim(threads_per_block, num_modes_, 1); dim3 grid_dim((num_time_points_ + block_dim.x-1) / block_dim.x, 1, 1); //std::cout << "block_dim: " << block_dim.x << ", " << block_dim.y << ", " << block_dim.z << std::endl; //std::cout << "grid_dim: " << grid_dim.x << ", " << grid_dim.y << ", " << grid_dim.z << std::endl; +#if 0 + //Compute dispersion term using second-order time derivatives constexpr int stencil_size = 5; int shared_mem_bytes = beta_mat_.size()*sizeof(double) + (threads_per_block * stencil_size * num_modes_)*sizeof(T); // std::cout << "shared mem bytes1: " << shared_mem_bytes << std::endl; - - gpu::AddDispersionRHS<<>>( + gpu::AddDispersionStencil5<<>>( + num_modes_, num_time_points_, beta_mat_.GetDeviceArray(), dt_, + sol.GetDeviceArray(), rhs.GetDeviceArray()); + cudaCheckLastError(); +#else + //Compute dispersion term using fourth-order time derivatives + constexpr int stencil_size = 7; + int shared_mem_bytes = beta_mat_.size()*sizeof(double) + + (threads_per_block * stencil_size * num_modes_)*sizeof(T); + // std::cout << "shared mem bytes1: " << shared_mem_bytes << std::endl; + gpu::AddDispersionStencil7<<>>( num_modes_, num_time_points_, beta_mat_.GetDeviceArray(), dt_, sol.GetDeviceArray(), rhs.GetDeviceArray()); cudaCheckLastError(); +#endif if (is_nonlinear_) { + const complex j_n_omega0_invc(0.0, n2_*omega0_/c_); + shared_mem_bytes = (threads_per_block * num_modes_)*sizeof(T); int num_bytes_Sk = num_modes_*num_modes_*num_modes_*num_modes_*sizeof(double); - bool use_shared_mem_for_Sk = false; if (shared_mem_bytes + num_bytes_Sk <= 48000) { shared_mem_bytes += num_bytes_Sk; - // use_shared_mem_for_Sk = true; + // std::cout << "shared mem bytes2: " << shared_mem_bytes << std::endl; + gpu::AddKerrNonlinearity<<>>( + num_modes_, num_time_points_, Sk_.GetDeviceArray(), sol.GetDeviceArray(), + j_n_omega0_invc, rhs.GetDeviceArray()); + } + else + { + gpu::AddKerrNonlinearity<<>>( + num_modes_, num_time_points_, Sk_.GetDeviceArray(), sol.GetDeviceArray(), + j_n_omega0_invc, rhs.GetDeviceArray()); } - // std::cout << "shared mem bytes2: " << shared_mem_bytes << std::endl; - - const complex j_n_omega0_invc(0.0, n2_*omega0_/c_); - gpu::AddKerrRHS<<>>( - num_modes_, num_time_points_, use_shared_mem_for_Sk, - Sk_.GetDeviceArray(), sol.GetDeviceArray(), - j_n_omega0_invc, rhs.GetDeviceArray()); } #if 0 @@ -289,29 +365,6 @@ class GPUMultimodeNLSE : public ODE #endif } - // void ComputeKerrNonlinearity(const int p, const Array1D& sol) - // { - // #pragma omp for - // for (int i = 0; i < num_time_points_; ++i) - // { - // T sum = 0.0; - // for (int q = 0; q < num_modes_; ++q) - // { - // const auto& Aq = sol(q*num_time_points_ + i); - // for (int r = 0; r < num_modes_; ++r) - // { - // const auto& Ar = sol(r*num_time_points_ + i); - // for (int s = 0; s < num_modes_; ++s) - // { - // const auto& As = sol(s*num_time_points_ + i); - // sum += Sk_(p,q,r,s)*Aq*Ar*conj(As); - // } - // } - // } - // kerr_(i) = sum; - // } - // } - Array1D GetInitialSolutionGaussian(const Array1D& Et, const Array1D& t_FWHM, const Array1D& t_center) { assert(num_modes_ == (int) Et.size()); @@ -343,9 +396,6 @@ class GPUMultimodeNLSE : public ODE static constexpr double c_ = 2.99792458e-4; //[m/ps] const bool is_self_steepening_ = false; const bool is_nonlinear_ = false; - // Array2D sol_tderiv_; - Array1D kerr_; - // Array1D kerr_tderiv_; }; } \ No newline at end of file