Skip to content

Commit

Permalink
Implemented dispersion term based on fourth-order time derivatives on…
Browse files Browse the repository at this point in the history
… GPU
  • Loading branch information
savithru-j committed Nov 5, 2023
1 parent 079f05c commit 7944a70
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 64 deletions.
2 changes: 1 addition & 1 deletion examples/fibre_2mode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ int main()
double n2 = 2.3e-20;
double omega0 = 1.2153e3;
bool is_self_steepening = false;
bool is_nonlinear = false;
bool is_nonlinear = true;

MultimodeNLSE<Complex> ode(num_modes, num_time_points, tmin, tmax, beta_mat,
n2, omega0, Sk, is_self_steepening, is_nonlinear);
Expand Down
18 changes: 15 additions & 3 deletions scripts/comparison_plot.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,40 @@
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker, cm
import h5py

current_dir = os.path.dirname(__file__);
plt.rcParams["savefig.directory"] = current_dir;

f_mpa = h5py.File('GRIN_1550_linear_sample_single_gpu_mpa.mat','r')
# f_mpa = h5py.File('GRIN_1550_linear_sample_single_gpu_mpa.mat','r')
f_mpa = h5py.File('GRIN_1550_nofR_noSS_c0_sample_single_gpu_mpa.mat','r')
# data = scipy.io.loadmat('GRIN_1550_linear_sample_single_gpu_mpa.mat')
print(f_mpa.keys())
field_data = np.array(f_mpa.get('prop_output/fields'));
complex_field_data = field_data['real'] + 1j*field_data['imag']
print(np.shape(complex_field_data))

plt.figure(figsize=(30, 12))
plt.rcParams.update({'font.size': 16})

zstep_mpa = 15000; #10mm
zstep_cpp = 150;

for mode in range(0,2):
cpp_data = np.genfromtxt('../build/release/intensity_mode' + str(mode) + '_rk4_tderivorder2_nz3e5_gpu.txt', delimiter=',');
# cpp_data = np.genfromtxt('../build/release/intensity_mode' + str(mode) + '_rk4_tderivorder2_nz3e5_gpu.txt', delimiter=',');
# cpp_data = np.genfromtxt('../build/release/intensity_mode' + str(mode) + '.txt', delimiter=',');
cpp_data = np.genfromtxt('solutions/intensity_mode' + str(mode) + '_kerr_noSS_tderivorder2_nz6e5.txt', delimiter=',');
print(cpp_data.shape)
abs_u_cpp = np.transpose(cpp_data[zstep_cpp,:]);
intensity_db_cpp = np.maximum(20*np.log10(abs_u_cpp), -50);
print(abs_u_cpp.shape)

u_mpa = np.transpose(complex_field_data[zstep_mpa,mode,:]);
print(u_mpa.shape)

abs_u_mpa = np.abs(u_mpa);
intensity_db_mpa = np.maximum(20*np.log10(abs_u_mpa), -50);
Nt = u_mpa.shape[0];

tvec = np.linspace(-40.0, 40.0, Nt);
Expand All @@ -43,7 +52,10 @@
# print('max_diff: ', np.max(np.abs(abs_u_cpp - abs_u_mpa)))

plt.subplot(1, 2, mode + 1);
plt.plot(tvec, abs_u_mpa, 'b-', tvec2, abs_u_cpp, 'r-');
# plt.plot(tvec, abs_u_mpa, 'b-', tvec2, abs_u_cpp, 'r-');
plt.plot(tvec, intensity_db_mpa, 'b-', tvec2, intensity_db_cpp, 'r-');
plt.legend(['MPA','Finite difference with RK4'], loc='upper left')
plt.xlabel('Time [ps]')
plt.ylabel('Intensity (dB)')

plt.show()
7 changes: 6 additions & 1 deletion scripts/plot.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker, cm

current_dir = os.path.dirname(__file__);
plt.rcParams["savefig.directory"] = current_dir;

plt.figure(figsize=(30, 12))

for mode in range(0,2):
Expand All @@ -15,11 +19,12 @@
tmat, zmat = np.meshgrid(tvec, zvec);

# log_data = np.log10(np.clip(data, 1e-16, None));
log_data = np.clip(20*np.log10(data), -50, None);

print(np.max(np.abs(data[:])))

plt.subplot(2, 2, mode + 1);
cs = plt.contourf(tmat, zmat, data, 100); #, cmap ="bone")
cs = plt.contourf(tmat, zmat, log_data, 100); #, cmap ="bone")
cbar = plt.colorbar(cs)
plt.title('Mode ' + str(mode) + ' intensity')

Expand Down
168 changes: 109 additions & 59 deletions src/ode/GPUMultimodeNLSE.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,11 @@ namespace autodiffeq
namespace gpu
{


template<typename T>
__global__
void AddDispersionRHS(const int num_modes, const int num_time_pts,
const DeviceArray2D<double>& beta_mat, const double dt,
const DeviceArray1D<T>& sol, DeviceArray1D<T>& rhs)
void AddDispersionStencil5(const int num_modes, const int num_time_pts,
const DeviceArray2D<double>& beta_mat, const double dt,
const DeviceArray1D<T>& sol, DeviceArray1D<T>& rhs)
{
const int t = blockIdx.x*blockDim.x + threadIdx.x; //time-point index
const int p = threadIdx.y; //mode index (blockIdx.y is always 0 since only 1 block is launched in this dimension)
Expand Down Expand Up @@ -104,9 +103,82 @@ void AddDispersionRHS(const int num_modes, const int num_time_pts,

template<typename T>
__global__
void AddKerrRHS(const int num_modes, const int num_time_pts, const bool use_shared_mem_for_Sk,
const DeviceArray4D<double>& Sk_tensor, const DeviceArray1D<T>& sol,
complex<double> j_n_omega0_invc, DeviceArray1D<T>& rhs)
void AddDispersionStencil7(const int num_modes, const int num_time_pts,
const DeviceArray2D<double>& beta_mat, const double dt,
const DeviceArray1D<T>& sol, DeviceArray1D<T>& rhs)
{
const int t = blockIdx.x*blockDim.x + threadIdx.x; //time-point index
const int p = threadIdx.y; //mode index (blockIdx.y is always 0 since only 1 block is launched in this dimension)
const int offset = num_modes*t + p;
int soldim = sol.size();

const int num_deriv_rows = beta_mat.GetNumRows();
const int beta_size = num_deriv_rows * num_modes;
constexpr int stencil_size = 7;

extern __shared__ double array[];
double* beta = array; // num_deriv_rows x num_modes doubles
T* sol_stencil_shared = (T*) &beta[beta_size]; // (num_threads * stencil_size * num_modes) variables of type T

if (threadIdx.x < num_deriv_rows)
beta[threadIdx.x*num_modes+p] = beta_mat(threadIdx.x,p);

int shared_offset = threadIdx.x * stencil_size * num_modes;
T* sol_stencil = sol_stencil_shared + shared_offset;
if (t < num_time_pts)
{
for (int i = 0; i < stencil_size; ++i)
{
int t_offset = t + i - 3;
if (t_offset < 0)
t_offset = -t_offset; //Mirror data at left boundary
if (t_offset >= num_time_pts)
t_offset = 2*num_time_pts - 2 - t_offset; //Mirror data at right boundary

sol_stencil[num_modes*i + p] = sol[num_modes*t_offset + p];
}
}

__syncthreads();

if (offset >= soldim)
return;

constexpr double inv6 = 1.0/6.0;
constexpr double inv24 = 1.0/24.0;
constexpr T imag(0.0, 1.0);
const double inv_12dt = 1.0/(12.0*dt);
const double inv_12dt2 = inv_12dt / dt;
const double inv_8dt3 = 1.0 / (8.0*dt*dt*dt);
const double inv_6dt4 = 1.0 / (6.0*dt*dt*dt*dt);

T sol_im3 = sol_stencil[ p];
T sol_im2 = sol_stencil[num_modes + p];
T sol_im1 = sol_stencil[num_modes*2 + p];
T sol_i = sol_stencil[num_modes*3 + p];
T sol_ip1 = sol_stencil[num_modes*4 + p];
T sol_ip2 = sol_stencil[num_modes*5 + p];
T sol_ip3 = sol_stencil[num_modes*6 + p];

//Calculate solution time-derivatives using stencil data
T sol_tderiv1 = (sol_im2 - 8.0*(sol_im1 - sol_ip1) - sol_ip2) * inv_12dt;
T sol_tderiv2 = (-sol_im2 + 16.0*(sol_im1 + sol_ip1) - 30.0*sol_i - sol_ip2) * inv_12dt2;
T sol_tderiv3 = (sol_im3 - 8.0*(sol_im2 - sol_ip2) + 13.0*(sol_im1 - sol_ip1) - sol_ip3) * inv_8dt3;
T sol_tderiv4 = (-(sol_im3 + sol_ip3) + 12.0*(sol_im2 + sol_ip2) - 39.0*(sol_im1 + sol_ip1) + 56.0*sol_i) * inv_6dt4;

T rhs_val = imag*(beta[ p] - beta[ 0])*sol_i //(beta0p - beta00)
-(beta[ num_modes+p] - beta[num_modes])*sol_tderiv1
- imag* beta[2*num_modes+p]*0.5 *sol_tderiv2
+ beta[3*num_modes+p]*inv6 *sol_tderiv3
+ imag* beta[4*num_modes+p]*inv24 *sol_tderiv4;
rhs[offset] = rhs_val;
}

template<typename T, bool use_shared_mem_for_Sk = true>
__global__
void AddKerrNonlinearity(const int num_modes, const int num_time_pts,
const DeviceArray4D<double>& Sk_tensor, const DeviceArray1D<T>& sol,
complex<double> j_n_omega0_invc, DeviceArray1D<T>& rhs)
{
const int t = blockIdx.x*blockDim.x + threadIdx.x; //time-point index
const int p = threadIdx.y; //mode index (blockIdx.y is always 0 since only 1 block is launched in this dimension)
Expand Down Expand Up @@ -146,19 +218,18 @@ void AddKerrRHS(const int num_modes, const int num_time_pts, const bool use_shar
if (use_shared_mem_for_Sk)
{
const int num_modes_sq = num_modes * num_modes;
const int num_modes_cu = num_modes * num_modes * num_modes;
for (int q = 0; q < num_modes; ++q)
{
T Aq = sol_modes[q];
int Sk_offset = num_modes_sq * (num_modes*p + q); //linear index for Sk(p,q,r,s)
for (int r = 0; r < num_modes; ++r)
{
T Ar = sol_modes[r];
Sk_offset += (num_modes*r);
for (int s = 0; s < num_modes; ++s)
{
T As = sol_modes[s];
Sk_offset += s;
kerr += Sk[Sk_offset]*Aq*Ar*conj(As);
int Sk_ind = num_modes_cu*p + num_modes_sq*q + num_modes*r + s; //linear index for Sk(p,q,r,s)
kerr += Sk[Sk_ind]*Aq*Ar*conj(As);
}
}
}
Expand Down Expand Up @@ -218,55 +289,60 @@ class GPUMultimodeNLSE : public ODE<T>
if (is_nonlinear_)
for (int d = 0; d < 4; ++d)
assert(Sk.GetDim(d) == num_modes);

// const int max_Ap_tderiv = beta_mat_.GetNumRows()-1;
// sol_tderiv_.resize(max_Ap_tderiv, num_time_points_); //Stores the time-derivatives (e.g. d/dt, d^2/dt^2 ...) of a particular solution mode

// if (is_nonlinear_)
// {
// kerr_.resize(num_time_points_);
// if (is_self_steepening_)
// kerr_tderiv_.resize(num_time_points_);
// }
}

int GetSolutionSize() const { return num_modes_*num_time_points_; }

void EvalRHS(const GPUArray1D<T>& sol, int step, double z, GPUArray1D<T>& rhs)
{
constexpr int threads_per_block = 256;
constexpr int threads_per_block = 128;
dim3 block_dim(threads_per_block, num_modes_, 1);
dim3 grid_dim((num_time_points_ + block_dim.x-1) / block_dim.x, 1, 1);
//std::cout << "block_dim: " << block_dim.x << ", " << block_dim.y << ", " << block_dim.z << std::endl;
//std::cout << "grid_dim: " << grid_dim.x << ", " << grid_dim.y << ", " << grid_dim.z << std::endl;

#if 0
//Compute dispersion term using second-order time derivatives
constexpr int stencil_size = 5;
int shared_mem_bytes = beta_mat_.size()*sizeof(double)
+ (threads_per_block * stencil_size * num_modes_)*sizeof(T);
// std::cout << "shared mem bytes1: " << shared_mem_bytes << std::endl;

gpu::AddDispersionRHS<<<grid_dim, block_dim, shared_mem_bytes>>>(
gpu::AddDispersionStencil5<<<grid_dim, block_dim, shared_mem_bytes>>>(
num_modes_, num_time_points_, beta_mat_.GetDeviceArray(), dt_,
sol.GetDeviceArray(), rhs.GetDeviceArray());
cudaCheckLastError();
#else
//Compute dispersion term using fourth-order time derivatives
constexpr int stencil_size = 7;
int shared_mem_bytes = beta_mat_.size()*sizeof(double)
+ (threads_per_block * stencil_size * num_modes_)*sizeof(T);
// std::cout << "shared mem bytes1: " << shared_mem_bytes << std::endl;
gpu::AddDispersionStencil7<<<grid_dim, block_dim, shared_mem_bytes>>>(
num_modes_, num_time_points_, beta_mat_.GetDeviceArray(), dt_,
sol.GetDeviceArray(), rhs.GetDeviceArray());
cudaCheckLastError();
#endif

if (is_nonlinear_)
{
const complex<double> j_n_omega0_invc(0.0, n2_*omega0_/c_);

shared_mem_bytes = (threads_per_block * num_modes_)*sizeof(T);
int num_bytes_Sk = num_modes_*num_modes_*num_modes_*num_modes_*sizeof(double);
bool use_shared_mem_for_Sk = false;
if (shared_mem_bytes + num_bytes_Sk <= 48000)
{
shared_mem_bytes += num_bytes_Sk;
// use_shared_mem_for_Sk = true;
// std::cout << "shared mem bytes2: " << shared_mem_bytes << std::endl;
gpu::AddKerrNonlinearity<T, true><<<grid_dim, block_dim, shared_mem_bytes>>>(
num_modes_, num_time_points_, Sk_.GetDeviceArray(), sol.GetDeviceArray(),
j_n_omega0_invc, rhs.GetDeviceArray());
}
else
{
gpu::AddKerrNonlinearity<T, false><<<grid_dim, block_dim, shared_mem_bytes>>>(
num_modes_, num_time_points_, Sk_.GetDeviceArray(), sol.GetDeviceArray(),
j_n_omega0_invc, rhs.GetDeviceArray());
}
// std::cout << "shared mem bytes2: " << shared_mem_bytes << std::endl;

const complex<double> j_n_omega0_invc(0.0, n2_*omega0_/c_);
gpu::AddKerrRHS<<<grid_dim, block_dim, shared_mem_bytes>>>(
num_modes_, num_time_points_, use_shared_mem_for_Sk,
Sk_.GetDeviceArray(), sol.GetDeviceArray(),
j_n_omega0_invc, rhs.GetDeviceArray());
}

#if 0
Expand All @@ -289,29 +365,6 @@ class GPUMultimodeNLSE : public ODE<T>
#endif
}

// void ComputeKerrNonlinearity(const int p, const Array1D<T>& sol)
// {
// #pragma omp for
// for (int i = 0; i < num_time_points_; ++i)
// {
// T sum = 0.0;
// for (int q = 0; q < num_modes_; ++q)
// {
// const auto& Aq = sol(q*num_time_points_ + i);
// for (int r = 0; r < num_modes_; ++r)
// {
// const auto& Ar = sol(r*num_time_points_ + i);
// for (int s = 0; s < num_modes_; ++s)
// {
// const auto& As = sol(s*num_time_points_ + i);
// sum += Sk_(p,q,r,s)*Aq*Ar*conj(As);
// }
// }
// }
// kerr_(i) = sum;
// }
// }

Array1D<T> GetInitialSolutionGaussian(const Array1D<double>& Et, const Array1D<double>& t_FWHM, const Array1D<double>& t_center)
{
assert(num_modes_ == (int) Et.size());
Expand Down Expand Up @@ -343,9 +396,6 @@ class GPUMultimodeNLSE : public ODE<T>
static constexpr double c_ = 2.99792458e-4; //[m/ps]
const bool is_self_steepening_ = false;
const bool is_nonlinear_ = false;
// Array2D<T> sol_tderiv_;
Array1D<T> kerr_;
// Array1D<T> kerr_tderiv_;
};

}

0 comments on commit 7944a70

Please sign in to comment.