diff --git a/examples/fibre_2mode.cpp b/examples/fibre_2mode.cpp
index 6efa9f7..a55558a 100644
--- a/examples/fibre_2mode.cpp
+++ b/examples/fibre_2mode.cpp
@@ -61,7 +61,7 @@ int main()
   double n2 = 2.3e-20;
   double omega0 = 1.2153e3;
   bool is_self_steepening = false;
-  bool is_nonlinear = false;
+  bool is_nonlinear = true;
 
   MultimodeNLSE<Complex> ode(num_modes, num_time_points, tmin, tmax, beta_mat,
                              n2, omega0, Sk, is_self_steepening, is_nonlinear);
diff --git a/scripts/comparison_plot.py b/scripts/comparison_plot.py
index fa5b164..bf50e90 100644
--- a/scripts/comparison_plot.py
+++ b/scripts/comparison_plot.py
@@ -1,10 +1,14 @@
+import os
 import numpy as np
 import matplotlib.pyplot as plt
 from matplotlib import ticker, cm
 import h5py
 
+current_dir = os.path.dirname(__file__);
+plt.rcParams["savefig.directory"] = current_dir;
 
-f_mpa = h5py.File('GRIN_1550_linear_sample_single_gpu_mpa.mat','r')
+# f_mpa = h5py.File('GRIN_1550_linear_sample_single_gpu_mpa.mat','r')
+f_mpa = h5py.File('GRIN_1550_nofR_noSS_c0_sample_single_gpu_mpa.mat','r')
 # data = scipy.io.loadmat('GRIN_1550_linear_sample_single_gpu_mpa.mat')
 print(f_mpa.keys())
 field_data = np.array(f_mpa.get('prop_output/fields'));
@@ -12,20 +16,25 @@
 print(np.shape(complex_field_data))
 
 plt.figure(figsize=(30, 12))
+plt.rcParams.update({'font.size': 16})
 
 zstep_mpa = 15000; #10mm
 zstep_cpp = 150;
 
 for mode in range(0,2):
-  cpp_data = np.genfromtxt('../build/release/intensity_mode' + str(mode) + '_rk4_tderivorder2_nz3e5_gpu.txt', delimiter=',');
+  # cpp_data = np.genfromtxt('../build/release/intensity_mode' + str(mode) + '_rk4_tderivorder2_nz3e5_gpu.txt', delimiter=',');
+  # cpp_data = np.genfromtxt('../build/release/intensity_mode' + str(mode) + '.txt', delimiter=',');
+  cpp_data = np.genfromtxt('solutions/intensity_mode' + str(mode) + '_kerr_noSS_tderivorder2_nz6e5.txt', delimiter=',');
   print(cpp_data.shape)
   abs_u_cpp = np.transpose(cpp_data[zstep_cpp,:]);
+  intensity_db_cpp = np.maximum(20*np.log10(abs_u_cpp), -50);
   print(abs_u_cpp.shape)
 
   u_mpa = np.transpose(complex_field_data[zstep_mpa,mode,:]);
   print(u_mpa.shape)
   
   abs_u_mpa = np.abs(u_mpa);
+  intensity_db_mpa = np.maximum(20*np.log10(abs_u_mpa), -50);
   Nt = u_mpa.shape[0];
 
   tvec = np.linspace(-40.0, 40.0, Nt);
@@ -43,7 +52,10 @@
   # print('max_diff: ', np.max(np.abs(abs_u_cpp - abs_u_mpa)))
 
   plt.subplot(1, 2, mode + 1);
-  plt.plot(tvec, abs_u_mpa, 'b-', tvec2, abs_u_cpp, 'r-');
+  # plt.plot(tvec, abs_u_mpa, 'b-', tvec2, abs_u_cpp, 'r-');
+  plt.plot(tvec, intensity_db_mpa, 'b-', tvec2, intensity_db_cpp, 'r-');
   plt.legend(['MPA','Finite difference with RK4'], loc='upper left')
+  plt.xlabel('Time [ps]')
+  plt.ylabel('Intensity (dB)')
   
 plt.show()
diff --git a/scripts/plot.py b/scripts/plot.py
index 49f7001..b32d316 100644
--- a/scripts/plot.py
+++ b/scripts/plot.py
@@ -1,7 +1,11 @@
+import os
 import numpy as np
 import matplotlib.pyplot as plt
 from matplotlib import ticker, cm
 
+current_dir = os.path.dirname(__file__);
+plt.rcParams["savefig.directory"] = current_dir;
+
 plt.figure(figsize=(30, 12))
 
 for mode in range(0,2):
@@ -15,11 +19,12 @@
   tmat, zmat = np.meshgrid(tvec, zvec);
 
   # log_data = np.log10(np.clip(data, 1e-16, None));
+  log_data = np.clip(20*np.log10(data), -50, None);
 
   print(np.max(np.abs(data[:])))
 
   plt.subplot(2, 2, mode + 1);
-  cs = plt.contourf(tmat, zmat, data, 100); #, cmap ="bone")
+  cs = plt.contourf(tmat, zmat, log_data, 100); #, cmap ="bone")
   cbar = plt.colorbar(cs)
   plt.title('Mode ' + str(mode) + ' intensity')
 
diff --git a/src/ode/GPUMultimodeNLSE.hpp b/src/ode/GPUMultimodeNLSE.hpp
index a6dc787..6affbc3 100644
--- a/src/ode/GPUMultimodeNLSE.hpp
+++ b/src/ode/GPUMultimodeNLSE.hpp
@@ -18,12 +18,11 @@ namespace autodiffeq
 namespace gpu
 {
 
-
 template<typename T>
 __global__
-void AddDispersionRHS(const int num_modes, const int num_time_pts,
-                      const DeviceArray2D<double>& beta_mat, const double dt,
-                      const DeviceArray1D<T>& sol, DeviceArray1D<T>& rhs)
+void AddDispersionStencil5(const int num_modes, const int num_time_pts,
+                           const DeviceArray2D<double>& beta_mat, const double dt,
+                           const DeviceArray1D<T>& sol, DeviceArray1D<T>& rhs)
 {
   const int t = blockIdx.x*blockDim.x + threadIdx.x; //time-point index
   const int p = threadIdx.y; //mode index (blockIdx.y is always 0 since only 1 block is launched in this dimension)
@@ -104,9 +103,82 @@ void AddDispersionRHS(const int num_modes, const int num_time_pts,
 
 template<typename T>
 __global__
-void AddKerrRHS(const int num_modes, const int num_time_pts, const bool use_shared_mem_for_Sk,
-                const DeviceArray4D<double>& Sk_tensor, const DeviceArray1D<T>& sol, 
-                complex<double> j_n_omega0_invc, DeviceArray1D<T>& rhs)
+void AddDispersionStencil7(const int num_modes, const int num_time_pts,
+                           const DeviceArray2D<double>& beta_mat, const double dt,
+                           const DeviceArray1D<T>& sol, DeviceArray1D<T>& rhs)
+{
+  const int t = blockIdx.x*blockDim.x + threadIdx.x; //time-point index
+  const int p = threadIdx.y; //mode index (blockIdx.y is always 0 since only 1 block is launched in this dimension)
+  const int offset = num_modes*t + p;
+  int soldim = sol.size();
+
+  const int num_deriv_rows = beta_mat.GetNumRows();
+  const int beta_size = num_deriv_rows * num_modes;
+  constexpr int stencil_size = 7;
+
+  extern __shared__ double array[];
+  double* beta = array; // num_deriv_rows x num_modes doubles
+  T* sol_stencil_shared = (T*) &beta[beta_size]; // (num_threads * stencil_size * num_modes) variables of type T
+
+  if (threadIdx.x < num_deriv_rows)
+    beta[threadIdx.x*num_modes+p] = beta_mat(threadIdx.x,p);
+
+  int shared_offset = threadIdx.x * stencil_size * num_modes;
+  T* sol_stencil = sol_stencil_shared + shared_offset;
+  if (t < num_time_pts)
+  {
+    for (int i = 0; i < stencil_size; ++i)
+    {
+      int t_offset = t + i - 3;
+      if (t_offset < 0)
+        t_offset = -t_offset; //Mirror data at left boundary
+      if (t_offset >= num_time_pts)
+        t_offset = 2*num_time_pts - 2 - t_offset; //Mirror data at right boundary
+
+      sol_stencil[num_modes*i + p] = sol[num_modes*t_offset + p];
+    }
+  }
+
+  __syncthreads();
+
+  if (offset >= soldim)
+    return;
+
+  constexpr double inv6 = 1.0/6.0;
+  constexpr double inv24 = 1.0/24.0;
+  constexpr T imag(0.0, 1.0);
+  const double inv_12dt = 1.0/(12.0*dt);
+  const double inv_12dt2 = inv_12dt / dt;
+  const double inv_8dt3 = 1.0 / (8.0*dt*dt*dt);
+  const double inv_6dt4 = 1.0 / (6.0*dt*dt*dt*dt);
+
+  T sol_im3 = sol_stencil[              p];
+  T sol_im2 = sol_stencil[num_modes   + p];
+  T sol_im1 = sol_stencil[num_modes*2 + p];
+  T sol_i   = sol_stencil[num_modes*3 + p];
+  T sol_ip1 = sol_stencil[num_modes*4 + p];
+  T sol_ip2 = sol_stencil[num_modes*5 + p];
+  T sol_ip3 = sol_stencil[num_modes*6 + p];
+
+  //Calculate solution time-derivatives using stencil data
+  T sol_tderiv1 = (sol_im2 - 8.0*(sol_im1 - sol_ip1) - sol_ip2) * inv_12dt;
+  T sol_tderiv2 = (-sol_im2 + 16.0*(sol_im1 + sol_ip1) - 30.0*sol_i - sol_ip2) * inv_12dt2;
+  T sol_tderiv3 = (sol_im3 - 8.0*(sol_im2 - sol_ip2) + 13.0*(sol_im1 - sol_ip1) - sol_ip3) * inv_8dt3;
+  T sol_tderiv4 = (-(sol_im3 + sol_ip3) + 12.0*(sol_im2 + sol_ip2) - 39.0*(sol_im1 + sol_ip1) + 56.0*sol_i) * inv_6dt4;
+
+  T rhs_val = imag*(beta[            p] - beta[        0])*sol_i //(beta0p - beta00)
+                  -(beta[  num_modes+p] - beta[num_modes])*sol_tderiv1
+            - imag* beta[2*num_modes+p]*0.5               *sol_tderiv2
+                  + beta[3*num_modes+p]*inv6              *sol_tderiv3
+            + imag* beta[4*num_modes+p]*inv24             *sol_tderiv4;
+  rhs[offset] = rhs_val;
+}
+
+template<typename T, bool use_shared_mem_for_Sk = true>
+__global__
+void AddKerrNonlinearity(const int num_modes, const int num_time_pts,
+                         const DeviceArray4D<double>& Sk_tensor, const DeviceArray1D<T>& sol, 
+                         complex<double> j_n_omega0_invc, DeviceArray1D<T>& rhs)
 {
   const int t = blockIdx.x*blockDim.x + threadIdx.x; //time-point index
   const int p = threadIdx.y; //mode index (blockIdx.y is always 0 since only 1 block is launched in this dimension)
@@ -146,19 +218,18 @@ void AddKerrRHS(const int num_modes, const int num_time_pts, const bool use_shar
   if (use_shared_mem_for_Sk)
   {
     const int num_modes_sq = num_modes * num_modes;
+    const int num_modes_cu = num_modes * num_modes * num_modes;
     for (int q = 0; q < num_modes; ++q)
     {
       T Aq = sol_modes[q];
-      int Sk_offset = num_modes_sq * (num_modes*p + q); //linear index for Sk(p,q,r,s)
       for (int r = 0; r < num_modes; ++r)
       {
         T Ar = sol_modes[r];
-        Sk_offset += (num_modes*r);
         for (int s = 0; s < num_modes; ++s)
         {
           T As = sol_modes[s];
-          Sk_offset += s;
-          kerr += Sk[Sk_offset]*Aq*Ar*conj(As);
+          int Sk_ind = num_modes_cu*p + num_modes_sq*q + num_modes*r + s; //linear index for Sk(p,q,r,s)
+          kerr += Sk[Sk_ind]*Aq*Ar*conj(As);
         }
       }
     }
@@ -218,55 +289,60 @@ class GPUMultimodeNLSE : public ODE<T>
     if (is_nonlinear_)
       for (int d = 0; d < 4; ++d)
         assert(Sk.GetDim(d) == num_modes);
-
-    // const int max_Ap_tderiv = beta_mat_.GetNumRows()-1;
-    // sol_tderiv_.resize(max_Ap_tderiv, num_time_points_); //Stores the time-derivatives (e.g. d/dt, d^2/dt^2 ...) of a particular solution mode
-
-    // if (is_nonlinear_)
-    // {
-    //   kerr_.resize(num_time_points_);
-    //   if (is_self_steepening_)
-    //     kerr_tderiv_.resize(num_time_points_);
-    // }
   }
 
   int GetSolutionSize() const { return num_modes_*num_time_points_; }
 
   void EvalRHS(const GPUArray1D<T>& sol, int step, double z, GPUArray1D<T>& rhs)
   {
-    constexpr int threads_per_block = 256;
+    constexpr int threads_per_block = 128;
     dim3 block_dim(threads_per_block, num_modes_, 1);
     dim3 grid_dim((num_time_points_ + block_dim.x-1) / block_dim.x, 1, 1);
     //std::cout << "block_dim: " << block_dim.x << ", " << block_dim.y << ", " << block_dim.z << std::endl;
     //std::cout << "grid_dim: " << grid_dim.x << ", " << grid_dim.y << ", " << grid_dim.z << std::endl;
 
+#if 0
+    //Compute dispersion term using second-order time derivatives
     constexpr int stencil_size = 5;
     int shared_mem_bytes = beta_mat_.size()*sizeof(double)
                          + (threads_per_block * stencil_size * num_modes_)*sizeof(T);
     // std::cout << "shared mem bytes1: " << shared_mem_bytes << std::endl;
-
-    gpu::AddDispersionRHS<<<grid_dim, block_dim, shared_mem_bytes>>>(
+    gpu::AddDispersionStencil5<<<grid_dim, block_dim, shared_mem_bytes>>>(
+      num_modes_, num_time_points_, beta_mat_.GetDeviceArray(), dt_, 
+      sol.GetDeviceArray(), rhs.GetDeviceArray());
+    cudaCheckLastError();
+#else
+    //Compute dispersion term using fourth-order time derivatives
+    constexpr int stencil_size = 7;
+    int shared_mem_bytes = beta_mat_.size()*sizeof(double)
+                         + (threads_per_block * stencil_size * num_modes_)*sizeof(T);
+    // std::cout << "shared mem bytes1: " << shared_mem_bytes << std::endl;
+    gpu::AddDispersionStencil7<<<grid_dim, block_dim, shared_mem_bytes>>>(
       num_modes_, num_time_points_, beta_mat_.GetDeviceArray(), dt_, 
       sol.GetDeviceArray(), rhs.GetDeviceArray());
     cudaCheckLastError();
+#endif
 
     if (is_nonlinear_)
     {
+      const complex<double> j_n_omega0_invc(0.0, n2_*omega0_/c_);
+
       shared_mem_bytes = (threads_per_block * num_modes_)*sizeof(T);
       int num_bytes_Sk = num_modes_*num_modes_*num_modes_*num_modes_*sizeof(double);
-      bool use_shared_mem_for_Sk = false;
       if (shared_mem_bytes + num_bytes_Sk <= 48000)
       {
         shared_mem_bytes += num_bytes_Sk;
-        // use_shared_mem_for_Sk = true;
+        // std::cout << "shared mem bytes2: " << shared_mem_bytes << std::endl;
+        gpu::AddKerrNonlinearity<T, true><<<grid_dim, block_dim, shared_mem_bytes>>>(
+          num_modes_, num_time_points_, Sk_.GetDeviceArray(), sol.GetDeviceArray(), 
+          j_n_omega0_invc, rhs.GetDeviceArray());
+      }
+      else
+      {
+        gpu::AddKerrNonlinearity<T, false><<<grid_dim, block_dim, shared_mem_bytes>>>(
+          num_modes_, num_time_points_, Sk_.GetDeviceArray(), sol.GetDeviceArray(), 
+          j_n_omega0_invc, rhs.GetDeviceArray());
       }
-      // std::cout << "shared mem bytes2: " << shared_mem_bytes << std::endl;
-
-      const complex<double> j_n_omega0_invc(0.0, n2_*omega0_/c_);
-      gpu::AddKerrRHS<<<grid_dim, block_dim, shared_mem_bytes>>>(
-        num_modes_, num_time_points_, use_shared_mem_for_Sk,
-        Sk_.GetDeviceArray(), sol.GetDeviceArray(), 
-        j_n_omega0_invc, rhs.GetDeviceArray());
     }
 
 #if 0
@@ -289,29 +365,6 @@ class GPUMultimodeNLSE : public ODE<T>
 #endif
   }
 
-  // void ComputeKerrNonlinearity(const int p, const Array1D<T>& sol)
-  // {
-  //   #pragma omp for
-  //   for (int i = 0; i < num_time_points_; ++i) 
-  //   {
-  //     T sum = 0.0;
-  //     for (int q = 0; q < num_modes_; ++q)
-  //     {
-  //       const auto& Aq = sol(q*num_time_points_ + i);
-  //       for (int r = 0; r < num_modes_; ++r)
-  //       {
-  //         const auto& Ar = sol(r*num_time_points_ + i);
-  //         for (int s = 0; s < num_modes_; ++s)
-  //         {
-  //           const auto& As = sol(s*num_time_points_ + i);
-  //           sum += Sk_(p,q,r,s)*Aq*Ar*conj(As);
-  //         }
-  //       }
-  //     }
-  //     kerr_(i) = sum;
-  //   }
-  // }
-
   Array1D<T> GetInitialSolutionGaussian(const Array1D<double>& Et, const Array1D<double>& t_FWHM, const Array1D<double>& t_center) 
   {
     assert(num_modes_ == (int) Et.size());
@@ -343,9 +396,6 @@ class GPUMultimodeNLSE : public ODE<T>
   static constexpr double c_ = 2.99792458e-4; //[m/ps]
   const bool is_self_steepening_ = false;
   const bool is_nonlinear_ = false;
-  // Array2D<T> sol_tderiv_;
-  Array1D<T> kerr_;
-  // Array1D<T> kerr_tderiv_;
 };
 
 }
\ No newline at end of file