diff --git a/Examples/BasicSamples/EarlyStopExample.cs b/Examples/BasicSamples/EarlyStopExample.cs index 121ad5d..6eaef87 100644 --- a/Examples/BasicSamples/EarlyStopExample.cs +++ b/Examples/BasicSamples/EarlyStopExample.cs @@ -3,6 +3,7 @@ using Keras.Layers; using Keras.Models; using Keras.Optimizers; +using Keras.Optimizers.Legacy; using Numpy; using System; using System.Collections.Generic; diff --git a/Examples/BasicSamples/ImplementCallback.cs b/Examples/BasicSamples/ImplementCallback.cs index 32bbd80..17fd97b 100644 --- a/Examples/BasicSamples/ImplementCallback.cs +++ b/Examples/BasicSamples/ImplementCallback.cs @@ -3,6 +3,7 @@ using Keras.Layers; using Keras.Models; using Keras.Optimizers; +using Keras.Optimizers.Legacy; using Numpy; using System; using System.Collections.Generic; diff --git a/Examples/BasicSamples/MNIST_CNN.cs b/Examples/BasicSamples/MNIST_CNN.cs index b7c6148..e10f7bf 100644 --- a/Examples/BasicSamples/MNIST_CNN.cs +++ b/Examples/BasicSamples/MNIST_CNN.cs @@ -9,6 +9,7 @@ using Keras.Layers; using Keras.Utils; using Keras.Optimizers; +using Keras.Optimizers.Legacy; namespace BasicSamples { diff --git a/Examples/ImageExamples/Cifar10_CNN.cs b/Examples/ImageExamples/Cifar10_CNN.cs index 8df6808..f3a3cd9 100644 --- a/Examples/ImageExamples/Cifar10_CNN.cs +++ b/Examples/ImageExamples/Cifar10_CNN.cs @@ -11,6 +11,7 @@ using Keras.Optimizers; using Keras.PreProcessing.Image; using System.IO; +using Keras.Optimizers.Legacy; namespace ImageExamples { diff --git a/Examples/ImageExamples/MNIST_CNN.cs b/Examples/ImageExamples/MNIST_CNN.cs index 32fbb78..a8c1c0b 100644 --- a/Examples/ImageExamples/MNIST_CNN.cs +++ b/Examples/ImageExamples/MNIST_CNN.cs @@ -10,6 +10,7 @@ using Keras.Utils; using Keras.Optimizers; using System.IO; +using Keras.Optimizers.Legacy; namespace ImageExamples { diff --git a/Keras.NET.sln b/Keras.NET.sln index c55c9bd..428594e 100644 --- a/Keras.NET.sln +++ b/Keras.NET.sln @@ -19,11 +19,9 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Keras", "Keras\Keras.csproj EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ReleaseBot", "ReleaseBot\ReleaseBot.csproj", "{2BAEA60C-88A2-45DC-8044-2C9571E1B8CF}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "KerasExampleWinApp", "Examples\KerasExampleWinApp\KerasExampleWinApp.csproj", "{0C0B0830-4871-4979-8675-93F980F5EBE2}" -EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "MusicGeneration", "Examples\MusicGeneration\MusicGeneration.csproj", "{108C3326-58D2-4C26-9D78-5F045D620A26}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Tensorflow", "Tensorflow\Tensorflow.csproj", "{27230C96-FCB4-406C-8AAD-450020F9074D}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tensorflow", "Tensorflow\Tensorflow.csproj", "{27230C96-FCB4-406C-8AAD-450020F9074D}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -255,38 +253,6 @@ Global {2BAEA60C-88A2-45DC-8044-2C9571E1B8CF}.Release|Any CPU.Build.0 = Release|Any CPU {2BAEA60C-88A2-45DC-8044-2C9571E1B8CF}.Release|x64.ActiveCfg = Release|Any CPU {2BAEA60C-88A2-45DC-8044-2C9571E1B8CF}.Release|x64.Build.0 = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Debug|Any CPU.Build.0 = Debug|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Debug|x64.ActiveCfg = Debug|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Debug|x64.Build.0 = Debug|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py2.7_Mono|Any 
CPU.ActiveCfg = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py2.7_Mono|Any CPU.Build.0 = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py2.7_Mono|x64.ActiveCfg = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py2.7_Mono|x64.Build.0 = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py2.7_OSX|Any CPU.ActiveCfg = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py2.7_OSX|Any CPU.Build.0 = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py2.7_OSX|x64.ActiveCfg = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py2.7_OSX|x64.Build.0 = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py2.7_WIN|Any CPU.ActiveCfg = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py2.7_WIN|Any CPU.Build.0 = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py2.7_WIN|x64.ActiveCfg = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py2.7_WIN|x64.Build.0 = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py3.6_Mono|Any CPU.ActiveCfg = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py3.6_Mono|Any CPU.Build.0 = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py3.6_Mono|x64.ActiveCfg = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py3.6_Mono|x64.Build.0 = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py3.6_OSX|Any CPU.ActiveCfg = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py3.6_OSX|Any CPU.Build.0 = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py3.6_OSX|x64.ActiveCfg = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py3.6_OSX|x64.Build.0 = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py3.6_WIN|Any CPU.ActiveCfg = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py3.6_WIN|Any CPU.Build.0 = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py3.6_WIN|x64.ActiveCfg = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Py3.6_WIN|x64.Build.0 = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Release|Any CPU.ActiveCfg = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Release|Any CPU.Build.0 = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Release|x64.ActiveCfg = Release|Any CPU - {0C0B0830-4871-4979-8675-93F980F5EBE2}.Release|x64.Build.0 = Release|Any CPU {108C3326-58D2-4C26-9D78-5F045D620A26}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {108C3326-58D2-4C26-9D78-5F045D620A26}.Debug|Any CPU.Build.0 = Debug|Any CPU {108C3326-58D2-4C26-9D78-5F045D620A26}.Debug|x64.ActiveCfg = Debug|Any CPU @@ -359,7 +325,6 @@ Global {A0786763-88EC-41DB-9E4F-6DDACA1A1162} = {96B07D94-46E0-4A1C-9484-E842B47FFE04} {EC18ED5C-A9EC-414F-948C-DD1BC052D312} = {96B07D94-46E0-4A1C-9484-E842B47FFE04} {7F906C3D-4C18-4185-8235-4908FC082398} = {96B07D94-46E0-4A1C-9484-E842B47FFE04} - {0C0B0830-4871-4979-8675-93F980F5EBE2} = {96B07D94-46E0-4A1C-9484-E842B47FFE04} {108C3326-58D2-4C26-9D78-5F045D620A26} = {96B07D94-46E0-4A1C-9484-E842B47FFE04} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution diff --git a/Keras.UnitTest/Keras.Layers.Core.cs b/Keras.UnitTest/Keras.Layers.Core.cs index 766b6ec..e856521 100644 --- a/Keras.UnitTest/Keras.Layers.Core.cs +++ b/Keras.UnitTest/Keras.Layers.Core.cs @@ -46,7 +46,7 @@ public void Dense_CustomKRegularizerAndKInitParams() Assert.AreEqual(2000, modelAsJson.config.layers[i].config.kernel_regularizer.config.l2.Value); // Compile and train - model.Compile(optimizer: new Adam(lr: 0.001F), loss: "binary_crossentropy", metrics: new string[] { "accuracy" }); + 
model.Compile(optimizer: new Adam(learning_rate: 0.001F), loss: "binary_crossentropy", metrics: new string[] { "accuracy" }); model.Fit(x, y, batch_size: x.shape[0], epochs: 100, verbose: 0); Assert.AreEqual(2, model.GetWeights().Count); } diff --git a/Keras/Activations.cs b/Keras/Activations.cs index 126fc34..6e70455 100644 --- a/Keras/Activations.cs +++ b/Keras/Activations.cs @@ -157,5 +157,43 @@ public static NDarray Linear(NDarray x) parameters["x"] = x; return new NDarray(InvokeStaticMethod(caller, "linear", parameters)); } + + /// + /// Gaussian error linear unit (GELU) computes x * P(X <= x), where P(X) ~ N(0, 1). The (GELU) nonlinearity weights inputs by their value, rather than gates inputs by their sign as in ReLU. + /// + /// Input tensor. + /// A bool, whether to enable approximation. + /// + public static NDarray Gelu(NDarray x, bool approximate = false) + { + Dictionary parameters = new Dictionary(); + parameters["x"] = x; + parameters["approximate"] = approximate; + return new NDarray(InvokeStaticMethod(caller, "gelu", parameters)); + } + + /// + /// Mish activation function. + /// + /// Input tensor. + /// Output tensor + public static NDarray Mish(NDarray x) + { + Dictionary parameters = new Dictionary(); + parameters["x"] = x; + return new NDarray(InvokeStaticMethod(caller, "mish", parameters)); + } + + /// + /// Swish activation function, swish(x) = x * sigmoid(x). + /// + /// Input tensor. + /// Output tensor + public static NDarray Swish(NDarray x) + { + Dictionary parameters = new Dictionary(); + parameters["x"] = x; + return new NDarray(InvokeStaticMethod(caller, "swish", parameters)); + } } } diff --git a/Keras/Callbacks.cs b/Keras/Callbacks.cs index 07ab8bf..3bef351 100644 --- a/Keras/Callbacks.cs +++ b/Keras/Callbacks.cs @@ -6,6 +6,8 @@ using Python.Runtime; using Numpy; using System.IO; +using static System.Net.WebRequestMethods; +using Keras.Models; namespace Keras.Callbacks { @@ -36,7 +38,7 @@ public static Callback Custom(string name, string fileOrcode, bool isFile = true string code = ""; if(isFile) { - code = File.ReadAllText(fileOrcode); + code = System.IO.File.ReadAllText(fileOrcode); } else { @@ -183,9 +185,9 @@ public class ModelCheckpoint : Callback /// if save_best_only=True, the latest best model according to the quantity monitored will not be overwritten. /// if True, then only the model's weights will be saved (model.save_weights(filepath)), else the full model is saved (model.save(filepath)). /// one of {auto, min, max}. If save_best_only=True, the decision to overwrite the current save file is made based on either the maximization or the minimization of the monitored quantity. For val_acc, this should be max, for val_loss this should be min, etc. In auto mode, the direction is automatically inferred from the name of the monitored quantity. - /// Interval (number of epochs) between checkpoints. - public ModelCheckpoint(string filepath, string monitor = "val_loss", int verbose = 0, bool save_best_only = true - , bool save_weights_only = false, string mode = "auto", int period = 1) + /// 'epoch' or integer. When using 'epoch', the callback saves the model after each epoch. When using integer, the callback saves the model at end of this many batches. 
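+ // Note: replaces the 'period' argument (epochs between checkpoints) removed in TF2 Keras; save_freq = "epoch" keeps the old per-epoch behaviour.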
+ public ModelCheckpoint(string filepath, string monitor = "val_loss", int verbose = 0, bool save_best_only = false + , bool save_weights_only = false, string mode = "auto", string save_freq= "epoch") { Parameters["filepath"] = filepath; Parameters["monitor"] = monitor; @@ -193,8 +195,8 @@ public ModelCheckpoint(string filepath, string monitor = "val_loss", int verbose Parameters["save_best_only"] = save_best_only; Parameters["save_weights_only"] = save_weights_only; Parameters["mode"] = mode; - Parameters["period"] = period; - + Parameters["save_freq"] = save_freq; + //ToDo: extend options parameter PyInstance = Instance.keras.callbacks.ModelCheckpoint; Init(); } @@ -216,7 +218,9 @@ public class EarlyStopping : Callback /// one of {auto, min, max}. In min mode, training will stop when the quantity monitored has stopped decreasing; in max mode it will stop when the quantity monitored has stopped increasing; in auto mode, the direction is automatically inferred from the name of the monitored quantity. /// Baseline value for the monitored quantity to reach. Training will stop if the model doesn't show improvement over the baseline. /// whether to restore model weights from the epoch with the best value of the monitored quantity. If False, the model weights obtained at the last step of training are used. - public EarlyStopping(string monitor = "val_loss", float min_delta = 0, int patience = 0, int verbose = 0, string mode = "auto", float? baseline = null, bool restore_best_weights = false) + /// Number of epochs to wait before starting to monitor improvement. This allows for a warm-up period in which no improvement is expected and thus training will not be stopped. + public EarlyStopping(string monitor = "val_loss", float min_delta = 0, int patience = 0, int verbose = 0, string mode = "auto", + float? baseline = null, bool restore_best_weights = false, int start_from_epoch = 0) { Parameters["monitor"] = monitor; Parameters["min_delta"] = min_delta; @@ -225,6 +229,7 @@ public EarlyStopping(string monitor = "val_loss", float min_delta = 0, int patie Parameters["mode"] = mode; Parameters["baseline"] = baseline; Parameters["restore_best_weights"] = restore_best_weights; + Parameters["start_from_epoch"] = start_from_epoch; PyInstance = Instance.keras.callbacks.EarlyStopping; Init(); @@ -294,28 +299,24 @@ public class TensorBoard : Callback /// /// the path of the directory where to save the log files to be parsed by TensorBoard. /// frequency (in epochs) at which to compute activation and weight histograms for the layers of the model. If set to 0, histograms won't be computed. Validation data (or split) must be specified for histogram visualizations. - /// size of batch of inputs to feed to the network for histograms computation. /// whether to visualize the graph in TensorBoard. The log file can become quite large when write_graph is set to True. - /// whether to visualize gradient histograms in TensorBoard. histogram_freq must be greater than 0. /// whether to write model weights to visualize as image in TensorBoard. + /// whether to log the training steps per second into TensorBoard. This supports both epoch and batch frequency logging. + /// 'batch' or 'epoch' or integer. When using 'epoch', writes the losses and metrics to TensorBoard after every epoch. /// frequency (in epochs) at which selected embedding layers will be saved. If set to 0, embeddings won't be computed. Data to be visualized in TensorBoard's Embedding tab must be passed as embeddings_data. 
- /// a list of names of layers to keep eye on. If None or empty list all the embedding layer will be watched.
/// a dictionary which maps layer name to a file name in which metadata for this embedding layer is saved. See the details about metadata files format. In case if the same metadata file is used for all embedding layers, string can be passed.
- /// data to be embedded at layers specified in embeddings_layer_names. Numpy array (if the model has a single input) or list of Numpy arrays (if the model has multiple inputs). Learn more about embeddings.
- public TensorBoard(string log_dir= "./logs", int histogram_freq= 0, int batch_size= 32, bool write_graph= true, bool write_grads= false,
- bool write_images= false, int embeddings_freq= 0, string[] embeddings_layer_names= null, Dictionary embeddings_metadata= null,
- NDarray embeddings_data= null, string update_freq= "epoch")
+ public TensorBoard(string log_dir= "./logs", int histogram_freq= 0, bool write_graph= true, bool write_images= false, bool write_steps_per_second = false,
+ string update_freq = "epoch", int embeddings_freq= 0, Dictionary embeddings_metadata= null)
{
Parameters["log_dir"] = log_dir;
Parameters["histogram_freq"] = histogram_freq;
- Parameters["batch_size"] = batch_size;
Parameters["write_graph"] = write_graph;
+ Parameters["write_images"] = write_images;
+ Parameters["write_steps_per_second"] = write_steps_per_second;
+ Parameters["update_freq"] = update_freq;
Parameters["embeddings_freq"] = embeddings_freq;
- Parameters["embeddings_layer_names"] = embeddings_layer_names;
Parameters["embeddings_metadata"] = embeddings_metadata;
- Parameters["embeddings_data"] = embeddings_data?.PyObject;
- Parameters["update_freq"] = update_freq;
-
+
PyInstance = Instance.keras.callbacks.TensorBoard;
Init();
}
@@ -380,4 +381,53 @@ public CSVLogger(string filename, string separator = ",", bool append = false)
Init();
}
}
+
+ ///
+ /// BackupAndRestore callback is intended to recover training from an interruption that has happened in the middle of a Model.fit execution,
+ /// by backing up the training states in a temporary checkpoint file (with the help of a tf.train.CheckpointManager), at the end of each epoch.
+ ///
+ public class BackupAndRestore : Callback
+ {
+ ///
+ /// Initializes a new instance of the class.
+ ///
+ /// String, path to store the checkpoint. e.g. backup_dir = os.path.join(working_dir, 'backup')
+ /// 'epoch', integer, or False. When set to 'epoch' the callback saves the checkpoint at the end of each epoch; when set to an integer, it saves the checkpoint every that many batches.
+ /// Boolean, defaults to True. This BackupAndRestore callback works by saving a checkpoint to back up the training state; if True, the checkpoint is deleted after training finishes, otherwise it is kept for future runs.
+ /// A boolean value instructing whether to turn on the automatic checkpoint saving for preemption/maintenance events.
+ public BackupAndRestore(string backup_dir, string save_freq = "epoch", bool delete_checkpoint = true, bool save_before_preemption = false)
+ {
+ Parameters["backup_dir"] = backup_dir;
+ Parameters["save_freq"] = save_freq;
+ Parameters["delete_checkpoint"] = delete_checkpoint;
+ Parameters["save_before_preemption"] = save_before_preemption;
+
+ PyInstance = Instance.keras.callbacks.BackupAndRestore;
+ Init();
+ }
+ }
+
+ ///
+ /// Container abstracting a list of callbacks.
+ ///
+ public class CallbackList : Callback
+ {
+ ///
+ /// Initializes a new instance of the class.
+ ///
+ /// List of Callback instances.
+ /// Whether a History callback should be added, if one does not already exist in the callbacks list.
+ /// Whether a ProgbarLogger callback should be added, if one does not already exist in the callbacks list.
+ /// The Model these callbacks are used with.
+ public CallbackList(List callbacks, bool add_history = false, bool add_progbar = false, BaseModel model = null)
+ {
+ Parameters["callbacks"] = callbacks;
+ Parameters["add_history"] = add_history;
+ Parameters["add_progbar"] = add_progbar;
+ Parameters["model"] = model?.ToPython();
+
+ PyInstance = Instance.keras.callbacks.CallbackList;
+ Init();
+ }
+ }
}
diff --git a/Keras/Constraints.cs b/Keras/Constraints.cs
index b122a1e..5f2af30 100644
--- a/Keras/Constraints.cs
+++ b/Keras/Constraints.cs
@@ -49,6 +49,7 @@ public class UnitNorm : Base
/// integer, axis along which to calculate weight norms. For instance, in a Dense layer the weight matrix has shape (input_dim, output_dim), set axis to 0 to constrain each weight vector of length (input_dim,). In a Conv2D layer with data_format="channels_last", the weight tensor has shape (rows, cols, input_depth, output_depth), set axis to [0, 1, 2] to constrain the weights of each filter tensor of size (rows, cols, input_depth).
public UnitNorm(int axis = 0)
{
+ Parameters["axis"] = axis;
- PyInstance = keras.constraints.NonNeg;
+ PyInstance = keras.constraints.UnitNorm;
Init();
}
@@ -70,8 +71,21 @@ public class MinMaxNorm : Base
/// integer, axis along which to calculate weight norms. For instance, in a Dense layer the weight matrix has shape (input_dim, output_dim), set axis to 0 to constrain each weight vector of length (input_dim,). In a Conv2D layer with data_format="channels_last", the weight tensor has shape (rows, cols, input_depth, output_depth), set axis to [0, 1, 2] to constrain the weights of each filter tensor of size (rows, cols, input_depth).
public MinMaxNorm(float min_value= 0.0f, float max_value= 1.0f, float rate= 1.0f, int axis = 0)
{
+ Parameters["min_value"] = min_value;
+ Parameters["max_value"] = max_value;
+ Parameters["rate"] = rate;
+ Parameters["axis"] = axis;
- PyInstance = keras.constraints.NonNeg;
+ PyInstance = keras.constraints.MinMaxNorm;
Init();
}
}
+
+ ///
+ /// Constrains Conv2D kernel weights to be the same for each radius.
+ ///
+ public class RadialConstraint : Base
+ {
+ public RadialConstraint()
+ {
+ PyInstance = keras.constraints.RadialConstraint;
+ Init();
+ }
+ }
}
diff --git a/Keras/ExportArchive.cs b/Keras/ExportArchive.cs
new file mode 100644
index 0000000..c582223
--- /dev/null
+++ b/Keras/ExportArchive.cs
@@ -0,0 +1,14 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace Keras
+{
+ public class ExportArchive : Base
+ {
+ public ExportArchive()
+ {
+ //ToDo: Build based on the tensor specs
+ }
+ }
+}
diff --git a/Keras/Keras.csproj b/Keras/Keras.csproj
index a5c2695..aaac2b8 100644
--- a/Keras/Keras.csproj
+++ b/Keras/Keras.csproj
@@ -43,7 +43,7 @@
-
+
diff --git a/Keras/Models/BaseModel.cs b/Keras/Models/BaseModel.cs
index 7d5cdd2..2c7bdbe 100644
--- a/Keras/Models/BaseModel.cs
+++ b/Keras/Models/BaseModel.cs
@@ -1,4 +1,5 @@
using Keras.Callbacks;
+using Keras.Layers;
using Keras.Utils;
using Numpy;
using Python.Runtime;
@@ -7,6 +8,8 @@
using System.IO;
using System.Linq;
using System.Text;
+using Tensorflow.Train;
+using static System.Net.WebRequestMethods;
namespace Keras.Models
{
@@ -16,23 +19,25 @@ public class BaseModel : Base
///Configures the model for training.
///
/// String (name of optimizer) or optimizer instance. See optimizers.
- /// String (name of objective function) or objective function. See losses. If the model has multiple outputs, you can use a different loss on each output by passing a dictionary or a list of losses. The loss value that will be minimized by the model will then be the sum of all individual losses.
+ /// String (name of objective function) or objective function. See losses. If the model has multiple outputs, you can use a different loss on each output by passing a dictionary or a list of losses. The loss value that will be minimized by the model will then be the sum of all individual losses.
/// List of metrics to be evaluated by the model during training and testing. Typically you will use metrics=['accuracy']. To specify different metrics for different outputs of a multi-output model, you could also pass a dictionary, such as metrics={'output_a': 'accuracy'}.
/// Optional list or dictionary specifying scalar coefficients (Python floats) to weight the loss contributions of different model outputs. The loss value that will be minimized by the model will then be the weighted sum of all individual losses, weighted by the loss_weights coefficients. If a list, it is expected to have a 1:1 mapping to the model's outputs. If a tensor, it is expected to map output names (strings) to scalar coefficients.
- /// If you need to do timestep-wise sample weighting (2D weights), set this to "temporal". None defaults to sample-wise weights (1D). If the model has multiple outputs, you can use a different sample_weight_mode on each output by passing a dictionary or a list of modes.
/// List of metrics to be evaluated and weighted by sample_weight or class_weight during training and testing.
- /// By default, Keras will create placeholders for the model's target, which will be fed with the target data during training. If instead you would like to use your own target tensors (in turn, Keras will not expect external Numpy data for these targets at training time), you can specify them via the target_tensors argument. It can be a single tensor (for a single-output model), a list of tensors, or a dict mapping output names to target tensors.
+ /// Bool. Defaults to False. If True, this Model's logic will not be wrapped in a tf.function. Recommended to leave this as False unless your Model cannot be run inside a tf.function. run_eagerly=True is not supported when using tf.distribute.experimental.ParameterServerStrategy.
+ /// Int. Defaults to 1. The number of batches to run during each tf.function call. Running multiple batches inside a single tf.function call can greatly improve performance on TPUs or small models with a large Python overhead. At most, one full epoch will be run each execution.
+ /// If True, compile the model training step with XLA. XLA is an optimizing compiler for machine learning. jit_compile is not enabled by default. Note that jit_compile=True may not necessarily work for all models.
public void Compile(StringOrInstance optimizer, string loss, string[] metrics = null, float[] loss_weights = null,
- string sample_weight_mode = null, string[] weighted_metrics = null, NDarray[] target_tensors = null)
+ string[] weighted_metrics = null, bool run_eagerly = false, int steps_per_execution = 1, bool jit_compile = false)
{
var args = new Dictionary();
args["optimizer"] = optimizer;
args["loss"] = loss;
args["metrics"] = metrics;
args["loss_weights"] = loss_weights;
- args["sample_weight_mode"] = sample_weight_mode;
args["weighted_metrics"] = weighted_metrics;
- args["target_tensors"] = target_tensors;
+ args["run_eagerly"] = run_eagerly;
+ args["steps_per_execution"] = steps_per_execution;
+ args["jit_compile"] = jit_compile;
InvokeMethod("compile", args);
}
@@ -44,24 +49,64 @@ public void Compile(StringOrInstance optimizer, string[] loss, string[] metrics =
/// List of Strings (name of objective function) or objective function. See losses. If the model has multiple outputs, you can use a different loss on each output by passing a dictionary or a list of losses. The loss value that will be minimized by the model will then be the sum of all individual losses.
/// List of metrics to be evaluated by the model during training and testing. Typically you will use metrics=['accuracy']. To specify different metrics for different outputs of a multi-output model, you could also pass a dictionary, such as metrics={'output_a': 'accuracy'}.
/// Optional list or dictionary specifying scalar coefficients (Python floats) to weight the loss contributions of different model outputs. The loss value that will be minimized by the model will then be the weighted sum of all individual losses, weighted by the loss_weights coefficients. If a list, it is expected to have a 1:1 mapping to the model's outputs. If a tensor, it is expected to map output names (strings) to scalar coefficients.
- /// If you need to do timestep-wise sample weighting (2D weights), set this to "temporal". None defaults to sample-wise weights (1D). If the model has multiple outputs, you can use a different sample_weight_mode on each output by passing a dictionary or a list of modes.
/// List of metrics to be evaluated and weighted by sample_weight or class_weight during training and testing.
- /// By default, Keras will create placeholders for the model's target, which will be fed with the target data during training. If instead you would like to use your own target tensors (in turn, Keras will not expect external Numpy data for these targets at training time), you can specify them via the target_tensors argument. It can be a single tensor (for a single-output model), a list of tensors, or a dict mapping output names to target tensors.
+ /// Bool. Defaults to False. If True, this Model's logic will not be wrapped in a tf.function. Recommended to leave this as False unless your Model cannot be run inside a tf.function. run_eagerly=True is not supported when using tf.distribute.experimental.ParameterServerStrategy.
+ /// Int. Defaults to 1. The number of batches to run during each tf.function call. Running multiple batches inside a single tf.function call can greatly improve performance on TPUs or small models with a large Python overhead. At most, one full epoch will be run each execution.
+ /// If True, compile the model training step with XLA. XLA is an optimizing compiler for machine learning. jit_compile is not enabled by default. Note that jit_compile=True may not necessarily work for all models.
public void Compile(StringOrInstance optimizer, string[] loss, string[] metrics = null, float[] loss_weights = null, - string sample_weight_mode = null, string[] weighted_metrics = null, NDarray[] target_tensors = null) + string[] weighted_metrics = null, bool run_eagerly = false, int steps_per_execution = 1, bool jit_compile = false) { var args = new Dictionary(); args["optimizer"] = optimizer; args["loss"] = loss; args["metrics"] = metrics; args["loss_weights"] = loss_weights; - args["sample_weight_mode"] = sample_weight_mode; args["weighted_metrics"] = weighted_metrics; - args["target_tensors"] = target_tensors; + args["run_eagerly"] = run_eagerly; + args["steps_per_execution"] = steps_per_execution; + args["jit_compile"] = jit_compile; InvokeMethod("compile", args); } + /// + /// Compute the total loss, validate it, and return it. + /// Subclasses can optionally override this method to provide custom loss computation logic. + /// + /// Input data. + /// Target data. + /// Predictions returned by the model (output of model(x)) + /// Sample weights for weighting the loss function. + public void ComputeLoss(NDarray x, NDarray y, NDarray y_pred = null, NDarray sample_weight = null) + { + var args = new Dictionary(); + args["x"] = x; + args["y"] = y; + args["y_pred"] = y_pred; + args["sample_weight"] = sample_weight; + + InvokeMethod("compute_loss", args); + } + + /// + /// Update metric states and collect all metrics to be returned. + /// Subclasses can optionally override this method to provide custom metric updating and collection logic. + /// + /// Input data. + /// Target data. + /// Predictions returned by the model (output of model(x)) + /// Sample weights for weighting the loss function. + public void ComputeMetrics(NDarray x, NDarray y, NDarray y_pred = null, NDarray sample_weight = null) + { + var args = new Dictionary(); + args["x"] = x; + args["y"] = y; + args["y_pred"] = y_pred; + args["sample_weight"] = sample_weight; + + InvokeMethod("compute_metrics", args); + } + /// /// Trains the model for a given number of epochs (iterations on a dataset). /// @@ -79,10 +124,16 @@ public void Compile(StringOrInstance optimizer, string[] loss, string[] metrics /// Integer. Epoch at which to start training (useful for resuming a previous training run). /// Integer or None. Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. When training with input tensors such as TensorFlow data tensors, the default None is equal to the number of samples in your dataset divided by the batch size, or 1 if that cannot be determined. /// Only relevant if steps_per_epoch is specified. Total number of steps (batches of samples) to validate before stopping. + /// Integer or None. Number of samples per validation batch. If unspecified, will default to batch_size. Do not specify the validation_batch_size if your data is in the form of datasets, generators, or keras.utils.Sequence instances + /// Only relevant if validation data is provided. Integer or collections.abc.Container instance (e.g. list, tuple, etc.). If an integer, specifies how many training epochs to run before a new validation run is performed, e.g. validation_freq=2 runs validation every 2 epochs. If a Container, specifies the epochs on which to run validation, e.g. validation_freq=[1, 2, 10] runs validation at the end of the 1st, 2nd, and 10th epochs. + /// Integer. Used for generator or keras.utils.Sequence input only. Maximum size for the generator queue. 
If unspecified, max_queue_size will default to 10. + /// Integer. Used for generator or keras.utils.Sequence input only. Maximum number of processes to spin up when using process-based threading. If unspecified, workers will default to 1. + /// Boolean. Used for generator or keras.utils.Sequence input only. If True, use process-based threading. If unspecified, use_multiprocessing will default to False. . /// A History object. Its History.history attribute is a record of training loss values and metrics values at successive epochs, as well as validation loss values and validation metrics values (if applicable). public History Fit(NDarray x, NDarray y, int? batch_size = null, int epochs = 1, int verbose = 1, Callback[] callbacks = null, float validation_split = 0.0f, NDarray[] validation_data = null, bool shuffle = true, Dictionary class_weight = null, - NDarray sample_weight = null, int initial_epoch = 0, int? steps_per_epoch = null, int? validation_steps = null) + NDarray sample_weight = null, int initial_epoch = 0, int? steps_per_epoch = null, int? validation_steps = null, + int? validation_batch_size = null, int[] validation_freq = null, int max_queue_size = 10, int workers = 1, bool use_multiprocessing = false) { var args = new Dictionary(); args["x"] = x; @@ -107,6 +158,11 @@ public History Fit(NDarray x, NDarray y, int? batch_size = null, int epochs = 1, args["initial_epoch"] = initial_epoch; args["steps_per_epoch"] = steps_per_epoch; args["validation_steps"] = validation_steps; + args["validation_batch_size"] = validation_batch_size; + args["validation_freq"] = validation_freq; + args["max_queue_size"] = max_queue_size; + args["workers"] = workers; + args["use_multiprocessing"] = use_multiprocessing; PyObject py = InvokeMethod("fit", args); @@ -131,10 +187,16 @@ public History Fit(NDarray x, NDarray y, int? batch_size = null, int epochs = 1, /// Integer. Epoch at which to start training (useful for resuming a previous training run). /// Integer or None. Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. When training with input tensors such as TensorFlow data tensors, the default None is equal to the number of samples in your dataset divided by the batch size, or 1 if that cannot be determined. /// Only relevant if steps_per_epoch is specified. Total number of steps (batches of samples) to validate before stopping. + /// Integer or None. Number of samples per validation batch. If unspecified, will default to batch_size. Do not specify the validation_batch_size if your data is in the form of datasets, generators, or keras.utils.Sequence instances + /// Only relevant if validation data is provided. Integer or collections.abc.Container instance (e.g. list, tuple, etc.). If an integer, specifies how many training epochs to run before a new validation run is performed, e.g. validation_freq=2 runs validation every 2 epochs. If a Container, specifies the epochs on which to run validation, e.g. validation_freq=[1, 2, 10] runs validation at the end of the 1st, 2nd, and 10th epochs. + /// Integer. Used for generator or keras.utils.Sequence input only. Maximum size for the generator queue. If unspecified, max_queue_size will default to 10. + /// Integer. Used for generator or keras.utils.Sequence input only. Maximum number of processes to spin up when using process-based threading. If unspecified, workers will default to 1. + /// Boolean. Used for generator or keras.utils.Sequence input only. If True, use process-based threading. 
If unspecified, use_multiprocessing will default to False. . /// A History object. Its History.history attribute is a record of training loss values and metrics values at successive epochs, as well as validation loss values and validation metrics values (if applicable). public History Fit(NDarray x, NDarray[] y, int? batch_size = null, int epochs = 1, int verbose = 1, Callback[] callbacks = null, float validation_split = 0.0f, NDarray[] validation_data = null, bool shuffle = true, Dictionary class_weight = null, - NDarray sample_weight = null, int initial_epoch = 0, int? steps_per_epoch = null, int? validation_steps = null) + NDarray sample_weight = null, int initial_epoch = 0, int? steps_per_epoch = null, int? validation_steps = null, + int? validation_batch_size = null, int[] validation_freq = null, int max_queue_size = 10, int workers = 1, bool use_multiprocessing = false) { var args = new Dictionary(); args["x"] = x; @@ -159,6 +221,11 @@ public History Fit(NDarray x, NDarray[] y, int? batch_size = null, int epochs = args["initial_epoch"] = initial_epoch; args["steps_per_epoch"] = steps_per_epoch; args["validation_steps"] = validation_steps; + args["validation_batch_size"] = validation_batch_size; + args["validation_freq"] = validation_freq; + args["max_queue_size"] = max_queue_size; + args["workers"] = workers; + args["use_multiprocessing"] = use_multiprocessing; PyObject py = InvokeMethod("fit", args); @@ -175,8 +242,14 @@ public History Fit(NDarray x, NDarray[] y, int? batch_size = null, int epochs = /// Optional Numpy array of weights for the test samples, used for weighting the loss function. You can either pass a flat (1D) Numpy array with the same length as the input samples (1:1 mapping between weights and samples), or in the case of temporal data, you can pass a 2D array with shape (samples, sequence_length), to apply a different weight to every timestep of every sample. In this case you should make sure to specifysample_weight_mode="temporal" in compile(). /// Integer or None. Total number of steps (batches of samples) before declaring the evaluation round finished. Ignored with the default value of None. /// List of keras.callbacks.Callback instances. List of callbacks to apply during evaluation. See callbacks. + /// Integer. Used for generator or keras.utils.Sequence input only. Maximum size for the generator queue. If unspecified, max_queue_size will default to 10. + /// Integer. Used for generator or keras.utils.Sequence input only. Maximum number of processes to spin up when using process-based threading. If unspecified, workers will default to 1. + /// Boolean. Used for generator or keras.utils.Sequence input only. If True, use process-based threading. If unspecified, use_multiprocessing will default to False. + /// If True, loss and metric results are returned as a dict, with each key being the name of the metric. If False, they are returned as a list. /// Scalar test loss (if the model has a single output and no metrics) or list of scalars (if the model has multiple outputs and/or metrics). The attribute model.metrics_names will give you the display labels for the scalar outputs. - public double[] Evaluate(NDarray x, NDarray y, int? batch_size = null, int verbose = 1, NDarray sample_weight = null, int? steps = null, Callback[] callbacks = null) + public double[] Evaluate(NDarray x, NDarray y, int? batch_size = null, int verbose = 1, + NDarray sample_weight = null, int? 
steps = null, Callback[] callbacks = null, int max_queue_size=10, + int workers = 1, bool use_multiprocessing = false, bool return_dict = false) { var args = new Dictionary(); args["x"] = x.PyObject; @@ -186,6 +259,10 @@ public double[] Evaluate(NDarray x, NDarray y, int? batch_size = null, int verbo args["sample_weight"] = sample_weight; args["steps"] = steps; args["callbacks"] = callbacks != null ? callbacks : null; + args["max_queue_size"] = max_queue_size; + args["workers"] = workers; + args["use_multiprocessing"] = use_multiprocessing; + args["return_dict"] = return_dict; return InvokeMethod("evaluate", args)?.As(); } @@ -199,8 +276,12 @@ public double[] Evaluate(NDarray x, NDarray y, int? batch_size = null, int verbo /// Verbosity mode, 0 or 1. /// Total number of steps (batches of samples) before declaring the prediction round finished. Ignored with the default value of None. /// List of keras.callbacks.Callback instances. List of callbacks to apply during prediction. See callbacks. + /// Integer. Used for generator or keras.utils.Sequence input only. Maximum size for the generator queue. If unspecified, max_queue_size will default to 10. + /// Integer. Used for generator or keras.utils.Sequence input only. Maximum number of processes to spin up when using process-based threading. If unspecified, workers will default to 1. + /// Boolean. Used for generator or keras.utils.Sequence input only. If True, use process-based threading. If unspecified, use_multiprocessing will default to False. . /// Numpy array(s) of predictions. - public NDarray Predict(NDarray x, int? batch_size = null, int verbose = 1, int? steps = null, Callback[] callbacks = null) + public NDarray Predict(NDarray x, int? batch_size = null, int verbose = 1, int? steps = null + , Callback[] callbacks = null, int max_queue_size = 10, int workers = 1, bool use_multiprocessing = false) { var args = new Dictionary(); args["x"] = x; @@ -208,6 +289,9 @@ public NDarray Predict(NDarray x, int? batch_size = null, int verbose = 1, int? args["verbose"] = verbose; args["steps"] = steps; args["callbacks"] = callbacks != null ? callbacks : null; + args["max_queue_size"] = max_queue_size; + args["workers"] = workers; + args["use_multiprocessing"] = use_multiprocessing; return new NDarray(InvokeMethod("predict", args)); } @@ -253,8 +337,12 @@ public NDarray[] PredictMultipleOutputs(NDarray x, int? batch_size = null, int v /// Verbosity mode, 0 or 1. /// Total number of steps (batches of samples) before declaring the prediction round finished. Ignored with the default value of None. /// List of keras.callbacks.Callback instances. List of callbacks to apply during prediction. See callbacks. + /// Integer. Used for generator or keras.utils.Sequence input only. Maximum size for the generator queue. If unspecified, max_queue_size will default to 10. + /// Integer. Used for generator or keras.utils.Sequence input only. Maximum number of processes to spin up when using process-based threading. If unspecified, workers will default to 1. + /// Boolean. Used for generator or keras.utils.Sequence input only. If True, use process-based threading. If unspecified, use_multiprocessing will default to False. . /// Numpy array(s) of predictions. - public NDarray Predict(List x, int? batch_size = null, int verbose = 1, int? steps = null, Callback[] callbacks = null) + public NDarray Predict(List x, int? batch_size = null, int verbose = 1, int? 
steps = null,
+ Callback[] callbacks = null, int max_queue_size = 10, int workers = 1, bool use_multiprocessing = false)
{
var args = new Dictionary();
@@ -271,6 +359,9 @@ public NDarray Predict(List x, int? batch_size = null, int verbose = 1,
args["verbose"] = verbose;
args["steps"] = steps;
args["callbacks"] = callbacks != null ? callbacks : null;
+ args["max_queue_size"] = max_queue_size;
+ args["workers"] = workers;
+ args["use_multiprocessing"] = use_multiprocessing;
return new NDarray(InvokeMethod("predict", args));
}
@@ -283,15 +374,19 @@ public NDarray Predict(List x, int? batch_size = null, int verbose = 1,
/// Numpy array of target data, or list of Numpy arrays if the model has multiple outputs. If all outputs in the model are named, you can also pass a dictionary mapping output names to Numpy arrays.
/// Optional array of the same length as x, containing weights to apply to the model's loss for each sample. In the case of temporal data, you can pass a 2D array with shape (samples, sequence_length), to apply a different weight to every timestep of every sample. In this case you should make sure to specify sample_weight_mode="temporal" in compile().
/// Optional dictionary mapping class indices (integers) to a weight (float) to apply to the model's loss for the samples from this class during training. This can be useful to tell the model to "pay more attention" to samples from an under-represented class.
+ /// If True, the metrics returned will be only for this batch. If False, the metrics will be statefully accumulated across batches.
+ /// If True, loss and metric results are returned as a dict, with each key being the name of the metric. If False, they are returned as a list.
/// Scalar training loss (if the model has a single output and no metrics) or list of scalars (if the model has multiple outputs and/or metrics). The attribute model.metrics_names will give you the display labels for the scalar outputs.
- public double[] TrainOnBatch(NDarray x, NDarray y, NDarray sample_weight = null, Dictionary class_weight = null)
+ public double[] TrainOnBatch(NDarray x, NDarray y, NDarray sample_weight = null, Dictionary class_weight = null, bool reset_metrics = false, bool return_dict = false)
{
var args = new Dictionary();
args["x"] = x;
args["y"] = y;
args["sample_weight"] = sample_weight;
args["class_weight"] = class_weight;
+ args["reset_metrics"] = reset_metrics;
+ args["return_dict"] = return_dict;
var pyresult = InvokeMethod("train_on_batch", args);
if (pyresult == null) return default;
@@ -304,13 +399,25 @@
return result;
}
- public double[] TrainOnBatch(NDarray[] x, NDarray y, NDarray sample_weight = null, Dictionary class_weight = null)
+ ///
+ /// Runs a single gradient update on a single batch of data.
+ ///
+ /// Numpy array of training data, or list of Numpy arrays if the model has multiple inputs. If all inputs in the model are named, you can also pass a dictionary mapping input names to Numpy arrays.
+ /// Numpy array of target data, or list of Numpy arrays if the model has multiple outputs. If all outputs in the model are named, you can also pass a dictionary mapping output names to Numpy arrays.
+ /// Optional array of the same length as x, containing weights to apply to the model's loss for each sample. In the case of temporal data, you can pass a 2D array with shape (samples, sequence_length), to apply a different weight to every timestep of every sample. In this case you should make sure to specify sample_weight_mode="temporal" in compile().
+ /// Optional dictionary mapping class indices (integers) to a weight (float) to apply to the model's loss for the samples from this class during training. This can be useful to tell the model to "pay more attention" to samples from an under-represented class.
+ /// If True, the metrics returned will be only for this batch. If False, the metrics will be statefully accumulated across batches.
+ /// If True, loss and metric results are returned as a dict, with each key being the name of the metric. If False, they are returned as a list.
+ /// Scalar training loss (if the model has a single output and no metrics) or list of scalars (if the model has multiple outputs and/or metrics). The attribute model.metrics_names will give you the display labels for the scalar outputs.
+ public double[] TrainOnBatch(NDarray[] x, NDarray y, NDarray sample_weight = null, Dictionary class_weight = null, bool reset_metrics = false, bool return_dict = false)
{
var args = new Dictionary();
args["x"] = x;
args["y"] = y;
args["sample_weight"] = sample_weight;
args["class_weight"] = class_weight;
+ args["reset_metrics"] = reset_metrics;
+ args["return_dict"] = return_dict;
var pyresult = InvokeMethod("train_on_batch", args);
if (pyresult == null) return default;
@@ -329,13 +436,17 @@ public double[] TrainOnBatch(NDarray[] x, NDarray y, NDarray sample_weight = nul
/// Numpy array of test data, or list of Numpy arrays if the model has multiple inputs. If all inputs in the model are named, you can also pass a dictionary mapping input names to Numpy arrays.
/// Numpy array of target data, or list of Numpy arrays if the model has multiple outputs. If all outputs in the model are named, you can also pass a dictionary mapping output names to Numpy arrays.
/// Optional array of the same length as x, containing weights to apply to the model's loss for each sample. In the case of temporal data, you can pass a 2D array with shape (samples, sequence_length), to apply a different weight to every timestep of every sample. In this case you should make sure to specify sample_weight_mode="temporal" in compile().
+ /// If True, the metrics returned will be only for this batch. If False, the metrics will be statefully accumulated across batches.
+ /// If True, loss and metric results are returned as a dict, with each key being the name of the metric. If False, they are returned as a list.
/// Scalar test loss (if the model has a single output and no metrics) or list of scalars (if the model has multiple outputs and/or metrics). The attribute model.metrics_names will give you the display labels for the scalar outputs.
- public double[] TestOnBatch(NDarray x, NDarray y, NDarray sample_weight = null)
+ public double[] TestOnBatch(NDarray x, NDarray y, NDarray sample_weight = null, bool reset_metrics = false, bool return_dict = false)
{
var args = new Dictionary();
args["x"] = x;
args["y"] = y;
args["sample_weight"] = sample_weight;
+ args["reset_metrics"] = reset_metrics;
+ args["return_dict"] = return_dict;
//return InvokeMethod("test_on_batch", args)?.As();
var pyresult = InvokeMethod("test_on_batch", args);
@@ -349,12 +460,23 @@ public double[] TestOnBatch(NDarray x, NDarray y, NDarray sample_weight = null)
return result;
}
- public double[] TestOnBatch(NDarray[] x, NDarray y, NDarray sample_weight = null)
+ ///
+ /// Tests the model on a single batch of samples.
+ ///
+ /// Numpy array of test data, or list of Numpy arrays if the model has multiple inputs. If all inputs in the model are named, you can also pass a dictionary mapping input names to Numpy arrays.
+ /// Numpy array of target data, or list of Numpy arrays if the model has multiple outputs. If all outputs in the model are named, you can also pass a dictionary mapping output names to Numpy arrays.
+ /// Optional array of the same length as x, containing weights to apply to the model's loss for each sample. In the case of temporal data, you can pass a 2D array with shape (samples, sequence_length), to apply a different weight to every timestep of every sample. In this case you should make sure to specify sample_weight_mode="temporal" in compile().
+ /// If True, the metrics returned will be only for this batch. If False, the metrics will be statefully accumulated across batches.
+ /// If True, loss and metric results are returned as a dict, with each key being the name of the metric. If False, they are returned as a list.
+ /// Scalar test loss (if the model has a single output and no metrics) or list of scalars (if the model has multiple outputs and/or metrics). The attribute model.metrics_names will give you the display labels for the scalar outputs.
+ public double[] TestOnBatch(NDarray[] x, NDarray y, NDarray sample_weight = null, bool reset_metrics = false, bool return_dict = false)
{
var args = new Dictionary();
args["x"] = x;
args["y"] = y;
args["sample_weight"] = sample_weight;
+ args["reset_metrics"] = reset_metrics;
+ args["return_dict"] = return_dict;
//return InvokeMethod("test_on_batch", args)?.As();
var pyresult = InvokeMethod("test_on_batch", args);
@@ -381,6 +503,11 @@ public NDarray PredictOnBatch(NDarray x)
return new NDarray(InvokeMethod("predict_on_batch", args));
}
+ ///
+ /// Returns predictions for a single batch of samples.
+ ///
+ /// Input samples, as a Numpy array.
+ /// Numpy array(s) of predictions.
public NDarray PredictOnBatch(NDarray[] x)
{
var args = new Dictionary();
@@ -389,6 +516,32 @@ public NDarray PredictOnBatch(NDarray[] x)
return new NDarray(InvokeMethod("predict_on_batch", args));
}
+ ///
+ /// The logic for one inference step.
+ ///
+ /// A nested structure of Tensors.
+ /// Numpy array(s) of predictions.
+ public NDarray PredictStep(NDarray data)
+ {
+ var args = new Dictionary();
+ args["data"] = data;
+
+ return new NDarray(InvokeMethod("predict_step", args));
+ }
+
+ ///
+ /// The logic for one inference step.
+ ///
+ /// A nested structure of Tensors.
+ /// Numpy array(s) of predictions.
+ public NDarray PredictStep(NDarray[] data) + { + var args = new Dictionary(); + args["data"] = data; + + return new NDarray(InvokeMethod("predict_step", args)); + } + public History FitGenerator(Sequence generator, int? steps_per_epoch = null, int epochs = 1, int verbose = 1, Callback[] callbacks = null, Sequence validation_data = null, int? validation_steps = null, int validation_freq = 1, Dictionary class_weight = null, int max_queue_size = 10, int workers = 1, bool use_multiprocessing = false, bool shuffle = true, int initial_epoch = 0) @@ -473,19 +626,23 @@ public string ToJson() /// /// Saves the weight of the trained model to a file. /// - /// The path of the weight to save. - public void SaveWeight(string path) + /// str or pathlib.Path object. Path where to save the model. + /// Whether we should overwrite any existing model at the target location, or instead ask the user via an interactive prompt. + /// Either "keras", "tf", "h5", indicating whether to save the model in the native Keras format (.keras), in the TensorFlow SavedModel format (referred to as "SavedModel" below), or in the legacy HDF5 format (.h5). Defaults to "tf" in TF 2.X, and "h5" in TF 1.X. + public void SaveWeight(string filepath, bool overwrite = true, string save_format = "tf") { - PyInstance.save_weights(path); + PyInstance.save_weights(filepath, overwrite, save_format); } /// - /// Save the model to h5 file + /// Saves a model as a TensorFlow SavedModel or HDF5 file. /// - /// The path with filename eg: model.h5. - public void Save(string filepath, bool overwrite = true, bool include_optimizer = true) + /// str or pathlib.Path object. Path where to save the model. + /// Whether we should overwrite any existing model at the target location, or instead ask the user via an interactive prompt. + /// Either "keras", "tf", "h5", indicating whether to save the model in the native Keras format (.keras), in the TensorFlow SavedModel format (referred to as "SavedModel" below), or in the legacy HDF5 format (.h5). Defaults to "tf" in TF 2.X, and "h5" in TF 1.X. + public void Save(string filepath, bool overwrite = true, string save_format = "tf") { - PyInstance.save(filepath: filepath, overwrite: overwrite, include_optimizer: include_optimizer); + PyInstance.save(filepath: filepath, overwrite: overwrite, save_format: save_format); } /// @@ -525,10 +682,17 @@ public void SetWeights(List weights) /// /// Loads the weight to the model from a file. /// - /// The path of of the weight file. - public void LoadWeight(string path) + /// String, path to the weights file to load. For weight files in TensorFlow format, this is the file prefix (the same as was passed to save_weights()). 
+ /// Boolean, whether to skip loading of layers where there is a mismatch in the number of weights, or a mismatch in the shape of the weights.
+ /// Boolean, whether to load weights by name or by topological order. Only topological loading is supported for weight files in the TensorFlow format.
+ /// Optional tf.train.CheckpointOptions object that specifies options for loading weights.
+ public void LoadWeight(string filepath, bool skip_mismatch= false, bool by_name= false, CheckpointOptions options= null)
{
- PyInstance.load_weights(path);
+ PyObject optionPyObject = null;
+ if (options != null)
+ optionPyObject = options.ToPython();
+
+ PyInstance.load_weights(filepath, skip_mismatch, by_name, optionPyObject);
}
///
@@ -587,7 +751,7 @@ public static BaseModel ModelFromYaml(string json_string)
public void SaveOnnx(string filePath)
{
var onnx_model = Instance.keras2onnx.convert_keras(model: (PyObject)this.PyInstance);
- File.WriteAllText(filePath, onnx_model.ToString());
+ System.IO.File.WriteAllText(filePath, onnx_model.ToString());
}
///
@@ -595,9 +759,9 @@ public void SaveOnnx(string filePath)
///
/// Length of the line.
/// The positions.
- public void Summary(int? line_length = null, float[] positions = null)
+ public void Summary(int? line_length = null, float[] positions = null, string print_fn = null, bool expand_nested = false, bool show_trainable = false, int[] layer_range = null)
{
- PyInstance.summary(line_length: line_length, positions: positions);
+ PyInstance.summary(line_length: line_length, positions: positions, print_fn: print_fn, expand_nested: expand_nested, show_trainable: show_trainable, layer_range: layer_range);
}
///
@@ -609,5 +773,41 @@ public void SaveTensorflowJSFormat(string artifacts_dir, bool quantize = false)
{
Instance.tfjs.converters.save_keras_model(model: this.PyInstance, artifacts_dir: artifacts_dir);
}
+
+ ///
+ /// Export the model
+ ///
+ /// File path to export
+ public void Export(string filepath)
+ {
+ PyInstance.export(filepath);
+ }
+
+ ///
+ /// Get the layer based on name and/or index
+ ///
+ /// Name of the layer
+ /// Index of the layer
+ ///
+ public BaseLayer GetLayer(string name = null, int? index = null)
+ {
+ return (BaseLayer)PyInstance.get_layer(name, index);
+ }
+
+ ///
+ /// Resets the state of all the metrics in the model.
+ ///
+ public void ResetMetrics()
+ {
+ PyInstance.reset_metrics();
+ }
+
+ ///
+ /// Reset the states of the model
+ ///
+ public void ResetStates()
+ {
+ PyInstance.reset_states();
+ }
}
}
diff --git a/Keras/Optimizers.cs b/Keras/Optimizers.cs
index aae320a..c545549 100644
--- a/Keras/Optimizers.cs
+++ b/Keras/Optimizers.cs
@@ -1,8 +1,97 @@
using System;
using System.Collections.Generic;
+using System.Runtime.InteropServices;
using System.Text;
namespace Keras.Optimizers
{
+ ///
+ /// Adam optimizer. Default parameters follow those provided in the original paper.
+ ///
+ ///
+ public class Adam : Base
+ {
+ public Adam(float learning_rate = 0.001f, float beta_1= 0.9f, float beta_2= 0.999f, float epsilon = 1e-07f, bool amsgrad = false, float? weight_decay = null,
+ float? clipnorm = null, float? clipvalue = null, float? global_clipnorm = null, bool use_ema = false, float ema_momentum = 0.99f,
+ int? ema_overwrite_frequency = null, bool jit_compile = true)
+
+    /// <summary>
+    /// Optimizer that implements the Adafactor algorithm.
+    /// </summary>
+    /// <seealso cref="Keras.Base" />
+    public class Adafactor : Base
+    {
+        public Adafactor(float learning_rate = 0.001f, float beta_2_decay = -0.8f, float epsilon_1 = 1e-30f, float epsilon_2 = 1e-3f, float clip_threshold = 1,
+            bool relative_step = true, string name = null, float? weight_decay = null, float? clipnorm = null, float? clipvalue = null,
+            float? global_clipnorm = null, bool use_ema = false, float ema_momentum = 0.99f, int? ema_overwrite_frequency = null, bool jit_compile = true)
+        {
+            Parameters["learning_rate"] = learning_rate;
+            Parameters["beta_2_decay"] = beta_2_decay;
+            Parameters["epsilon_1"] = epsilon_1;
+            Parameters["epsilon_2"] = epsilon_2;
+            Parameters["clip_threshold"] = clip_threshold;
+            Parameters["relative_step"] = relative_step;
+            Parameters["name"] = name;
+            Parameters["weight_decay"] = weight_decay;
+            Parameters["clipnorm"] = clipnorm;
+            Parameters["clipvalue"] = clipvalue;
+            Parameters["global_clipnorm"] = global_clipnorm;
+            Parameters["use_ema"] = use_ema;
+            Parameters["ema_momentum"] = ema_momentum;
+            Parameters["ema_overwrite_frequency"] = ema_overwrite_frequency;
+            Parameters["jit_compile"] = jit_compile;
+
+            PyInstance = Instance.keras.optimizers.Adafactor;
+            Init();
+        }
+    }
+
+    /// <summary>
+    /// Optimizer that implements the AdamW algorithm (Adam with decoupled weight decay).
+    /// </summary>
+    /// <seealso cref="Keras.Base" />
+    public class AdamW : Base
+    {
+        public AdamW(float learning_rate = 0.001f, float weight_decay = 0.004f, float beta_1 = 0.9f, float beta_2 = 0.999f, float epsilon = 1e-07f, bool amsgrad = false,
+            float? clipnorm = null, float? clipvalue = null, float? global_clipnorm = null, bool use_ema = false, float ema_momentum = 0.99f,
+            int? ema_overwrite_frequency = null, bool jit_compile = true)
+        {
+            Parameters["learning_rate"] = learning_rate;
+            Parameters["weight_decay"] = weight_decay;
+            Parameters["beta_1"] = beta_1;
+            Parameters["beta_2"] = beta_2;
+            Parameters["epsilon"] = epsilon;
+            Parameters["amsgrad"] = amsgrad;
+            Parameters["clipnorm"] = clipnorm;
+            Parameters["clipvalue"] = clipvalue;
+            Parameters["global_clipnorm"] = global_clipnorm;
+            Parameters["use_ema"] = use_ema;
+            Parameters["ema_momentum"] = ema_momentum;
+            Parameters["ema_overwrite_frequency"] = ema_overwrite_frequency;
+            Parameters["jit_compile"] = jit_compile;
+
+            PyInstance = Instance.keras.optimizers.AdamW;
+            Init();
+        }
+    }
+}
+
+namespace Keras.Optimizers.Legacy
 {
     /// <summary>
     /// Stochastic gradient descent optimizer. Includes support for momentum, learning rate decay, and Nesterov momentum.
     /// </summary>
@@ -24,7 +113,7 @@ public SGD(float lr = 0.01f, float momentum = 0.0f, float decay = 0.0f, bool nesterov = false)
             Parameters["decay"] = decay;
             Parameters["nesterov"] = nesterov;
 
-            PyInstance = Instance.keras.optimizers.SGD;
+            PyInstance = Instance.keras.optimizers.legacy.SGD;
             Init();
         }
     }
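The remaining lr-based optimizers below get the same treatment as SGD: unchanged C# signatures, rebound to keras.optimizers.legacy.*. A sketch of an existing call site after the namespace move; the model construction is illustrative.

    // Illustrative only: legacy SGD keeps its lr-based signature but now
    // binds to keras.optimizers.legacy.SGD at runtime.
    using Keras.Optimizers.Legacy;

    var model = new Sequential();
    model.Add(new Dense(1, activation: "sigmoid", input_shape: new Shape(4)));
    model.Compile(optimizer: new SGD(lr: 0.01f, momentum: 0.9f, nesterov: true),
                  loss: "binary_crossentropy", metrics: new string[] { "accuracy" });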
@@ -50,7 +139,7 @@ public RMSprop(float lr = 0.01f, float rho = 0.9f, float? epsilon = null, float decay = 0.0f)
             Parameters["epsilon"] = epsilon;
             Parameters["decay"] = decay;
 
-            PyInstance = Instance.keras.optimizers.RMSprop;
+            PyInstance = Instance.keras.optimizers.legacy.RMSprop;
             Init();
         }
     }
@@ -73,7 +162,7 @@ public Adagrad(float lr = 0.01f, float? epsilon = null, float decay = 0.0f)
             Parameters["epsilon"] = epsilon;
             Parameters["decay"] = lr;
 
-            PyInstance = Instance.keras.optimizers.Adagrad;
+            PyInstance = Instance.keras.optimizers.legacy.Adagrad;
             Init();
         }
     }
@@ -98,7 +187,7 @@ public Adadelta(float lr = 1.0f, float rho = 0.95f, float? epsilon = null, float decay = 0.0f)
             Parameters["epsilon"] = epsilon;
             Parameters["decay"] = decay;
 
-            PyInstance = Instance.keras.optimizers.Adadelta;
+            PyInstance = Instance.keras.optimizers.legacy.Adadelta;
             Init();
         }
     }
@@ -118,7 +207,7 @@ public class Adam : Base
         /// <param name="epsilon">The epsilon.</param>
         /// <param name="decay">The decay.</param>
         /// <param name="amsgrad">boolean. Whether to apply the AMSGrad variant of this algorithm from the paper "On the Convergence of Adam and Beyond".</param>
-        public Adam(float lr = 0.001f, float beta_1= 0.9f, float beta_2= 0.999f, float? epsilon = null, float decay = 0.0f, bool amsgrad = false)
+        public Adam(float lr = 0.001f, float beta_1 = 0.9f, float beta_2 = 0.999f, float? epsilon = null, float decay = 0.0f, bool amsgrad = false)
         {
             Parameters["lr"] = lr;
             Parameters["beta_1"] = beta_1;
@@ -127,7 +216,7 @@ public Adam(float lr = 0.001f, float beta_1= 0.9f, float beta_2= 0.999f, float? epsilon = null, float decay = 0.0f, bool amsgrad = false)
             Parameters["decay"] = decay;
             Parameters["amsgrad"] = amsgrad;
 
-            PyInstance = Instance.keras.optimizers.Adam;
+            PyInstance = Instance.keras.optimizers.legacy.Adam;
             Init();
         }
     }
@@ -154,7 +243,7 @@ public Adamax(float lr = 0.002f, float beta_1 = 0.9f, float beta_2 = 0.999f, float? epsilon = null, float decay = 0.0f)
             Parameters["epsilon"] = epsilon;
             Parameters["decay"] = decay;
 
-            PyInstance = Instance.keras.optimizers.Adamax;
+            PyInstance = Instance.keras.optimizers.legacy.Adamax;
             Init();
         }
     }
@@ -180,7 +269,7 @@ public Nadam(float lr = 0.002f, float beta_1 = 0.9f, float beta_2 = 0.999f)
             Parameters["beta_1"] = beta_1;
             Parameters["beta_2"] = beta_2;
 
-            PyInstance = Instance.keras.optimizers.Adamax;
+            PyInstance = Instance.keras.optimizers.legacy.Nadam;
             Init();
         }
     }
@@ -203,7 +292,7 @@ public class Ftrl : Base
         /// <param name="l2rs">float <= 0. Lambda 2 Regularization Strength.</param>
         /// <param name="l2srs">float <= 0. Lambda 2 Shrinkage Regularization Strength.</param>
         /// <param name="beta">floats, 0 < beta < 1. Generally close to 1.</param>
-        public Ftrl(float lr = 0.001f,float lrp = -0.5f, float iav = 0.1f, float l1rs = 0.0f, float l2rs = 0.0f, float l2srs = 0.0f, float beta = 0.0f)
+        public Ftrl(float lr = 0.001f, float lrp = -0.5f, float iav = 0.1f, float l1rs = 0.0f, float l2rs = 0.0f, float l2srs = 0.0f, float beta = 0.0f)
         {
             Parameters["learning_rate"] = lr;
             Parameters["learning_rate_power"] = lrp;
@@ -213,7 +302,7 @@ public Ftrl(float lr = 0.001f,float lrp = -0.5f, float iav = 0.1f, float l1rs = 0.0f, float l2rs = 0.0f, float l2srs = 0.0f, float beta = 0.0f)
             Parameters["l2_shrinkage_regularization_strength"] = l2srs;
             Parameters["beta"] = beta;
 
-            PyInstance = Instance.keras.optimizers.Ftrl;
+            PyInstance = Instance.keras.optimizers.legacy.Ftrl;
             Init();
         }
     }
diff --git a/Keras/Base.cs b/Tensorflow/Base.cs
similarity index 99%
rename from Keras/Base.cs
rename to Tensorflow/Base.cs
index 1cb0c5a..a97cd75 100644
--- a/Keras/Base.cs
+++ b/Tensorflow/Base.cs
@@ -8,7 +8,7 @@ namespace Keras
 {
     public abstract class Base : Keras
     {
-        internal dynamic PyInstance;
+        public dynamic PyInstance;
 
         public Dictionary<string, object> Parameters = new Dictionary<string, object>();
         public object None = null;
diff --git a/Keras/InternalTypes.cs b/Tensorflow/InternalTypes.cs
similarity index 100%
rename from Keras/InternalTypes.cs
rename to Tensorflow/InternalTypes.cs
diff --git a/Keras/Keras.cs b/Tensorflow/Keras.cs
similarity index 95%
rename from Keras/Keras.cs
rename to Tensorflow/Keras.cs
index dc0504f..93687f0 100644
--- a/Keras/Keras.cs
+++ b/Tensorflow/Keras.cs
@@ -1,6 +1,4 @@
-using Keras.Layers;
-using Keras.Utils;
-using Numpy;
+using Numpy;
 using Numpy.Models;
 using Python.Runtime;
 using System;
@@ -22,7 +20,7 @@ public class Keras : IDisposable
 
         private static bool alreadyDisabled = false;
 
-        private static Lazy<Keras> _instance = new Lazy<Keras>(() =>
+        public static Lazy<Keras> _instance = new Lazy<Keras>(() =>
         {
             var instance = new Keras();
             instance.keras = InstallAndImport(Setup.KerasModule);
@@ -92,7 +90,7 @@ private static PyObject InstallAndImport(string module)
 
         private bool IsInitialized => keras != null;
 
-        internal Keras() { }
+        public Keras() { }
 
         public void Dispose()
         {
@@ -100,7 +98,7 @@ public void Dispose()
             PythonEngine.Shutdown();
         }
 
-        internal static PyObject ToPython(object obj)
+        public static PyObject ToPython(object obj)
         {
             if (obj == null) return Runtime.None;
             switch (obj)
@@ -122,7 +120,7 @@ internal static PyObject ToPython(object obj)
                 case Slice o: return o.ToPython();
                 case PythonObject o: return o.PyObject;
                 case PyObject o: return o;
-                case Sequence o: return o.PyInstance;
+                //case Sequence o: return o.PyInstance;
                 case StringOrInstance o: return o.PyObject;
                 case KerasFunction o: return o.PyObject;
                 case Base o: return o.PyInstance;
diff --git a/Keras/Setup.cs b/Tensorflow/Setup.cs
similarity index 100%
rename from Keras/Setup.cs
rename to Tensorflow/Setup.cs
diff --git a/Tensorflow/Train/CheckpointOptions.cs b/Tensorflow/Train/CheckpointOptions.cs
new file mode 100644
index 0000000..75fc983
--- /dev/null
+++ b/Tensorflow/Train/CheckpointOptions.cs
@@ -0,0 +1,12 @@
+using Keras;
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace Tensorflow.Train
+{
+    public class CheckpointOptions : Base
+    {
+
+    }
+}
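To connect the new CheckpointOptions stub to the reworked LoadWeight overload earlier in this diff, a speculative round-trip sketch. CheckpointOptions carries no members yet, so options is left null; everything else uses only APIs touched by this diff, and the file name is illustrative.

    // Illustrative only: reload previously saved HDF5 weights via the new overload.
    var model = new Sequential();
    model.Add(new Dense(32, activation: "relu", input_shape: new Shape(8)));
    model.Compile(optimizer: "adam", loss: "mse");

    // by_name loading applies to HDF5 weight files; pass a CheckpointOptions
    // instance here once the stub gains real fields.
    model.LoadWeight("my_model_weights.h5", skip_mismatch: false, by_name: true, options: null);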