From 88fb5109e8d0a8d1d090795aedd51f41a9778a81 Mon Sep 17 00:00:00 2001
From: Yang Wang <yang3.wang@intel.com>
Date: Fri, 8 Nov 2019 13:23:27 +0800
Subject: [PATCH] Support iterating over a dataset in sequential order when training
 (#1743)

* support iterating over a dataset in sequential order when training

add unit test

fix style

* unpersist

* fix bug
---
 .../dllib/src/bigdl/dllib/feature/common.py   | 42 +++++++++++++++----
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/python/dllib/src/bigdl/dllib/feature/common.py b/python/dllib/src/bigdl/dllib/feature/common.py
index e25fce0b018..15e90a66d7a 100644
--- a/python/dllib/src/bigdl/dllib/feature/common.py
+++ b/python/dllib/src/bigdl/dllib/feature/common.py
@@ -224,7 +224,9 @@ def __init__(self, jvalue=None, bigdl_type="float"):
             self.value = jvalue
 
     @classmethod
-    def image_frame(cls, image_frame, memory_type="DRAM", bigdl_type="float"):
+    def image_frame(cls, image_frame, memory_type="DRAM",
+                    sequential_order=False,
+                    shuffle=True, bigdl_type="float"):
         """
         Create FeatureSet from ImageFrame.
         :param image_frame: ImageFrame
@@ -235,15 +237,21 @@ def image_frame(cls, image_frame, memory_type="DRAM", bigdl_type="float"):
                               of the data into memory during the training. After going through the
                               1/n, we will release the current cache, and load another 1/n into
                               memory.
+        :param sequential_order: whether to iterate the elements in the feature set
+                                 in sequential order for training.
+        :param shuffle: whether to shuffle the elements in each partition before each epoch
+                        when training
         :param bigdl_type: numeric type
         :return: A feature set
         """
         jvalue = callBigDlFunc(bigdl_type, "createFeatureSetFromImageFrame",
-                               image_frame, memory_type)
+                               image_frame, memory_type, sequential_order, shuffle)
         return cls(jvalue=jvalue)
 
     @classmethod
-    def image_set(cls, imageset, memory_type="DRAM", bigdl_type="float"):
+    def image_set(cls, imageset, memory_type="DRAM",
+                  sequential_order=False,
+                  shuffle=True, bigdl_type="float"):
         """
         Create FeatureSet from ImageFrame.
         :param imageset: ImageSet
@@ -254,15 +262,22 @@ def image_set(cls, imageset, memory_type="DRAM", bigdl_type="float"):
                               of the data into memory during the training. After going through the
                               1/n, we will release the current cache, and load another 1/n into
                               memory.
+        :param sequential_order: whether to iterate the elements in the feature set
+                                 in sequential order for training.
+        :param shuffle: whether to shuffle the elements in each partition before each epoch
+                        when training
         :param bigdl_type: numeric type
         :return: A feature set
         """
         jvalue = callBigDlFunc(bigdl_type, "createFeatureSetFromImageFrame",
-                               imageset.to_image_frame(), memory_type)
+                               imageset.to_image_frame(), memory_type,
+                               sequential_order, shuffle)
         return cls(jvalue=jvalue)
 
     @classmethod
-    def sample_rdd(cls, rdd, memory_type="DRAM", bigdl_type="float"):
+    def sample_rdd(cls, rdd, memory_type="DRAM",
+                   sequential_order=False,
+                   shuffle=True, bigdl_type="float"):
         """
         Create FeatureSet from RDD[Sample].
         :param rdd: A RDD[Sample]
@@ -273,14 +288,20 @@ def sample_rdd(cls, rdd, memory_type="DRAM", bigdl_type="float"):
                               of the data into memory during the training. After going through the
                               1/n, we will release the current cache, and load another 1/n into
                               memory.
+        :param sequential_order: whether to iterate the elements in the feature set
+                                 in sequential order when training.
+        :param shuffle: whether to shuffle the elements in each partition before each epoch
+                        when training
         :param bigdl_type:numeric type
         :return: A feature set
         """
-        jvalue = callBigDlFunc(bigdl_type, "createSampleFeatureSetFromRDD", rdd, memory_type)
+        jvalue = callBigDlFunc(bigdl_type, "createSampleFeatureSetFromRDD", rdd,
+                               memory_type, sequential_order, shuffle)
         return cls(jvalue=jvalue)
 
     @classmethod
-    def rdd(cls, rdd, memory_type="DRAM", bigdl_type="float"):
+    def rdd(cls, rdd, memory_type="DRAM", sequential_order=False,
+            shuffle=True, bigdl_type="float"):
         """
         Create FeatureSet from RDD.
         :param rdd: A RDD
@@ -291,10 +312,15 @@ def rdd(cls, rdd, memory_type="DRAM", bigdl_type="float"):
                               of the data into memory during the training. After going through the
                               1/n, we will release the current cache, and load another 1/n into
                               memory.
+        :param sequential_order: whether to iterate the elements in the feature set
+                                 in sequential order when training.
+        :param shuffle: whether to shuffle the elements in each partition before each epoch
+                        when training
         :param bigdl_type:numeric type
         :return: A feature set
         """
-        jvalue = callBigDlFunc(bigdl_type, "createFeatureSetFromRDD", rdd, memory_type)
+        jvalue = callBigDlFunc(bigdl_type, "createFeatureSetFromRDD", rdd,
+                               memory_type, sequential_order, shuffle)
         return cls(jvalue=jvalue)
 
     def transform(self, transformer):