From 88fb5109e8d0a8d1d090795aedd51f41a9778a81 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Fri, 8 Nov 2019 13:23:27 +0800 Subject: [PATCH] Support iterate a dataset in sequential order when training (#1743) * support iterate a dataset in sequential order when training add unit test fix style * unpersist * fix bug --- .../dllib/src/bigdl/dllib/feature/common.py | 42 +++++++++++++++---- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/python/dllib/src/bigdl/dllib/feature/common.py b/python/dllib/src/bigdl/dllib/feature/common.py index e25fce0b018..15e90a66d7a 100644 --- a/python/dllib/src/bigdl/dllib/feature/common.py +++ b/python/dllib/src/bigdl/dllib/feature/common.py @@ -224,7 +224,9 @@ def __init__(self, jvalue=None, bigdl_type="float"): self.value = jvalue @classmethod - def image_frame(cls, image_frame, memory_type="DRAM", bigdl_type="float"): + def image_frame(cls, image_frame, memory_type="DRAM", + sequential_order=False, + shuffle=True, bigdl_type="float"): """ Create FeatureSet from ImageFrame. :param image_frame: ImageFrame @@ -235,15 +237,21 @@ def image_frame(cls, image_frame, memory_type="DRAM", bigdl_type="float"): of the data into memory during the training. After going through the 1/n, we will release the current cache, and load another 1/n into memory. + :param sequential_order: whether to iterate the elements in the feature set + in sequential order for training. + :param shuffle: whether to shuffle the elements in each partition before each epoch + when training :param bigdl_type: numeric type :return: A feature set """ jvalue = callBigDlFunc(bigdl_type, "createFeatureSetFromImageFrame", - image_frame, memory_type) + image_frame, memory_type, sequential_order, shuffle) return cls(jvalue=jvalue) @classmethod - def image_set(cls, imageset, memory_type="DRAM", bigdl_type="float"): + def image_set(cls, imageset, memory_type="DRAM", + sequential_order=False, + shuffle=True, bigdl_type="float"): """ Create FeatureSet from ImageFrame. :param imageset: ImageSet @@ -254,15 +262,22 @@ def image_set(cls, imageset, memory_type="DRAM", bigdl_type="float"): of the data into memory during the training. After going through the 1/n, we will release the current cache, and load another 1/n into memory. + :param sequential_order: whether to iterate the elements in the feature set + in sequential order for training. + :param shuffle: whether to shuffle the elements in each partition before each epoch + when training :param bigdl_type: numeric type :return: A feature set """ jvalue = callBigDlFunc(bigdl_type, "createFeatureSetFromImageFrame", - imageset.to_image_frame(), memory_type) + imageset.to_image_frame(), memory_type, + sequential_order, shuffle) return cls(jvalue=jvalue) @classmethod - def sample_rdd(cls, rdd, memory_type="DRAM", bigdl_type="float"): + def sample_rdd(cls, rdd, memory_type="DRAM", + sequential_order=False, + shuffle=True, bigdl_type="float"): """ Create FeatureSet from RDD[Sample]. :param rdd: A RDD[Sample] @@ -273,14 +288,20 @@ def sample_rdd(cls, rdd, memory_type="DRAM", bigdl_type="float"): of the data into memory during the training. After going through the 1/n, we will release the current cache, and load another 1/n into memory. + :param sequential_order: whether to iterate the elements in the feature set + in sequential order when training. + :param shuffle: whether to shuffle the elements in each partition before each epoch + when training :param bigdl_type:numeric type :return: A feature set """ - jvalue = callBigDlFunc(bigdl_type, "createSampleFeatureSetFromRDD", rdd, memory_type) + jvalue = callBigDlFunc(bigdl_type, "createSampleFeatureSetFromRDD", rdd, + memory_type, sequential_order, shuffle) return cls(jvalue=jvalue) @classmethod - def rdd(cls, rdd, memory_type="DRAM", bigdl_type="float"): + def rdd(cls, rdd, memory_type="DRAM", sequential_order=False, + shuffle=True, bigdl_type="float"): """ Create FeatureSet from RDD. :param rdd: A RDD @@ -291,10 +312,15 @@ def rdd(cls, rdd, memory_type="DRAM", bigdl_type="float"): of the data into memory during the training. After going through the 1/n, we will release the current cache, and load another 1/n into memory. + :param sequential_order: whether to iterate the elements in the feature set + in sequential order when training. + :param shuffle: whether to shuffle the elements in each partition before each epoch + when training :param bigdl_type:numeric type :return: A feature set """ - jvalue = callBigDlFunc(bigdl_type, "createFeatureSetFromRDD", rdd, memory_type) + jvalue = callBigDlFunc(bigdl_type, "createFeatureSetFromRDD", rdd, + memory_type, sequential_order, shuffle) return cls(jvalue=jvalue) def transform(self, transformer):