Incremental Training for imagenet #1391

Merged · 26 commits · Jul 31, 2019
Changes from 15 commits
147 changes: 147 additions & 0 deletions zoo/src/main/scala/com/intel/analytics/zoo/common/ZooTrigger.scala
@@ -0,0 +1,147 @@
package com.intel.analytics.zoo.common

import com.intel.analytics.bigdl.optim.Trigger
import com.intel.analytics.bigdl.utils.{T, Table}

/**
 * A trigger specifies one or more time points during training at which
 * a corresponding action is taken.
 */
trait ZooTrigger extends Trigger {
  protected var zooState: Table = T()

  /**
   * Set the shared state table. It holds extra training metrics
   * (e.g. data-slice progress) that triggers can use to decide when to fire.
   * @param zooState zoo state table
   */
  private[zoo] def setZooState(zooState: Table): Unit = {
    this.zooState = zooState
  }
}
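
A minimal sketch of how a custom trigger could read the shared zooState table (illustrative only, not part of this diff; HalfSlices is a hypothetical name, and the "numSlice"/"currentSlice" keys are the ones EveryEpoch below relies on):

  // Fires once the driver reports that at least half of the
  // data slices of the current epoch have been consumed.
  case class HalfSlices() extends ZooTrigger {
    override def apply(state: Table): Boolean = {
      zooState.contains("numSlice") && zooState.contains("currentSlice") &&
        zooState[Int]("currentSlice") * 2 >= zooState[Int]("numSlice")
    }
  }
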

/**
 * A trigger that fires when each epoch finishes.
 * Can be used as the trigger in setValidation and setCheckpoint
 * in Optimizer, and in TrainSummary.setSummaryTrigger.
 */
case class EveryEpoch() extends ZooTrigger {

Contributor:
If memoryType DRAM is used, the validation phase is ignored at the end of each epoch.

Contributor Author:
Fixed.

  private var lastEpoch = -1

  override def apply(state: Table): Boolean = {
    if (lastEpoch == -1) {
      lastEpoch = state[Int]("epoch")
      false
    } else if (state[Int]("epoch") <= lastEpoch) {
      false
    } else if (zooState.contains("numSlice") && zooState.contains("currentSlice")
      && zooState[Int]("currentSlice") % zooState[Int]("numSlice") == 0) {
      lastEpoch = state[Int]("epoch")
      true
    } else {
      false
    }
  }
}
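
A quick worked example of the sliced-epoch semantics above (illustrative; it assumes the driver maintains "numSlice"/"currentSlice" as EveryEpoch expects, and that setZooState is called from within the zoo package since it is private[zoo]):

  val t = EveryEpoch()
  t.setZooState(T("numSlice" -> 4, "currentSlice" -> 4))
  assert(!t.apply(T("epoch" -> 1))) // first call only records the starting epoch
  assert(t.apply(T("epoch" -> 2)))  // epoch advanced and all 4 slices were trained
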
/**
 * A trigger that fires every "n" iterations.
 * Can be used as the trigger in setValidation and setCheckpoint
 * in Optimizer, and in TrainSummary.setSummaryTrigger.
 *
 * @param interval - trigger interval "n"
 */
case class SeveralIteration(interval: Int) extends ZooTrigger {
  override def apply(state: Table): Boolean = {
    val curIteration = state[Int]("neval") - 1
    curIteration != 0 && curIteration % interval == 0
  }
}
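
An illustrative check of the off-by-one handling above: since curIteration is state("neval") - 1, the trigger fires after every `interval` completed iterations and skips the very first one:

  val s = SeveralIteration(1000)
  assert(!s.apply(T("neval" -> 1)))   // curIteration = 0: skipped
  assert(s.apply(T("neval" -> 1001))) // curIteration = 1000: fires
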

/**
 * A trigger that fires when training reaches
 * the number of epochs specified by "max".
 * Usually used in Optimizer.setEndWhen.
 *
 * @param max the epoch when the action takes place
 */
case class MaxEpoch(max: Int) extends ZooTrigger {
  override def apply(state: Table): Boolean = {
    state[Int]("epoch") > max
  }
}

/**
 * A trigger that fires when training reaches
 * the number of iterations specified by "max".
 * Usually used in Optimizer.setEndWhen.
 *
 * @param max the iteration when the action takes place
 */
case class MaxIteration(max: Int) extends ZooTrigger {
  override def apply(state: Table): Boolean = {
    state[Int]("neval") > max
  }
}

/**
 * A trigger that fires when the validation score is larger than "max".
 * @param max max score
 */
case class MaxScore(max: Float) extends ZooTrigger {
  override def apply(state: Table): Boolean = {
    state[Float]("score") > max
  }
}

/**
 * A trigger that fires when the training loss is less than "min".
 * @param min min loss
 */
case class MinLoss(min: Float) extends ZooTrigger {
  override def apply(state: Table): Boolean = {
    state[Float]("Loss") < min
  }
}

/**
 * A trigger that combines other triggers and fires only when
 * all of them fire (logical AND).
 * @param first first trigger
 * @param others the other triggers
 */
case class And(first: ZooTrigger, others: ZooTrigger*) extends ZooTrigger {
  override def setZooState(zooState: Table): Unit = {
    super.setZooState(zooState)
    first.setZooState(zooState)
    others.foreach { zt =>
      zt.setZooState(zooState)
    }
  }

  override def apply(state: Table): Boolean = {
    first.apply(state) && others.forall(_.apply(state))
  }
}

/**
 * A trigger that combines other triggers and fires when
 * any of them fires (logical OR).
 * @param first first trigger
 * @param others the other triggers
 */
case class Or(first: ZooTrigger, others: ZooTrigger*) extends ZooTrigger {
  override def setZooState(zooState: Table): Unit = {
    super.setZooState(zooState)
    first.setZooState(zooState)
    others.foreach { zt =>
      zt.setZooState(zooState)
    }
  }

  override def apply(state: Table): Boolean = {
    first.apply(state) || others.exists(_.apply(state))
  }
}
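
A usage sketch for the combinators (illustrative, not from this diff): they compose the same way as BigDL's built-in triggers, e.g. stop when either 10 epochs have run or the training loss drops below 0.1:

  val endTrigger = Or(MaxEpoch(10), MinLoss(0.1f))
  val checkpointTrigger = And(EveryEpoch(), MinLoss(1.0f))

Because And/Or propagate setZooState to their children, slicing-aware triggers such as EveryEpoch keep working inside a combinator.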
@@ -19,14 +19,13 @@ package com.intel.analytics.zoo.examples.inception
 import java.nio.ByteBuffer
 
 import com.intel.analytics.bigdl.dataset._
-import com.intel.analytics.bigdl.dataset.image.CropCenter
-import com.intel.analytics.bigdl.dataset.image.{BGRImgCropper, BGRImgNormalizer, BytesToBGRImg, MTLabeledBGRImgToBatch, HFlip => DatasetHFlip}
+import com.intel.analytics.bigdl.dataset.image.{BGRImgCropper, BGRImgNormalizer, BGRImgToSample, BytesToBGRImg, CropCenter, MTLabeledBGRImgToBatch, HFlip => DatasetHFlip}
 import com.intel.analytics.bigdl.tensor.Tensor
 import com.intel.analytics.bigdl.transform.vision.image._
 import com.intel.analytics.bigdl.utils.{Engine, T}
 import com.intel.analytics.zoo.feature.image._
 import com.intel.analytics.zoo.feature.{DistributedFeatureSet, FeatureSet}
-import com.intel.analytics.zoo.feature.pmem.{DRAM, MemoryType, PMEM}
+import com.intel.analytics.zoo.feature.pmem._
 import com.intel.analytics.zoo.pipeline.api.keras.layers.utils.EngineRef
 import org.apache.hadoop.io.Text
 import org.apache.log4j.Logger
@@ -78,27 +77,24 @@ object ImageNet2012 {
       coresPerNode: Int,
       classNumber: Int,
       memoryType: MemoryType = DRAM,
-      opencvPreprocessing: Boolean = false
+      opencvPreprocessing: Boolean = false,
+      dataStrategy: DataStrategy = PARTITIONED
     )
   : FeatureSet[MiniBatch[Float]] = {
     if (opencvPreprocessing) {
       logger.info("Using opencv preprocessing for training set")
       opencv(path, sc, imageSize, batchSize,
-        nodeNumber, coresPerNode, classNumber, memoryType)
+        nodeNumber, coresPerNode, classNumber, memoryType, dataStrategy)
     } else {
       val rawData = readFromSeqFiles(path, sc, classNumber)
         .setName("ImageNet2012 Training Set")
-      val featureSet = FeatureSet.rdd(rawData, memoryType = memoryType)
-      featureSet.transform(
-        MTLabeledBGRImgToBatch[ByteRecord](
-          width = imageSize,
-          height = imageSize,
-          batchSize = batchSize,
-          transformer = (BytesToBGRImg()
+      val featureSet = FeatureSet.rdd(rawData, memoryType = memoryType, dataStrategy)
+      featureSet.transform(BytesToBGRImg()
         -> BGRImgCropper(imageSize, imageSize)
         -> DatasetHFlip(0.5)
-        -> BGRImgNormalizer(0.485, 0.456, 0.406, 0.229, 0.224, 0.225))
-        ))
+        -> BGRImgNormalizer(0.485, 0.456, 0.406, 0.229, 0.224, 0.225)
+        -> BGRImgToSample()

Collaborator:
Why add toSample and toBatch here?

Contributor Author (@qiuxin2012, Jul 8, 2019):
Because the original RDD is persisted on disk, MTLabeledBGRImgToBatch throws an exception when counting the size of the dataset.

Contributor Author:
Reverted, as I won't count the whole RDD on disk.

+        -> SampleToMiniBatch(batchSize))
     }
   }
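A hedged sketch of a call site for the new parameters (illustrative; it assumes the parameter list above belongs to ImageNet2012's apply method, and that DISK_AND_DRAM(n) keeps one of n data slices in DRAM at a time — both inferred from names, not confirmed by this diff; trainPath is a hypothetical variable):

  val trainSet = ImageNet2012(
    path = trainPath,
    sc = sc,
    imageSize = 224,
    batchSize = batchSize,
    nodeNumber = nodeNumber,
    coresPerNode = coresPerNode,
    classNumber = 1000,
    memoryType = DISK_AND_DRAM(10),
    dataStrategy = PARTITIONED)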

@@ -123,11 +119,12 @@ object ImageNet2012 {
       nodeNumber: Int,
       coresPerNode: Int,
       classNumber: Int,
-      memoryType: MemoryType = DRAM): FeatureSet[MiniBatch[Float]] = {
+      memoryType: MemoryType = DRAM,
+      dataStrategy: DataStrategy = PARTITIONED): FeatureSet[MiniBatch[Float]] = {
     val rawData = readFromSeqFiles(path, sc, classNumber)
       .map(byteRecordToImageFeature(_))
       .setName("ImageNet2012 Training Set")
-    val featureSet = FeatureSet.rdd(rawData, memoryType = memoryType)
+    val featureSet = FeatureSet.rdd(rawData, memoryType = memoryType, dataStrategy)
     val transformer = ImagePixelBytesToMat() ->
       ImageRandomCrop(imageSize, imageSize) ->
       ImageChannelNormalize(0.485f, 0.456f, 0.406f, 0.229f, 0.224f, 0.225f) ->
@@ -17,11 +17,11 @@ package com.intel.analytics.zoo.examples.inception

 import com.intel.analytics.bigdl._
 import com.intel.analytics.bigdl.models.inception.Inception_v1_NoAuxClassifier
-import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Module}
+import com.intel.analytics.bigdl.nn._
 import com.intel.analytics.bigdl.optim.SGD.{Poly, SequentialSchedule, Warmup}
 import com.intel.analytics.bigdl.optim._
 import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter, T, Table}
-import com.intel.analytics.zoo.feature.pmem.MemoryType
+import com.intel.analytics.zoo.feature.pmem.{MemoryType, PARTITIONED}
 import com.intel.analytics.zoo.pipeline.api.keras.layers.utils.EngineRef
 import com.intel.analytics.zoo.pipeline.estimator.{ConstantClipping, Estimator, L2NormClipping}
 import org.apache.spark.SparkContext
@@ -111,6 +111,7 @@ object TrainInceptionV1 {
       checkPointTrigger = Some(checkpointTrigger),
       valSet, Array(new Top1Accuracy[Float], new Top5Accuracy[Float]))
 
+    estimator.close()
     sc.stop()
   })
 }
@@ -17,12 +17,13 @@
 package com.intel.analytics.zoo.examples.recommendation
 
 import com.intel.analytics.bigdl.dataset.{Sample, SampleToMiniBatch}
-import com.intel.analytics.bigdl.nn.{ClassNLLCriterion}
+import com.intel.analytics.bigdl.nn.ClassNLLCriterion
 import com.intel.analytics.bigdl.numeric.NumericFloat
 import com.intel.analytics.bigdl.optim._
 import com.intel.analytics.bigdl.utils.{RandomGenerator, T}
-import com.intel.analytics.zoo.common.NNContext
+import com.intel.analytics.zoo.common.{EveryEpoch, MaxEpoch, NNContext}
 import com.intel.analytics.zoo.feature.FeatureSet
+import com.intel.analytics.zoo.feature.pmem.DISK_AND_DRAM
 import com.intel.analytics.zoo.models.recommendation._
 import com.intel.analytics.zoo.pipeline.estimator.Estimator
 import org.apache.log4j.{Level, Logger}
@@ -131,7 +132,7 @@ object CensusWideAndDeep {
     }
 
     val sample2batch = SampleToMiniBatch(batchSize)
-    val trainRdds = FeatureSet.rdd(trainpairFeatureRdds.map(x => x.sample).cache()) ->
+    val trainRdds = FeatureSet.rdd(trainpairFeatureRdds.map(x => x.sample).cache(), DISK_AND_DRAM(2)) ->
       sample2batch
     val validationRdds = FeatureSet.rdd(validationpairFeatureRdds.map(x => x.sample).cache()) ->
       sample2batch
@@ -144,8 +145,8 @@
       Estimator[Float](wideAndDeep, optimMethods)
     }
 
-    val (checkpointTrigger, testTrigger, endTrigger) =
-      (Trigger.everyEpoch, Trigger.everyEpoch, Trigger.maxEpoch(maxEpoch))
+    val (checkpointTrigger, endTrigger) =
+      (EveryEpoch(), MaxEpoch(maxEpoch))
 
     estimator.train(trainRdds, ClassNLLCriterion[Float](),
       Some(endTrigger),
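
For context, a hedged reading of the changes in this file: DISK_AND_DRAM(2) presumably splits the cached training FeatureSet into 2 slices, keeping one slice in DRAM at a time (inferred from the name, not confirmed by this diff), and the ZooTrigger pair EveryEpoch()/MaxEpoch(maxEpoch) replaces BigDL's Trigger.everyEpoch/Trigger.maxEpoch so the Estimator can inject slice progress via setZooState before the triggers are evaluated. A minimal sketch (samples is a hypothetical RDD of Sample[Float]):

  // Assumed semantics: 2 slices, one resident in DRAM at a time.
  val trainRdds = FeatureSet.rdd(samples, DISK_AND_DRAM(2)) ->
    SampleToMiniBatch(batchSize)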