
Commit

Specify categorical variables in metadata (#120)
michaelweilsalesforce authored and tovbinm committed Sep 11, 2018
1 parent a6fce5e commit c7d19ac
Showing 36 changed files with 696 additions and 194 deletions.
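For context, a summary of the diff below (my wording, not text from the original commit): the change adds a small test helper trait, AttributeAsserts, and mixes it into the vectorizer test suites so that each test verifies whether the columns of the output vector are flagged as nominal (categorical) in the Spark ML attribute metadata. The sketch that follows is a minimal illustration of the Spark ML API the helper relies on: AttributeGroup.fromStructField exposes a per-column isNominal flag. Here transformed and vectorName are hypothetical stand-ins, not names from this diff.

import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.sql.DataFrame

// Read the per-column nominal (categorical) flags that the vectorizer stages
// are expected to write into the output schema metadata.
def nominalFlags(transformed: DataFrame, vectorName: String): Option[Seq[Boolean]] = {
  val field = transformed.schema(vectorName)                        // StructField of the vector column
  val attributes = AttributeGroup.fromStructField(field).attributes // Option[Array[Attribute]]
  attributes.map(_.map(_.isNominal).toSeq)                          // e.g. Some(Seq(true, false, ...))
}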
@@ -0,0 +1,49 @@
/*
* Copyright (c) 2017, Salesforce.com, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

package com.salesforce.op.stages.impl.feature

import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.sql.types.StructField
import org.scalatest.{Assertion, Matchers}

trait AttributeAsserts {
  self: Matchers =>

  /**
   * Assert whether the attributes of a vector field are nominal or not
   *
   * @param schema          struct field of the output vector, carrying the attribute group metadata
   * @param expectedNominal expected array of booleans: true if the attribute is nominal, false if not
   */
  final def assertNominal(schema: StructField, expectedNominal: Array[Boolean]): Assertion = {
    val attributes = AttributeGroup.fromStructField(schema).attributes
    attributes.map(_.map(_.isNominal).toSeq) shouldBe Some(expectedNominal.toSeq)
  }
}
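For reference, a hedged sketch of how this trait is used by the test suites in the rest of the diff (the spec, feature, and data names here are illustrative placeholders, not taken from the commit): a test mixes in AttributeAsserts, runs its workflow, and asserts the expected nominal flags against the schema of the output vector.

import com.salesforce.op.OpWorkflow
import com.salesforce.op.test.TestSparkContext
import com.salesforce.op.utils.spark.RichDataset._
import org.scalatest.FlatSpec

// Illustrative only: myVector and myData are hypothetical placeholders.
class MyVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts {
  "MyVectorizer" should "mark every output column as nominal" in {
    val result = new OpWorkflow().setResultFeatures(myVector).transform(myData)
    val vectors = result.collect(myVector)
    val schema = result.schema(myVector.name)
    // All columns of this (categorical) vectorizer's output are expected to be nominal.
    assertNominal(schema, Array.fill(vectors.head.value.size)(true))
  }
}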
@@ -41,7 +41,7 @@ import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class Base64VectorizerTest extends FlatSpec with TestSparkContext with Base64TestData {
class Base64VectorizerTest extends FlatSpec with TestSparkContext with Base64TestData with AttributeAsserts {

"Base64Vectorizer" should "vectorize random binary data" in {
val vec = randomBase64.vectorize(topK = 10, minSupport = 0, cleanText = true, trackNulls = false)
@@ -63,6 +63,8 @@ class Base64VectorizerTest extends FlatSpec with TestSparkContext with Base64Tes
def assertVectorizer(vec: FeatureLike[OPVector], expected: Seq[Text]): Unit = {
val result = new OpWorkflow().setResultFeatures(vec).transform(realData)
val vectors = result.collect(vec)
val schema = result.schema(vec.name)
assertNominal(schema, Array.fill(vectors.head.value.size)(true))

vectors.length shouldBe expected.length
// TODO add a more robust check
@@ -43,7 +43,8 @@ import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class BinaryMapVectorizerTest
extends OpEstimatorSpec[OPVector, SequenceModel[BinaryMap, OPVector], BinaryMapVectorizer[BinaryMap]] {
extends OpEstimatorSpec[OPVector, SequenceModel[BinaryMap, OPVector], BinaryMapVectorizer[BinaryMap]]
with AttributeAsserts {

val (inputData, m1, m2) = TestFeatureBuilder("m1", "m2",
Seq(
@@ -73,6 +74,7 @@ class BinaryMapVectorizerTest

transformed.collect(vector) shouldBe expectedResult
val field = transformed.schema(estimator.getOutputFeatureName)
assertNominal(field, Array.fill(expectedResult.head.value.size)(true))
OpVectorMetadata(field) shouldEqual expectedMeta
val vectorMetadata = estimator.getMetadata()
OpVectorMetadata(field.copy(metadata = vectorMetadata)) shouldEqual expectedMeta
@@ -100,6 +102,7 @@ class BinaryMapVectorizerTest

transformed.collect(vector) shouldBe expected
val field = transformed.schema(estimator.getOutputFeatureName)
assertNominal(field, Array.fill(expected.head.value.size)(true))
OpVectorMetadata(field) shouldEqual expectedMeta
val vectorMetadata = estimator.getMetadata()
OpVectorMetadata(field.copy(metadata = vectorMetadata)) shouldEqual expectedMeta
@@ -41,7 +41,7 @@ import org.scalatest.junit.JUnitRunner


@RunWith(classOf[JUnitRunner])
class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] {
class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer] with AttributeAsserts {

val (inputData, f1, f2) = TestFeatureBuilder(
Seq[(Binary, Binary)](
@@ -93,6 +93,8 @@ class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer]
f1 -> List(RootCol, IndCol(Some(TransmogrifierDefaults.NullString))),
f2 -> List(RootCol, IndCol(Some(TransmogrifierDefaults.NullString)))
)
val field = transformed.schema(vector.name)
assertNominal(field, Array.fill(expected.head.value.size)(true))
}

it should "transform the data correctly [trackNulls=true,fillValue=true]" in {
@@ -117,6 +119,8 @@ class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer]
f1 -> List(RootCol, IndCol(Some(TransmogrifierDefaults.NullString))),
f2 -> List(RootCol, IndCol(Some(TransmogrifierDefaults.NullString)))
)
val field = transformed.schema(vector.name)
assertNominal(field, Array.fill(expected.head.value.size)(true))
}

it should "transform the data correctly [trackNulls=false,fillValue=false]" in {
@@ -141,6 +145,8 @@ class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer]
f1 -> List(RootCol),
f2 -> List(RootCol)
)
val field = transformed.schema(vector.name)
assertNominal(field, Array.fill(expected.head.value.size)(true))
}

it should "transform the data correctly [trackNulls=false,fillValue=true]" in {
@@ -165,5 +171,7 @@ class BinaryVectorizerTest extends OpTransformerSpec[OPVector, BinaryVectorizer]
f1 -> List(RootCol),
f2 -> List(RootCol)
)
val field = transformed.schema(vector.name)
assertNominal(field, Array.fill(expected.head.value.size)(true))
}
}
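A note on the expected-nominal arrays used in the remaining test files (my reading of the diff, not commentary from the commit): vectorizers whose outputs are purely categorical, such as the binary vectorizers above and the pivoted day/month/hour encodings below, expect every column to be nominal; continuous outputs such as time deltas and unit-circle coordinates expect none to be nominal; and when trackNulls is enabled each continuous column is followed by a nominal null-indicator column, which yields the alternating false/true pattern. A tiny sketch of how those expectations are built, with nInputs as a hypothetical input-feature count:

val nInputs = 3

// Continuous columns only (e.g. time since first event): no column is nominal.
val continuousOnly: Array[Boolean] = Array.fill(nInputs)(false)

// Purely categorical output (e.g. one pivoted indicator per day of week): all nominal.
val allNominal: Array[Boolean] = Array.fill(nInputs * 7)(true)

// trackNulls = true: each continuous column is followed by a nominal null indicator.
val withNullIndicators: Array[Boolean] = Array.fill(nInputs)(Seq(false, true)).flatten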
@@ -37,21 +37,24 @@ import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder, TestOpVect
import com.salesforce.op.utils.date.DateTimeUtils
import com.salesforce.op.utils.spark.OpVectorMetadata
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.linalg.Vectors
import org.joda.time.{DateTime, DateTimeConstants}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner


@RunWith(classOf[JUnitRunner])
class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectorizer[DateList]] {
class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectorizer[DateList]] with AttributeAsserts {

// Sunday July 12th 1998 at 22:45
val defaultDate = new DateTime(1998, 7, 12, 22, 45, DateTimeUtils.DefaultTimeZone).getMillis
val now = TransmogrifierDefaults.ReferenceDate.minusMillis(1).getMillis // make date time be in the past

private def daysToMilliseconds(n: Int): Long = n * DateTimeConstants.MILLIS_PER_DAY

private def monthsToMilliseconds(n: Int): Long = n * 2628000000L

private def hoursToMilliseconds(n: Int): Long = n * DateTimeConstants.MILLIS_PER_HOUR

val (testData, clicks, opens, purchases) = TestFeatureBuilder("clicks", "opens", "purchases",
@@ -122,7 +125,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori
Vectors.dense(2.0, 1.0, -1.0).toOPVector
)

val fieldMetadata = transformed.schema(output.name).metadata
val schema = transformed.schema(output.name)
val fieldMetadata = schema.metadata
assertNominal(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size)(false))
testModelTimeSinceFirst.getMetadata() shouldEqual fieldMetadata
}

@@ -148,7 +153,10 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori
Vectors.dense(2.0, 0.0, 1.0, 0.0, -1.0, 0.0).toOPVector
)

val fieldMetadata = transformed.schema(output.name).metadata
val schema = transformed.schema(output.name)
val fieldMetadata = schema.metadata
assertNominal(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size)
(Seq(false, true)).flatten)
testModelTimeSinceFirst.getMetadata() shouldEqual fieldMetadata
}

@@ -174,7 +182,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori
Vectors.dense(-28.0, -29.0, -31.0).toOPVector
)

val fieldMetadata = transformed.schema(output.name).metadata
val schema = transformed.schema(output.name)
val fieldMetadata = schema.metadata
assertNominal(schema, Array.fill(testModelTimeSinceFirst.getInputFeatures().size)(false))
testModelTimeSinceFirst.getMetadata() shouldEqual fieldMetadata
}

@@ -196,7 +206,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori
Vectors.sparse(21, Array(), Array()).toOPVector
)

val fieldMetadata = transformed.schema(output.name).metadata
val schema = transformed.schema(output.name)
val fieldMetadata = schema.metadata
assertNominal(schema, Array.fill(testModelModeDay.getInputFeatures().size * 7)(true))
testModelModeDay.getMetadata() shouldEqual fieldMetadata

val daysOfWeek = List("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday").map(s =>
@@ -225,7 +237,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori
Vectors.sparse(24, Array(7, 15, 23), Array(1.0, 1.0, 1.0)).toOPVector
)

val fieldMetadata = transformed.schema(output.name).metadata
val schema = transformed.schema(output.name)
val fieldMetadata = schema.metadata
assertNominal(schema, Array.fill(testModelModeDay.getInputFeatures().size * 8)(true))
testModelModeDay.getMetadata() shouldEqual fieldMetadata

val daysOfWeek = List("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
@@ -253,7 +267,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori
Vectors.sparse(36, Array(), Array()).toOPVector
)

val fieldMetadata = transformed.schema(output.name).metadata
val schema = transformed.schema(output.name)
val fieldMetadata = schema.metadata
assertNominal(schema, Array.fill(testModelModeMonth.getInputFeatures().size * 12)(true))
testModelModeMonth.getMetadata() shouldEqual fieldMetadata

val months = List(
@@ -283,7 +299,9 @@ class DateListVectorizerTest extends OpTransformerSpec[OPVector, DateListVectori
Vectors.sparse(72, Array(), Array()).toOPVector
)

val fieldMetadata = transformed.schema(output.name).metadata
val schema = transformed.schema(output.name)
val fieldMetadata = schema.metadata
assertNominal(schema, Array.fill(testModelModeHour.getInputFeatures().size * 24)(true))
testModelModeHour.getMetadata() shouldEqual fieldMetadata

val hours = (0 until 24).map(i => IndCol(Some(s"$i:00"))).toList
@@ -44,7 +44,7 @@ import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class DateMapToUnitCircleVectorizerTest extends OpEstimatorSpec[OPVector, SequenceModel[DateMap, OPVector],
DateMapToUnitCircleVectorizer[DateMap]] {
DateMapToUnitCircleVectorizer[DateMap]] with AttributeAsserts {

val eps = 1E-4
val sampleDateTimes = Seq[JDateTime](
@@ -77,7 +77,9 @@ class DateMapToUnitCircleVectorizerTest extends OpEstimatorSpec[OPVector, Sequen
val output = f1.toUnitCircle(TimePeriod.HourOfDay)
val transformed = output.originStage.asInstanceOf[DateMapToUnitCircleVectorizer[DateMap]]
.fit(inputData).transform(inputData)
val field = transformed.schema(output.name)
val actual = transformed.collect(output)
assertNominal(field, Array.fill(actual.head.value.size)(false))
all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps
}

@@ -88,7 +90,9 @@ class DateMapToUnitCircleVectorizerTest extends OpEstimatorSpec[OPVector, Sequen
val output = f1DT.toUnitCircle(TimePeriod.HourOfDay)
val transformed = output.originStage.asInstanceOf[DateMapToUnitCircleVectorizer[DateMap]]
.fit(inputData).transform(inputData)
val field = transformed.schema(output.name)
val actual = transformed.collect(output)
assertNominal(field, Array.fill(actual.head.value.size)(false))
all (actual.zip(expectedResult).map(g => Vectors.sqdist(g._1.value, g._2.value))) should be < eps
}

@@ -44,7 +44,7 @@ import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class DateMapVectorizerTest extends FlatSpec with TestSparkContext {
class DateMapVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts {

// Sunday July 12th 1998 at 22:45
private val defaultDate = new JDateTime(1998, 7, 12, 22, 45, DateTimeUtils.DefaultTimeZone).getMillis
@@ -71,6 +71,8 @@ class DateMapVectorizerTest extends FlatSpec with TestSparkContext {
val meta = OpVectorMetadata(vector.name, transformed.schema(vector.name).metadata)
meta.columns.length shouldBe 3
meta.columns.map(_.grouping) should contain theSameElementsAs Array(Option("a"), Option("b"), Option("c"))
val field = transformed.schema(vector.name)
assertNominal(field, Array.fill(expected(moment).head.value.size)(false))

val vector2 = f1.vectorize(defaultValue = 0, referenceDate = moment, trackNulls = true,
circularDateReps = Seq())
@@ -80,6 +82,8 @@ class DateMapVectorizerTest extends FlatSpec with TestSparkContext {
val meta2 = OpVectorMetadata(vector2.name, transformed2.schema(vector2.name).metadata)
meta2.columns.length shouldBe 6
meta2.history.keys.size shouldBe 1
val field2 = transformed2.schema(vector2.name)
assertNominal(field2, Array.fill(expected(moment).head.value.size)(Seq(false, true)).flatten)

val vector3 = f1.vectorize(defaultValue = 0)
val transformed3 = new OpWorkflow().setResultFeatures(vector3).transform(ds)
@@ -88,6 +92,9 @@ class DateMapVectorizerTest extends FlatSpec with TestSparkContext {
val meta3 = OpVectorMetadata(vector3.name, transformed3.schema(vector3.name).metadata)
meta3.columns.length shouldBe 30
meta2.history.keys.size shouldBe 1
val field3 = transformed3.schema(vector3.name)
val expectedNominal = Array.fill(24)(false) ++ Array.fill(3)(Seq(false, true)).flatten.asInstanceOf[Array[Boolean]]
assertNominal(field3, expectedNominal)
}

private def expected(moment: JDateTime) = {
Expand Down
@@ -45,7 +45,7 @@ import org.scalatest.junit.JUnitRunner


@RunWith(classOf[JUnitRunner])
class DateTimeVectorizerTest extends FlatSpec with TestSparkContext {
class DateTimeVectorizerTest extends FlatSpec with TestSparkContext with AttributeAsserts {

// Sunday July 12th 1998 at 22:45
private val defaultDate = new JDateTime(1998, 7, 12, 22, 45, DateTimeUtils.DefaultTimeZone).getMillis
@@ -91,6 +91,8 @@ class DateTimeVectorizerTest extends FlatSpec with TestSparkContext {
val meta = OpVectorMetadata(vector.name, transformed.schema(vector.name).metadata)
meta.columns.length shouldBe 3
meta.history.keys.size shouldBe 3
val field = transformed.schema(vector.name)
assertNominal(field, Array.fill(expected(moment).head.value.size)(false))

val vector2 = f1.vectorize(
dateListPivot = TransmogrifierDefaults.DateListDefault,
@@ -105,6 +107,8 @@ class DateTimeVectorizerTest extends FlatSpec with TestSparkContext {
val meta2 = OpVectorMetadata(vector2.name, transformed2.schema(vector2.name).metadata)
meta2.columns.length shouldBe 6
meta2.history.keys.size shouldBe 3
val field2 = transformed2.schema(vector2.name)
assertNominal(field2, Array.fill(expected(moment).head.value.size)(Seq(false, true)).flatten)

val vector3 = f1.vectorize(
dateListPivot = TransmogrifierDefaults.DateListDefault,
@@ -117,6 +121,9 @@ class DateTimeVectorizerTest extends FlatSpec with TestSparkContext {
val meta3 = OpVectorMetadata(vector3.name, transformed3.schema(vector3.name).metadata)
meta3.columns.length shouldBe 30
meta3.history.keys.size shouldBe 6
val field3 = transformed3.schema(vector3.name)
val expectedNominal = Array.fill(24)(false) ++ Array.fill(3)(Seq(false, true)).flatten.asInstanceOf[Array[Boolean]]
assertNominal(field3, expectedNominal)
}

it should "vectorize dates correctly any time" in {