diff --git a/integration_tests/src/main/python/csv_test.py b/integration_tests/src/main/python/csv_test.py index 9101ef56760..49c16191333 100644 --- a/integration_tests/src/main/python/csv_test.py +++ b/integration_tests/src/main/python/csv_test.py @@ -231,7 +231,7 @@ def read_impl(spark): pytest.param('floats_invalid.csv', _double_schema, {'header': 'true'}), pytest.param('simple_float_values.csv', _float_schema, {'header': 'true'}), pytest.param('simple_float_values.csv', _double_schema, {'header': 'true'}), - pytest.param('simple_boolean_values.csv', _bool_schema, {'header': 'true'}, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/2071')), + pytest.param('simple_boolean_values.csv', _bool_schema, {'header': 'true'}), pytest.param('ints_with_whitespace.csv', _number_as_string_schema, {'header': 'true'}, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/2069')), pytest.param('ints_with_whitespace.csv', _byte_schema, {'header': 'true'}, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/130')) ], ids=idfn) diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index 563d713cc04..a349e3b3e67 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -38,6 +38,9 @@ 'spark.rapids.sql.format.json.enabled': 'true', 'spark.rapids.sql.format.json.read.enabled': 'true'} +_bool_schema = StructType([ + StructField('number', BooleanType())]) + _float_schema = StructType([ StructField('number', FloatType())]) @@ -170,6 +173,8 @@ def test_json_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_ena @approximate_float @pytest.mark.parametrize('filename', [ + 'boolean.json', + pytest.param('boolean_invalid.json', marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/4779')), 'nan_and_inf.json', pytest.param('nan_and_inf_edge_cases.json', marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/4646')), 'floats.json', @@ -177,7 +182,7 @@ def test_json_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_ena 'floats_invalid.json', pytest.param('floats_edge_cases.json', marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/4647')), ]) -@pytest.mark.parametrize('schema', [_float_schema, _double_schema]) +@pytest.mark.parametrize('schema', [_bool_schema, _float_schema, _double_schema]) @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql]) @pytest.mark.parametrize('allow_non_numeric_numbers', ["true", "false"]) @pytest.mark.parametrize('allow_numeric_leading_zeros', ["true"]) diff --git a/integration_tests/src/test/resources/boolean.json b/integration_tests/src/test/resources/boolean.json new file mode 100644 index 00000000000..fc49e251617 --- /dev/null +++ b/integration_tests/src/test/resources/boolean.json @@ -0,0 +1,11 @@ +{ "number": true } +{ "number": True } +{ "number": TRUE } +{ "number": false } +{ "number": False } +{ "number": FALSE } +{ "number": null } +{ "number": y } +{ "number": n } +{ "number": 0 } +{ "number": 1 } \ No newline at end of file diff --git a/integration_tests/src/test/resources/boolean_invalid.json b/integration_tests/src/test/resources/boolean_invalid.json new file mode 100644 index 00000000000..f3e017baf91 --- /dev/null +++ b/integration_tests/src/test/resources/boolean_invalid.json @@ -0,0 +1,9 @@ +{ "number": "true" } +{ "number": "false" } +{ "number": "null" } +{ "number": "" } +{ "number": "True" } +{ "number": "TRUE" } +{ "number": "False" } +{ "number": "FALSE" } +{ "number": "BAD" } \ No newline at end of file diff --git a/integration_tests/src/test/resources/simple_boolean_values.csv b/integration_tests/src/test/resources/simple_boolean_values.csv index 4b8ec5a4e16..d841b528c7d 100644 --- a/integration_tests/src/test/resources/simple_boolean_values.csv +++ b/integration_tests/src/test/resources/simple_boolean_values.csv @@ -7,3 +7,11 @@ False TRUE FALSE BAD +y +n +yes +no +1 +0 +t +f diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBatchScanExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBatchScanExec.scala index c5b2be30471..fe1cc939653 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBatchScanExec.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBatchScanExec.scala @@ -21,7 +21,7 @@ import java.nio.charset.StandardCharsets import scala.collection.JavaConverters._ import ai.rapids.cudf -import ai.rapids.cudf.{HostMemoryBuffer, Schema, Table} +import ai.rapids.cudf.{ColumnVector, DType, HostMemoryBuffer, Scalar, Schema, Table} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -423,4 +423,28 @@ class CSVPartitionReader( * @return the file format short name */ override def getFileFormatShortName: String = "CSV" + + /** + * CSV supports "true" and "false" (case-insensitive) as valid boolean values. + */ + override def castStringToBool(input: ColumnVector): ColumnVector = { + withResource(input.strip()) { stripped => + withResource(stripped.lower()) { lower => + withResource(Scalar.fromString("true")) { t => + withResource(Scalar.fromString("false")) { f => + withResource(lower.equalTo(t)) { isTrue => + withResource(lower.equalTo(f)) { isFalse => + withResource(isTrue.or(isFalse)) { isValidBool => + withResource(Scalar.fromNull(DType.BOOL8)) { nullBool => + isValidBool.ifElse(isTrue, nullBool) + } + } + } + } + } + } + } + } + } + } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTextBasedPartitionReader.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTextBasedPartitionReader.scala index 515db04fc11..4c522ee248d 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTextBasedPartitionReader.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTextBasedPartitionReader.scala @@ -166,11 +166,11 @@ abstract class GpuTextBasedPartitionReader( readDataSchema } - // read floating-point columns as strings in cuDF + // read boolean and floating-point columns as strings in cuDF val dataSchemaWithStrings = StructType(dataSchema.fields .map(f => { f.dataType match { - case DataTypes.FloatType | DataTypes.DoubleType => + case DataTypes.BooleanType | DataTypes.FloatType | DataTypes.DoubleType => f.copy(dataType = DataTypes.StringType) case _ => f @@ -188,7 +188,7 @@ abstract class GpuTextBasedPartitionReader( } maxDeviceMemory = max(GpuColumnVector.getTotalDeviceMemoryUsed(table), maxDeviceMemory) - // parse floating-point columns that were read as strings + // parse boolean and floating-point columns that were read as strings val castTable = withResource(table) { _ => val columns = new ListBuffer[ColumnVector]() // Table increases the ref counts on the columns so we have @@ -198,6 +198,8 @@ abstract class GpuTextBasedPartitionReader( val ansiEnabled = false for (i <- 0 until table.getNumberOfColumns) { val castColumn = dataSchema.fields(i).dataType match { + case DataTypes.BooleanType => + castStringToBool(table.getColumn(i)) case DataTypes.FloatType => GpuCast.castStringToFloats(table.getColumn(i), ansiEnabled, DType.FLOAT32) case DataTypes.DoubleType => @@ -218,6 +220,8 @@ abstract class GpuTextBasedPartitionReader( } } + def castStringToBool(input: ColumnVector): ColumnVector + /** * Read the host buffer to GPU table * @param dataBuffer host buffer to be read diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala index ed7edeba396..10ed3368371 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/catalyst/json/rapids/GpuJsonScan.scala @@ -21,7 +21,7 @@ import java.nio.charset.StandardCharsets import scala.collection.JavaConverters._ import ai.rapids.cudf -import ai.rapids.cudf.{HostMemoryBuffer, Schema, Table} +import ai.rapids.cudf.{ColumnVector, DType, HostMemoryBuffer, Scalar, Schema, Table} import com.nvidia.spark.rapids._ import org.apache.hadoop.conf.Configuration @@ -334,4 +334,24 @@ class JsonPartitionReader( Some(new Table(prunedColumnVectors: _*)) } } + + /** + * JSON only supports unquoted lower-case "true" and "false" as valid boolean values. + */ + override def castStringToBool(input: ColumnVector): ColumnVector = { + withResource(Scalar.fromString("true")) { t => + withResource(Scalar.fromString("false")) { f => + withResource(input.equalTo(t)) { isTrue => + withResource(input.equalTo(f)) { isFalse => + withResource(isTrue.or(isFalse)) { isValidBool => + withResource(Scalar.fromNull(DType.BOOL8)) { nullBool => + isValidBool.ifElse(isTrue, nullBool) + } + } + } + } + } + } + } + }