Skip to content

Commit

Permalink
Add shims to compare ORC TypeDescription (#4786)
Browse files Browse the repository at this point in the history
Starting with Spark 3.2.x, ORC checks attributes by default when comparing
two TypeDescriptions. This causes some issues, because the Catalyst schema
does not include this kind of information.

Signed-off-by: Bobby Wang <wbo4958@gmail.com>
  • Loading branch information
wbo4958 authored Feb 15, 2022
1 parent 5132869 commit 8394272
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import scala.collection.mutable.ArrayBuffer
import com.nvidia.spark.rapids.OrcOutputStripe
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.common.io.DiskRangeList
import org.apache.orc.{CompressionCodec, CompressionKind, DataReader, OrcFile, OrcProto, PhysicalWriter, Reader, StripeInformation}
import org.apache.orc.{CompressionCodec, CompressionKind, DataReader, OrcFile, OrcProto, PhysicalWriter, Reader, StripeInformation, TypeDescription}
import org.apache.orc.impl.{DataReaderProperties, OutStream, SchemaEvolution}
import org.apache.orc.impl.RecordReaderImpl.SargApplier

Expand Down Expand Up @@ -84,4 +84,11 @@ trait OrcShims301until320Base {
}
result
}

/**
 * Tells whether two ORC type descriptions are equal, with attributes left out
 * of the comparison.
 *
 * This shim simply delegates to the single-argument `TypeDescription.equals`.
 * NOTE(review): presumably the ORC version behind this shim does not compare
 * attributes in that method - confirm against the ORC release in use.
 *
 * @param lhs the type description the comparison is invoked on
 * @param rhs the type description to compare against
 * @return the result of `lhs.equals(rhs)`
 */
def typeDescriptionEqual(lhs: TypeDescription, rhs: TypeDescription): Boolean =
  lhs equals rhs
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import com.nvidia.spark.rapids.OrcOutputStripe
import com.nvidia.spark.rapids.RapidsPluginImplicits._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.common.io.DiskRangeList
import org.apache.orc.{CompressionCodec, CompressionKind, DataReader, OrcConf, OrcFile, OrcProto, PhysicalWriter, Reader, StripeInformation}
import org.apache.orc.{CompressionCodec, CompressionKind, DataReader, OrcConf, OrcFile, OrcProto, PhysicalWriter, Reader, StripeInformation, TypeDescription}
import org.apache.orc.impl.{BufferChunk, BufferChunkList, DataReaderProperties, InStream, OrcCodecPool, OutStream, ReaderImpl, SchemaEvolution}
import org.apache.orc.impl.RecordReaderImpl.SargApplier
import org.apache.orc.impl.reader.StripePlanner
Expand Down Expand Up @@ -120,4 +120,11 @@ object OrcShims {
result

}

/**
 * Tells whether two ORC type descriptions are equal, with attributes left out
 * of the comparison.
 *
 * @param lhs the type description the comparison is invoked on
 * @param rhs the type description to compare against
 * @return the result of the two-argument `TypeDescription.equals` called with
 *         its boolean flag set to `false`
 */
def typeDescriptionEqual(lhs: TypeDescription, rhs: TypeDescription): Boolean = {
  // Name the flag so the call site shows why `false` is passed: the catalyst
  // side carries no attribute information, so attributes must not be compared.
  val checkAttributes = false
  lhs.equals(rhs, checkAttributes)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1224,7 +1224,7 @@ private case class GpuOrcFileFilterHandler(
newUnion
// Primitive types should be equal to each other.
case _ =>
if (fileSchema != readSchema) {
if (!OrcShims.typeDescriptionEqual(fileSchema, readSchema)) {
throw new QueryExecutionException("Incompatible schemas for ORC file" +
s" at ${partFile.filePath}\n" +
s" file schema: $fileSchema\n" +
Expand Down

0 comments on commit 8394272

Please sign in to comment.