
Commit 0a79406

[SPARK-51773][SQL] Turn file formats into case classes to properly compare them
### What changes were proposed in this pull request?

Turn file formats into case classes to properly compare them.

### Why are the changes needed?

This is necessary to make `LogicalRelation`s comparable when comparing single-pass and fixed-point Analyzer results.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Existing tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #50562 from vladimirg-db/vladimir-golubev_data/add-hashcode-and-equals-to-file-formats.

Lead-authored-by: Vladimir Golubev <vladimir.golubev@databricks.com>
Co-authored-by: Wenchen Fan <cloud0fan@gmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent e00e189 commit 0a79406
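
The motivation comes down to standard Scala semantics: a plain `class` inherits reference equality, so two analyzer runs that each instantiate their own file format produce `LogicalRelation`s that never compare equal, while a `case class` gets compiler-generated `equals`/`hashCode`. A minimal standalone sketch of that difference (illustrative names only, not code from this patch):

```scala
object FileFormatEqualitySketch extends App {
  // A plain class falls back to reference equality: two separately constructed
  // instances are never equal, even though they carry no state.
  class PlainFormat
  assert(new PlainFormat() != new PlainFormat())

  // A zero-argument case class gets structural equals/hashCode from the compiler,
  // so any two instances compare equal and hash identically.
  case class CaseFormat()
  assert(CaseFormat() == CaseFormat())
  assert(CaseFormat().hashCode == CaseFormat().hashCode)
}
```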

File tree

11 files changed: +12 -28 lines


mllib/src/main/scala/org/apache/spark/ml/source/image/ImageFileFormat.scala

+1 -1

@@ -32,7 +32,7 @@ import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.SerializableConfiguration
 
-private[image] class ImageFileFormat extends FileFormat with DataSourceRegister {
+private[image] case class ImageFileFormat() extends FileFormat with DataSourceRegister {
 
   override def inferSchema(
       sparkSession: SparkSession,

mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala

+2 -2

@@ -67,9 +67,9 @@ private[libsvm] class LibSVMOutputWriter(
   }
 }
 
-/** @see [[LibSVMDataSource]] for public documentation. */
+// see `LibSVMDataSource` for public documentation.
 // If this is moved or renamed, please update DataSource's backwardCompatibilityMap.
-private[libsvm] class LibSVMFileFormat
+private[libsvm] case class LibSVMFileFormat()
   extends TextBasedFileFormat
   with DataSourceRegister
   with Logging {

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala

+1 -1

@@ -55,7 +55,7 @@ import org.apache.spark.util.SerializableConfiguration
  *   .load("/path/to/fileDir");
  * }}}
  */
-class BinaryFileFormat extends FileFormat with DataSourceRegister {
+case class BinaryFileFormat() extends FileFormat with DataSourceRegister {
 
   import BinaryFileFormat._
 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala

+1 -5

@@ -35,7 +35,7 @@ import org.apache.spark.util.SerializableConfiguration
 /**
  * Provides access to CSV data from pure SQL statements.
  */
-class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {
+case class CSVFileFormat() extends TextBasedFileFormat with DataSourceRegister {
 
   override def shortName(): String = "csv"
 
@@ -158,10 +158,6 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {
 
   override def toString: String = "CSV"
 
-  override def hashCode(): Int = getClass.hashCode()
-
-  override def equals(other: Any): Boolean = other.isInstanceOf[CSVFileFormat]
-
   /**
    * Allow reading variant from CSV, but don't allow writing variant into CSV. This is because the
    * written data (the string representation of variant) may not be read back as the same variant.
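
The two overrides deleted above (and the matching ones in `JsonFileFormat`, `XmlFileFormat`, and `OrcFileFormat` below) expressed by hand what the compiler now derives for a zero-parameter case class. A rough sketch of the equivalence, not the actual generated code:

```scala
// What the removed overrides did by hand: every instance equals every other
// instance of the same format, and all instances share one hash code.
class ManualEqualityFormat {
  override def hashCode(): Int = getClass.hashCode()
  override def equals(other: Any): Boolean = other.isInstanceOf[ManualEqualityFormat]
}

// A zero-parameter case class shows the same observable comparison behavior:
// all instances are equal to each other and hash to the same value.
case class DerivedEqualityFormat()
```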

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala

+1 -5

@@ -32,7 +32,7 @@ import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
 import org.apache.spark.util.SerializableConfiguration
 
-class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister {
+case class JsonFileFormat() extends TextBasedFileFormat with DataSourceRegister {
   override val shortName: String = "json"
 
   override def isSplitable(
@@ -128,10 +128,6 @@ class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister {
 
   override def toString: String = "JSON"
 
-  override def hashCode(): Int = getClass.hashCode()
-
-  override def equals(other: Any): Boolean = other.isInstanceOf[JsonFileFormat]
-
   override def supportDataType(dataType: DataType): Boolean = dataType match {
     case _: VariantType => true

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala

+1 -1

@@ -37,7 +37,7 @@ import org.apache.spark.util.{SerializableConfiguration, Utils}
 /**
  * A data source for reading text files. The text files must be encoded as UTF-8.
  */
-class TextFileFormat extends TextBasedFileFormat with DataSourceRegister {
+case class TextFileFormat() extends TextBasedFileFormat with DataSourceRegister {
 
   override def shortName(): String = "text"
 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlFileFormat.scala

+1 -5

@@ -35,7 +35,7 @@ import org.apache.spark.util.SerializableConfiguration
 /**
  * Provides access to XML data from pure SQL statements.
  */
-class XmlFileFormat extends TextBasedFileFormat with DataSourceRegister {
+case class XmlFileFormat() extends TextBasedFileFormat with DataSourceRegister {
 
   override def shortName(): String = "xml"
 
@@ -132,10 +132,6 @@ class XmlFileFormat extends TextBasedFileFormat with DataSourceRegister {
 
   override def toString: String = "XML"
 
-  override def hashCode(): Int = getClass.hashCode()
-
-  override def equals(other: Any): Boolean = other.isInstanceOf[XmlFileFormat]
-
   override def supportDataType(dataType: DataType): Boolean = dataType match {
     case _: VariantType => true

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala

+1 -1

@@ -722,7 +722,7 @@ object LastArguments {
 }
 
 /** A test [[FileFormat]] that records the arguments passed to buildReader, and returns nothing. */
-class TestFileFormat extends TextBasedFileFormat {
+case class TestFileFormat() extends TextBasedFileFormat {
 
   override def toString: String = "TestFileFormat"
 

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala

+1 -1

@@ -48,7 +48,7 @@ import org.apache.spark.util.SerializableJobConf
  *
  * TODO: implement the read logic.
  */
-class HiveFileFormat(fileSinkConf: FileSinkDesc)
+case class HiveFileFormat(fileSinkConf: FileSinkDesc)
   extends FileFormat with DataSourceRegister with Logging {
 
   def this() = this(null)
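
Unlike the zero-argument formats, `HiveFileFormat` keeps its `fileSinkConf` constructor parameter, so the derived equality compares that field (and therefore depends on how `FileSinkDesc` itself implements `equals`). A small sketch with a hypothetical stand-in for the config type:

```scala
object FieldEqualitySketch extends App {
  // Hypothetical stand-in for FileSinkDesc, only to illustrate field-based comparison.
  case class SinkConfStub(outputDir: String)

  case class FormatWithConf(conf: SinkConfStub)

  // Instances compare equal when their fields do, not when references match.
  assert(FormatWithConf(SinkConfStub("/tmp/a")) == FormatWithConf(SinkConfStub("/tmp/a")))
  assert(FormatWithConf(SinkConfStub("/tmp/a")) != FormatWithConf(SinkConfStub("/tmp/b")))
}
```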

sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala

+1 -5

@@ -55,16 +55,12 @@ import org.apache.spark.util.SerializableConfiguration
  * `FileFormat` for reading ORC files. If this is moved or renamed, please update
  * `DataSource`'s backwardCompatibilityMap.
  */
-class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable {
+case class OrcFileFormat() extends FileFormat with DataSourceRegister with Serializable {
 
   override def shortName(): String = "orc"
 
   override def toString: String = "ORC"
 
-  override def hashCode(): Int = getClass.hashCode()
-
-  override def equals(other: Any): Boolean = other.isInstanceOf[OrcFileFormat]
-
   override def inferSchema(
       sparkSession: SparkSession,
       options: Map[String, String],

sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala

+1 -1

@@ -33,7 +33,7 @@ import org.apache.spark.util.ArrayImplicits._
 import org.apache.spark.util.SerializableConfiguration
 import org.apache.spark.util.Utils
 
-class SimpleTextSource extends TextBasedFileFormat with DataSourceRegister {
+case class SimpleTextSource() extends TextBasedFileFormat with DataSourceRegister {
   override def shortName(): String = "test"
 
   override def inferSchema(
