
Commit 0a79406

[SPARK-51773][SQL] Turn file formats into case classes to properly compare them
### What changes were proposed in this pull request?

Turn file formats into case classes to properly compare them.

### Why are the changes needed?

This is necessary to make `LogicalRelation`s comparable when comparing single-pass and fixed-point Analyzer results.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Existing tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #50562 from vladimirg-db/vladimir-golubev_data/add-hashcode-and-equals-to-file-formats.

Lead-authored-by: Vladimir Golubev <vladimir.golubev@databricks.com>
Co-authored-by: Wenchen Fan <cloud0fan@gmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent e00e189 commit 0a79406
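
The motivation comes down to standard Scala semantics: a plain `class` inherits reference equality, so two analyzer runs that each instantiate their own file format produce `LogicalRelation`s that never compare equal, while a `case class` gets compiler-generated `equals`/`hashCode`. A minimal standalone sketch of that difference (illustrative names only, not code from this patch):

```scala
object FileFormatEqualitySketch extends App {
  // A plain class falls back to reference equality: two separately constructed
  // instances are never equal, even though they carry no state.
  class PlainFormat
  assert(new PlainFormat() != new PlainFormat())

  // A zero-argument case class gets structural equals/hashCode from the compiler,
  // so any two instances compare equal and hash identically.
  case class CaseFormat()
  assert(CaseFormat() == CaseFormat())
  assert(CaseFormat().hashCode == CaseFormat().hashCode)
}
```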

File tree

11 files changed: +12 -28 lines


mllib/src/main/scala/org/apache/spark/ml/source/image/ImageFileFormat.scala

+1 -1

@@ -32,7 +32,7 @@ import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.SerializableConfiguration
 
-private[image] class ImageFileFormat extends FileFormat with DataSourceRegister {
+private[image] case class ImageFileFormat() extends FileFormat with DataSourceRegister {
 
   override def inferSchema(
       sparkSession: SparkSession,

mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala

+2 -2

@@ -67,9 +67,9 @@ private[libsvm] class LibSVMOutputWriter(
   }
 }
 
-/** @see [[LibSVMDataSource]] for public documentation. */
+// see `LibSVMDataSource` for public documentation.
 // If this is moved or renamed, please update DataSource's backwardCompatibilityMap.
-private[libsvm] class LibSVMFileFormat
+private[libsvm] case class LibSVMFileFormat()
   extends TextBasedFileFormat
   with DataSourceRegister
   with Logging {

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala

+1 -1

@@ -55,7 +55,7 @@ import org.apache.spark.util.SerializableConfiguration
  *   .load("/path/to/fileDir");
  * }}}
  */
-class BinaryFileFormat extends FileFormat with DataSourceRegister {
+case class BinaryFileFormat() extends FileFormat with DataSourceRegister {
 
   import BinaryFileFormat._
 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala

+1 -5

@@ -35,7 +35,7 @@ import org.apache.spark.util.SerializableConfiguration
 /**
  * Provides access to CSV data from pure SQL statements.
  */
-class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {
+case class CSVFileFormat() extends TextBasedFileFormat with DataSourceRegister {
 
   override def shortName(): String = "csv"
 
@@ -158,10 +158,6 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {
 
   override def toString: String = "CSV"
 
-  override def hashCode(): Int = getClass.hashCode()
-
-  override def equals(other: Any): Boolean = other.isInstanceOf[CSVFileFormat]
-
   /**
    * Allow reading variant from CSV, but don't allow writing variant into CSV. This is because the
    * written data (the string representation of variant) may not be read back as the same variant.
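
The two overrides deleted above (and the matching ones in `JsonFileFormat`, `XmlFileFormat`, and `OrcFileFormat` below) expressed by hand what the compiler now derives for a zero-parameter case class. A rough sketch of the equivalence, not the actual generated code:

```scala
// What the removed overrides did by hand: every instance equals every other
// instance of the same format, and all instances share one hash code.
class ManualEqualityFormat {
  override def hashCode(): Int = getClass.hashCode()
  override def equals(other: Any): Boolean = other.isInstanceOf[ManualEqualityFormat]
}

// A zero-parameter case class shows the same observable comparison behavior:
// all instances are equal to each other and hash to the same value.
case class DerivedEqualityFormat()
```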

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala

+1 -5

@@ -32,7 +32,7 @@ import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
 import org.apache.spark.util.SerializableConfiguration
 
-class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister {
+case class JsonFileFormat() extends TextBasedFileFormat with DataSourceRegister {
   override val shortName: String = "json"
 
   override def isSplitable(
@@ -128,10 +128,6 @@ class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister {
 
   override def toString: String = "JSON"
 
-  override def hashCode(): Int = getClass.hashCode()
-
-  override def equals(other: Any): Boolean = other.isInstanceOf[JsonFileFormat]
-
   override def supportDataType(dataType: DataType): Boolean = dataType match {
     case _: VariantType => true

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala

+1 -1

@@ -37,7 +37,7 @@ import org.apache.spark.util.{SerializableConfiguration, Utils}
 /**
  * A data source for reading text files. The text files must be encoded as UTF-8.
  */
-class TextFileFormat extends TextBasedFileFormat with DataSourceRegister {
+case class TextFileFormat() extends TextBasedFileFormat with DataSourceRegister {
 
   override def shortName(): String = "text"
 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlFileFormat.scala

+1 -5

@@ -35,7 +35,7 @@ import org.apache.spark.util.SerializableConfiguration
 /**
  * Provides access to XML data from pure SQL statements.
  */
-class XmlFileFormat extends TextBasedFileFormat with DataSourceRegister {
+case class XmlFileFormat() extends TextBasedFileFormat with DataSourceRegister {
 
   override def shortName(): String = "xml"
 
@@ -132,10 +132,6 @@ class XmlFileFormat extends TextBasedFileFormat with DataSourceRegister {
 
   override def toString: String = "XML"
 
-  override def hashCode(): Int = getClass.hashCode()
-
-  override def equals(other: Any): Boolean = other.isInstanceOf[XmlFileFormat]
-
   override def supportDataType(dataType: DataType): Boolean = dataType match {
     case _: VariantType => true

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala

+1 -1

@@ -722,7 +722,7 @@ object LastArguments {
 }
 
 /** A test [[FileFormat]] that records the arguments passed to buildReader, and returns nothing. */
-class TestFileFormat extends TextBasedFileFormat {
+case class TestFileFormat() extends TextBasedFileFormat {
 
   override def toString: String = "TestFileFormat"
 

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala

+1 -1

@@ -48,7 +48,7 @@ import org.apache.spark.util.SerializableJobConf
  *
  * TODO: implement the read logic.
  */
-class HiveFileFormat(fileSinkConf: FileSinkDesc)
+case class HiveFileFormat(fileSinkConf: FileSinkDesc)
   extends FileFormat with DataSourceRegister with Logging {
 
   def this() = this(null)
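
Unlike the zero-argument formats, `HiveFileFormat` keeps its `fileSinkConf` constructor parameter, so the derived equality compares that field (and therefore depends on how `FileSinkDesc` itself implements `equals`). A small sketch with a hypothetical stand-in for the config type:

```scala
object FieldEqualitySketch extends App {
  // Hypothetical stand-in for FileSinkDesc, only to illustrate field-based comparison.
  case class SinkConfStub(outputDir: String)

  case class FormatWithConf(conf: SinkConfStub)

  // Instances compare equal when their fields do, not when references match.
  assert(FormatWithConf(SinkConfStub("/tmp/a")) == FormatWithConf(SinkConfStub("/tmp/a")))
  assert(FormatWithConf(SinkConfStub("/tmp/a")) != FormatWithConf(SinkConfStub("/tmp/b")))
}
```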

sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala

+1 -5

@@ -55,16 +55,12 @@ import org.apache.spark.util.SerializableConfiguration
  * `FileFormat` for reading ORC files. If this is moved or renamed, please update
  * `DataSource`'s backwardCompatibilityMap.
  */
-class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable {
+case class OrcFileFormat() extends FileFormat with DataSourceRegister with Serializable {
 
   override def shortName(): String = "orc"
 
   override def toString: String = "ORC"
 
-  override def hashCode(): Int = getClass.hashCode()
-
-  override def equals(other: Any): Boolean = other.isInstanceOf[OrcFileFormat]
-
   override def inferSchema(
       sparkSession: SparkSession,
       options: Map[String, String],

sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala

+1 -1

@@ -33,7 +33,7 @@ import org.apache.spark.util.ArrayImplicits._
 import org.apache.spark.util.SerializableConfiguration
 import org.apache.spark.util.Utils
 
-class SimpleTextSource extends TextBasedFileFormat with DataSourceRegister {
+case class SimpleTextSource() extends TextBasedFileFormat with DataSourceRegister {
   override def shortName(): String = "test"
 
   override def inferSchema(
