
Commit 25ff3cf

chore: remove writeFormat as it is no longer used (#491)
## Summary
- Remove `writeFormat`. We now just create the table with the proper provider, and write into it as specified by the table.

## Checklist
- [ ] Added Unit Tests
- [ ] Covered by existing CI
- [ ] Integration tested
- [ ] Documentation update

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

- **Refactor**
  - Removed legacy support for custom data write formatting and configuration.
  - Eliminated extended data pointer handling for saving and loading operations.
  - Streamlined table write operations by discarding redundant prefix and format determination logic.
  - Enhanced input validation for table types in the table creation process.
  - Simplified `BigQueryFormat` and `GCSFormat` by removing unsupported methods.
  - Removed unnecessary methods from `DeltaLake`, `Hive`, and `Iceberg` format implementations.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

<!-- av pr metadata
This information is embedded by the av CLI when creating PRs to track the status of stacks when using Aviator. Please do not delete or edit this section of the PR.
```
{"parent":"main","parentHead":"","trunk":"main"}
```
-->

---------

Co-authored-by: Thomas Chow <[email protected]>
1 parent 24b70e9 commit 25ff3cf
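
To make the intent of the summary concrete, here is a rough sketch, not code from this commit, of the described flow: the table is created up front via `CreationUtils.createTableSql` with an explicit, validated table type, and the write is then a plain insert that inherits whatever format the table was created with. The object name, table name, partition column `ds`, and the `iceberg` table type are illustrative assumptions.

```scala
// Sketch only: "create the table with the proper provider, then write into it as
// specified by the table". Names and the chosen table type are assumptions.
import ai.chronon.spark.format.CreationUtils
import org.apache.spark.sql.{DataFrame, SparkSession}

object WriteWithoutWriteFormat {

  def createAndInsert(spark: SparkSession, df: DataFrame, tableName: String): Unit = {
    // 1) Build the CREATE TABLE DDL once, with a validated table type (see CreationUtils diff below).
    val ddl = CreationUtils.createTableSql(
      tableName = tableName,
      schema = df.schema,
      partitionColumns = List("ds"),
      tableProperties = Map.empty,
      fileFormatString = "",
      tableTypeString = "iceberg"
    )
    spark.sql(ddl)

    // 2) Write into the table; no writeFormat lookup is needed anymore, since the
    //    table's own definition decides how the data is stored.
    df.write.insertInto(tableName)
  }
}
```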

File tree

13 files changed (+10, -251 lines)


cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/BigQueryFormat.scala

Lines changed: 1 addition & 13 deletions
```diff
@@ -5,7 +5,7 @@ import ai.chronon.spark.format.Format
 import com.google.cloud.bigquery.BigQuery
 import com.google.cloud.bigquery.connector.common.BigQueryUtil
 import com.google.cloud.spark.bigquery.v2.Spark35BigQueryTableProvider
-import org.apache.spark.sql.{DataFrame, SparkSession}
+import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.functions.{col, date_format, to_date}

 case class BigQueryFormat(project: String, bqClient: BigQuery, override val options: Map[String, String])
@@ -22,13 +22,6 @@ case class BigQueryFormat(project: String, bqClient: BigQuery, override val opti
   override def primaryPartitions(tableName: String, partitionColumn: String, subPartitionsFilter: Map[String, String])(
       implicit sparkSession: SparkSession): List[String] =
     super.primaryPartitions(tableName, partitionColumn, subPartitionsFilter)
-  override def generateTableBuilder(df: DataFrame,
-                                    tableName: String,
-                                    partitionColumns: List[String],
-                                    tableProperties: Map[String, String],
-                                    fileFormat: String): (String => Unit) => Unit = {
-    throw new UnsupportedOperationException("generateTableBuilder not supported for BigQuery")
-  }

   override def partitions(tableName: String)(implicit sparkSession: SparkSession): List[Map[String, String]] = {
     import sparkSession.implicits._
@@ -99,10 +92,5 @@ case class BigQueryFormat(project: String, bqClient: BigQuery, override val opti

   }

-  def createTableTypeString: String =
-    throw new UnsupportedOperationException("createTableTypeString not yet supported for BigQuery")
-  def fileFormatString(format: String): String =
-    throw new UnsupportedOperationException("fileFormatString not yet supported for BigQuery")
-
   override def supportSubPartitionsFilter: Boolean = true
 }
```

cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/GCSFormat.scala

Lines changed: 1 addition & 13 deletions
```diff
@@ -5,7 +5,7 @@ import com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigq
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.execution.FileSourceScanExec
 import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
-import org.apache.spark.sql.{DataFrame, Encoders, Row, SparkSession}
+import org.apache.spark.sql.{Encoders, Row, SparkSession}
 import org.slf4j.LoggerFactory

 case class GCS(sourceUri: String, fileFormat: String) extends Format {
@@ -78,18 +78,6 @@ case class GCS(sourceUri: String, fileFormat: String) extends Format {
       .toList
   }

-  override def generateTableBuilder(df: DataFrame,
-                                    tableName: String,
-                                    partitionColumns: List[String],
-                                    tableProperties: Map[String, String],
-                                    fileFormat: String): (String => Unit) => Unit = {
-    throw new UnsupportedOperationException("generateTableBuilder not supported for GCS")
-  }
-
-  def createTableTypeString: String = throw new UnsupportedOperationException("GCS does not support create table")
-
-  def fileFormatString(format: String): String = ""
-
   override def supportSubPartitionsFilter: Boolean = true

 }
```

cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/GcpFormatProvider.scala

Lines changed: 0 additions & 8 deletions
```diff
@@ -36,14 +36,6 @@ case class GcpFormatProvider(sparkSession: SparkSession) extends FormatProvider

   override def readFormat(tableName: String): scala.Option[Format] = format(tableName)

-  override def writeFormat(table: String): Format = {
-    val writePrefix = TableUtils(sparkSession).writePrefix
-    require(writePrefix.nonEmpty, "Please set conf 'spark.chronon.table_write.prefix' pointing to a data bucket.")
-
-    val path = writePrefix.get + table.sanitize //split("/").map(_.sanitize).mkString("/")
-    GCS(path, "PARQUET")
-  }
-
   private[cloud_gcp] def getFormat(table: Table): Format = {
     table.getDefinition.asInstanceOf[TableDefinition] match {
       case definition: ExternalTableDefinition =>
```

spark/src/main/scala/ai/chronon/spark/CatalogAwareDataPointer.scala

Lines changed: 0 additions & 39 deletions
This file was deleted.

spark/src/main/scala/ai/chronon/spark/Extensions.scala

Lines changed: 0 additions & 83 deletions
```diff
@@ -297,89 +297,6 @@ object Extensions {
       result
     }
   }
-
-  implicit class DataPointerAwareDataFrameWriter[T](dfw: DataFrameWriter[T]) {
-
-    def save(dataPointer: DataPointer): Unit = {
-
-      val optionDfw = dfw.options(dataPointer.writeOptions)
-      dataPointer.writeFormat
-        .map((wf) => {
-          val normalized = wf.toLowerCase
-          normalized match {
-            case "bigquery" | "bq" =>
-              optionDfw
-                .format("bigquery")
-                .save(dataPointer.tableOrPath)
-            case "snowflake" | "sf" =>
-              optionDfw
-                .format("net.snowflake.spark.snowflake")
-                .option("dbtable", dataPointer.tableOrPath)
-                .save()
-            case "parquet" | "csv" =>
-              optionDfw
-                .format(normalized)
-                .save(dataPointer.tableOrPath)
-            case "hive" | "delta" | "iceberg" =>
-              optionDfw
-                .format(normalized)
-                .partitionBy(null: _*)
-                .insertInto(dataPointer.tableOrPath)
-            case _ =>
-              throw new UnsupportedOperationException(s"Unsupported write catalog: ${normalized}")
-          }
-        })
-        .getOrElse(
-          // None case is just table against default catalog
-          optionDfw
-            .format("hive")
-            .partitionBy(null: _*)
-            .insertInto(dataPointer.tableOrPath))
-    }
-  }
-
-  implicit class DataPointerAwareDataFrameReader(dfr: DataFrameReader) {
-
-    def load(dataPointer: DataPointer): DataFrame = {
-      val tableOrPath = dataPointer.tableOrPath
-
-      val optionDfr = dfr.options(dataPointer.readOptions)
-
-      dataPointer.readFormat
-        .map { fmt =>
-          val fmtLower = fmt.toLowerCase
-
-          fmtLower match {
-
-            case "bigquery" | "bq" =>
-              optionDfr
-                .format("bigquery")
-                .load(tableOrPath)
-
-            case "snowflake" | "sf" =>
-              optionDfr
-                .format("net.snowflake.spark.snowflake")
-                .option("dbtable", tableOrPath)
-                .load()
-
-            case "parquet" | "csv" =>
-              optionDfr
-                .format(fmt)
-                .load(tableOrPath)
-
-            case "hive" | "delta" | "iceberg" => optionDfr.table(tableOrPath)
-
-            case _ =>
-              throw new UnsupportedOperationException(s"Unsupported read catalog: $fmtLower")
-
-          }
-        }
-        .getOrElse {
-          // None case is just table against default catalog
-          optionDfr.table(tableOrPath)
-        }
-    }
-  }
   implicit class SourceSparkOps(source: api.Source) {

     def partitionColumn(implicit tableUtils: TableUtils): String = {
```

spark/src/main/scala/ai/chronon/spark/TableUtils.scala

Lines changed: 0 additions & 14 deletions
```diff
@@ -76,20 +76,6 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable

   private val tableWriteFormat = sparkSession.conf.get("spark.chronon.table_write.format", "").toLowerCase

-  val writePrefix: Option[String] = {
-
-    val barePrefix = sparkSession.conf.get("spark.chronon.table_write.prefix", "")
-
-    if (barePrefix.isEmpty || barePrefix.toUpperCase() == "NONE") {
-      None
-    } else if (barePrefix.endsWith("/")) {
-      Some(barePrefix)
-    } else {
-      Some(barePrefix + "/")
-    }
-
-  }
-
   // transient because the format provider is not always serializable.
   // for example, BigQueryImpl during reflecting with bq flavor
   @transient private implicit lazy val tableFormatProvider: FormatProvider = FormatProvider.from(sparkSession)
```

spark/src/main/scala/ai/chronon/spark/format/CreationUtils.scala

Lines changed: 7 additions & 0 deletions
```diff
@@ -4,13 +4,20 @@ import org.apache.spark.sql.types.StructType

 object CreationUtils {

+  private val ALLOWED_TABLE_TYPES = List("iceberg", "delta", "hive", "parquet", "hudi")
+
   def createTableSql(tableName: String,
                      schema: StructType,
                      partitionColumns: List[String],
                      tableProperties: Map[String, String],
                      fileFormatString: String,
                      tableTypeString: String): String = {

+    require(
+      tableTypeString.isEmpty || ALLOWED_TABLE_TYPES.contains(tableTypeString.toLowerCase),
+      s"Invalid table type: ${tableTypeString}. Must be empty OR one of: ${ALLOWED_TABLE_TYPES}"
+    )
+
     val noPartitions = StructType(
       schema
         .filterNot(field => partitionColumns.contains(field.name)))
```
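
A small usage sketch of the new check, not part of the diff: a table type outside `ALLOWED_TABLE_TYPES` now fails fast inside `createTableSql`. The schema, table name, and property values below are made up for the example.

```scala
// Illustrative only: exercises the new require() in CreationUtils.createTableSql.
import scala.util.Try

import ai.chronon.spark.format.CreationUtils
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object TableTypeValidationExample {

  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(StructField("id", StringType), StructField("ds", StringType)))

    // "bigquery" is not in ALLOWED_TABLE_TYPES, so this fails fast with an IllegalArgumentException.
    val attempt = Try(
      CreationUtils.createTableSql("db.example_table", schema, List("ds"), Map.empty, "", "bigquery"))

    println(attempt.isFailure) // true
    // An empty table type, or one of iceberg/delta/hive/parquet/hudi, passes the check.
  }
}
```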

spark/src/main/scala/ai/chronon/spark/format/DefaultFormatProvider.scala

Lines changed: 0 additions & 27 deletions
```diff
@@ -50,31 +50,4 @@ case class DefaultFormatProvider(sparkSession: SparkSession) extends FormatProvi
       false
     }
   }
-
-  // Return the write format to use for the given table. The logic at a high level is:
-  // 1) If the user specifies the spark.chronon.table_write.iceberg - we go with Iceberg
-  // 2) If the user specifies a spark.chronon.table_write.format as Hive (parquet), Iceberg or Delta we go with their choice
-  // 3) Default to Hive (parquet)
-  // Note the table_write.iceberg is supported for legacy reasons. Specifying "iceberg" in spark.chronon.table_write.format
-  // is preferred as the latter conf also allows us to support additional formats
-  override def writeFormat(tableName: String): Format = {
-    val useIceberg: Boolean = sparkSession.conf.get("spark.chronon.table_write.iceberg", "false").toBoolean
-
-    // Default provider just looks for any default config.
-    // Unlike read table, these write tables might not already exist.
-    val maybeFormat = sparkSession.conf.getOption("spark.chronon.table_write.format").map(_.toLowerCase) match {
-      case Some("hive") => Some(Hive)
-      case Some("iceberg") => Some(Iceberg)
-      case Some("delta") => Some(DeltaLake)
-      case _ => None
-    }
-    (useIceberg, maybeFormat) match {
-      // if explicitly configured Iceberg - we go with that setting
-      case (true, _) => Iceberg
-      // else if there is a write format we pick that
-      case (false, Some(format)) => format
-      // fallback to hive (parquet)
-      case (false, None) => Hive
-    }
-  }
 }
```

spark/src/main/scala/ai/chronon/spark/format/DeltaLake.scala

Lines changed: 0 additions & 4 deletions
```diff
@@ -32,9 +32,5 @@ case object DeltaLake extends Format {

   }

-  def createTableTypeString: String = "DELTA"
-
-  def fileFormatString(format: String): String = ""
-
   override def supportSubPartitionsFilter: Boolean = true
 }
```

spark/src/main/scala/ai/chronon/spark/format/Format.scala

Lines changed: 1 addition & 40 deletions
```diff
@@ -1,11 +1,8 @@
 package ai.chronon.spark.format

 import ai.chronon.spark.format.CreationUtils.alterTablePropertiesSql
-import ai.chronon.spark.format.CreationUtils.createTableSql
-import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.SparkSession
-import org.slf4j.Logger
-import org.slf4j.LoggerFactory
+import org.slf4j.{Logger, LoggerFactory}

 trait Format {
   @transient private lazy val logger: Logger = LoggerFactory.getLogger(getClass)
@@ -48,33 +45,6 @@ trait Format {
   // )
   def partitions(tableName: String)(implicit sparkSession: SparkSession): List[Map[String, String]]

-  def generateTableBuilder(df: DataFrame,
-                           tableName: String,
-                           partitionColumns: List[String],
-                           tableProperties: Map[String, String],
-                           fileFormat: String): (String => Unit) => Unit = {
-
-    def inner(df: DataFrame,
-              tableName: String,
-              partitionColumns: List[String],
-              tableProperties: Map[String, String],
-              fileFormat: String)(sqlEvaluator: String => Unit): Unit = {
-      val creationSql =
-        createTableSql(tableName,
-                       df.schema,
-                       partitionColumns,
-                       tableProperties,
-                       fileFormatString(fileFormat),
-                       createTableTypeString)
-
-      sqlEvaluator(creationSql)
-      ()
-    }
-
-    inner(df, tableName, partitionColumns, tableProperties, fileFormat)
-
-  }
-
   def alterTableProperties(tableName: String, tableProperties: Map[String, String]): (String => Unit) => Unit = {

     def inner(tableName: String, tableProperties: Map[String, String])(sqlEvaluator: String => Unit) = {
@@ -89,16 +59,7 @@ trait Format {

   }

-  // Help specify the appropriate table type to use in the Spark create table DDL query
-  def createTableTypeString: String
-
-  // Help specify the appropriate file format to use in the Spark create table DDL query
-  def fileFormatString(format: String): String
-
   // Does this format support sub partitions filters
   def supportSubPartitionsFilter: Boolean

-  // TODO: remove this once all formats implement table creation
-  val canCreateTable: Boolean = false
-
 }
```

spark/src/main/scala/ai/chronon/spark/format/FormatProvider.scala

Lines changed: 0 additions & 2 deletions
```diff
@@ -16,8 +16,6 @@ trait FormatProvider extends Serializable {

   def readFormat(tableName: String): Option[Format]

-  def writeFormat(tableName: String): Format
-
   def resolveTableName(tableName: String) = tableName

 }
```
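
With `writeFormat` gone, the provider only resolves formats on the read side. A minimal sketch of how a caller might use what remains, based on the signatures visible in the diffs above; `spark`, the object name, and the method name `partitionsOf` are placeholders.

```scala
// Sketch only: resolve a table's format via the provider and list its partitions.
import ai.chronon.spark.format.{Format, FormatProvider}
import org.apache.spark.sql.SparkSession

object ReadFormatExample {

  def partitionsOf(spark: SparkSession, tableName: String): List[Map[String, String]] = {
    implicit val session: SparkSession = spark
    val provider: FormatProvider = FormatProvider.from(spark) // factory seen in the TableUtils diff

    provider
      .readFormat(tableName)          // Option[Format]; None if the table cannot be resolved
      .map(_.partitions(tableName))   // Format.partitions, as declared in the Format trait
      .getOrElse(List.empty)
  }
}
```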

spark/src/main/scala/ai/chronon/spark/format/Hive.scala

Lines changed: 0 additions & 4 deletions
```diff
@@ -32,9 +32,5 @@ case object Hive extends Format {
       .toList
   }

-  def createTableTypeString: String = ""
-
-  def fileFormatString(format: String): String = s"STORED AS $format"
-
   override def supportSubPartitionsFilter: Boolean = true
 }
```

spark/src/main/scala/ai/chronon/spark/format/Iceberg.scala

Lines changed: 0 additions & 4 deletions
```diff
@@ -52,9 +52,5 @@ case object Iceberg extends Format {
     }
   }

-  def createTableTypeString: String = "ICEBERG"
-
-  def fileFormatString(format: String): String = ""
-
   override def supportSubPartitionsFilter: Boolean = false
 }
```
