
Commit c188e7d

david-zlai, tchow-zlai, and thomaschow authored
Add flag to skip repartition before writing. (#239)
## Summary

## Checklist

- [ ] Added Unit Tests
- [ ] Covered by existing CI
- [ ] Integration tested
- [ ] Documentation update

## Summary by CodeRabbit

- **New Features**
  - Added a configurable repartitioning option for DataFrame writes.
  - Introduced a new configuration setting to control repartitioning behavior.
  - Enhanced the test suite with functionality to handle empty DataFrames.
- **Chores**
  - Improved code formatting and logging for the DataFrame writing process.

---------

Co-authored-by: tchow-zlai <[email protected]>
Co-authored-by: Thomas Chow <[email protected]>
1 parent: d60a669 · commit: c188e7d
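
The change is driven by a new Spark conf key, `spark.chronon.write.repartition`, which defaults to `true` (the previous repartition-then-write behavior). A minimal sketch of how a job might turn the repartition step off; only the conf key comes from the diff below, the session setup is illustrative:

```scala
import org.apache.spark.sql.SparkSession

// Sketch only: the conf key is from this commit; the session setup is illustrative.
val spark = SparkSession.builder()
  .appName("chronon-write-example")
  .getOrCreate()

// Defaults to "true" (repartition before writing); "false" skips the shuffle.
spark.conf.set("spark.chronon.write.repartition", "false")
```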

File tree

2 files changed: +32 −13 lines

spark/src/main/scala/ai/chronon/spark/TableUtils.scala

Lines changed: 15 additions & 13 deletions
```diff
@@ -457,15 +457,23 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
                 sortByCols: Seq[String] = Seq.empty): Unit = {
     wrapWithCache(s"repartition & write to $tableName", df) {
       logger.info("Repartitioning before writing...")
-      repartitionAndWriteInternal(df, tableName, saveMode, stats, sortByCols)
+      val dataPointer = DataPointer.from(tableName, sparkSession)
+      val repartitioned =
+        if (sparkSession.conf.get("spark.chronon.write.repartition", true.toString).toBoolean)
+          repartitionInternal(df, tableName, stats, sortByCols)
+        else df
+      repartitioned.write
+        .mode(saveMode)
+        .save(dataPointer)
+
+      logger.info(s"Finished writing to $tableName")
     }.get
   }
 
-  private def repartitionAndWriteInternal(df: DataFrame,
-                                          tableName: String,
-                                          saveMode: SaveMode,
-                                          stats: Option[DfStats],
-                                          sortByCols: Seq[String]): Unit = {
+  private def repartitionInternal(df: DataFrame,
+                                  tableName: String,
+                                  stats: Option[DfStats],
+                                  sortByCols: Seq[String]): DataFrame = {
 
     // get row count and table partition count statistics
 
@@ -483,7 +491,6 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
 
     // set to one if tablePartitionCount=0 to avoid division by zero
     val nonZeroTablePartitionCount = if (tablePartitionCount == 0) 1 else tablePartitionCount
-
     logger.info(s"$rowCount rows requested to be written into table $tableName")
     if (rowCount > 0) {
       val columnSizeEstimate = columnSizeEstimator(df.schema)
@@ -527,18 +534,13 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
         (Seq(partitionColumn, saltCol), Seq(partitionColumn) ++ sortByCols)
       } else { (Seq(saltCol), sortByCols) }
       logger.info(s"Sorting within partitions with cols: $partitionSortCols")
-      val dataPointer = DataPointer.from(tableName, sparkSession)
 
       saltedDf
         .repartition(shuffleParallelism, repartitionCols.map(saltedDf.col): _*)
         .drop(saltCol)
         .sortWithinPartitions(partitionSortCols.map(col): _*)
-        .write
-        .mode(saveMode)
-        .save(dataPointer)
-
-      logger.info(s"Finished writing to $tableName")
     }
+    df
   }
 
   def chunk(partitions: Set[String]): Seq[PartitionRange] = {
```
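
The shape of the refactor: the old `repartitionAndWriteInternal` both repartitioned and wrote, while the new `repartitionInternal` only returns a DataFrame and the caller owns a single write path. A standalone sketch of that pattern (the helper name `writeWithOptionalRepartition` is illustrative, not Chronon API):

```scala
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

// Illustrative sketch of the pattern above: decide whether to repartition,
// then funnel both branches through one shared write path.
def writeWithOptionalRepartition(spark: SparkSession,
                                 df: DataFrame,
                                 path: String,
                                 repartition: DataFrame => DataFrame): Unit = {
  val enabled = spark.conf.get("spark.chronon.write.repartition", "true").toBoolean
  val toWrite  = if (enabled) repartition(df) else df
  toWrite.write.mode(SaveMode.Overwrite).save(path)
}
```

Keeping the flag check in exactly one place means disabling repartitioning can never accidentally skip the write itself.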

spark/src/test/scala/ai/chronon/spark/test/TableUtilsTest.scala

Lines changed: 17 additions & 0 deletions
```diff
@@ -31,6 +31,8 @@ import org.scalatest.flatspec.AnyFlatSpec
 
 import scala.util.Try
 
+case class TestRecord(ds: String, id: String)
+
 class SimpleAddUDF extends UDF {
   def evaluate(value: Int): Int = {
     value + 20
@@ -482,6 +484,21 @@ class TableUtilsTest extends AnyFlatSpec {
     }
   }
 
+  it should "repartitioning an empty dataframe should work" in {
+    import spark.implicits._
+    val tableName = "db.test_empty_table"
+    tableUtils.createDatabase("db")
+
+    tableUtils.insertPartitions(spark.emptyDataset[TestRecord].toDF(), tableName)
+    val res = tableUtils.loadTable(tableName)
+    assertEquals(0, res.count)
+
+    tableUtils.insertPartitions(spark.createDataFrame(List(TestRecord("2025-01-01", "a"))), tableName)
+    val newRes = tableUtils.loadTable(tableName)
+
+    assertEquals(1, newRes.count)
+  }
+
   it should "create table" in {
     val tableName = "db.test_create_table"
     spark.sql("CREATE DATABASE IF NOT EXISTS db")
```
