fix: Respect coalesce factor again (#372)

tchow-zlai · thomaschow · web-flow · commit 9802bafa110e · 2025-02-12T21:13:05.000-08:00
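## Summary

The result of `df.coalesce(coalesceFactor * parallelism)` was being discarded, so
`spark.chronon.coalesce.factor` had no effect on the DataFrame that was returned.
`coalesce` is now applied to the returned DataFrame, after the optional conversion
of a `DATE`-typed partition column to a formatted string.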

## Checklist
- [ ] Added Unit Tests
- [ ] Covered by existing CI
- [ ] Integration tested
- [ ] Documentation update



<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **Bug Fixes**
  - Restored coalescing of the output DataFrame by the configured coalesce
    factor during table processing; date-based partition columns are still
    formatted as strings before the data is consolidated.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

<!-- av pr metadata
This information is embedded by the av CLI when creating PRs to track
the status of stacks when using Aviator. Please do not delete or edit
this section of the PR.
```
{"parent":"main","parentHead":"","trunk":"main"}
```
-->

---------

Co-authored-by: Thomas Chow <thomaschow369@gmail.com>
diff --git a/spark/src/main/scala/ai/chronon/spark/TableUtils.scala b/spark/src/main/scala/ai/chronon/spark/TableUtils.scala
@@ -768,15 +768,13 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
     val parallelism = sparkSession.sparkContext.getConf.getInt("spark.default.parallelism", 1000)
     val coalesceFactor = sparkSession.sparkContext.getConf.getInt("spark.chronon.coalesce.factor", 10)
 
-    df.coalesce(coalesceFactor * parallelism)
-
     // TODO: this is a temporary fix to handle the case where the partition column is a DATE type and not a string.
     //  This is the case for partitioned BigQuery native tables.
-    if (df.schema.fieldNames.contains(partitionColumn) && df.schema(partitionColumn).dataType == DateType) {
-      df.withColumn(partitionColumn, date_format(df.col(partitionColumn), partitionFormat))
-    } else {
-      df
-    }
+    (if (df.schema.fieldNames.contains(partitionColumn) && df.schema(partitionColumn).dataType == DateType) {
+       df.withColumn(partitionColumn, date_format(df.col(partitionColumn), partitionFormat))
+     } else {
+       df
+     }).coalesce(coalesceFactor * parallelism)
   }
 
   def whereClauses(partitionRange: PartitionRange, partitionColumn: String = partitionColumn): Seq[String] = {