Skip to content

Commit 44d0afd

Browse files
fix: use insertInto instead of saveAsTable to preserve original TableUtils behavior (#173)
## Summary - #157 introduced CatalogAwareDataPointer and also introduced a regression in how we write tables. We need to use `insertInto` for Hive-based tables, which have different write semantics from `saveAsTable`. ## Checklist - [ ] Added Unit Tests - [x] Covered by existing CI - [ ] Integration tested - [ ] Documentation update <!-- av pr metadata This information is embedded by the av CLI when creating PRs to track the status of stacks when using Aviator. Please do not delete or edit this section of the PR. ``` {"parent":"main","parentHead":"","trunk":"main"} ``` --> Co-authored-by: Thomas Chow <[email protected]>
1 parent 8ccd760 commit 44d0afd

File tree

2 files changed

+18
-15
lines changed

2 files changed

+18
-15
lines changed

spark/src/main/scala/ai/chronon/spark/Extensions.scala

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -325,7 +325,7 @@ object Extensions {
325325
case "hive" | "delta" | "iceberg" =>
326326
dfw
327327
.format(normalized)
328-
.saveAsTable(dataPointer.tableOrPath)
328+
.insertInto(dataPointer.tableOrPath)
329329
case _ =>
330330
throw new UnsupportedOperationException(s"Unsupported write catalog: ${normalized}")
331331
}
@@ -334,7 +334,7 @@ object Extensions {
334334
// None case is just table against default catalog
335335
dfw
336336
.format("hive")
337-
.saveAsTable(dataPointer.tableOrPath))
337+
.insertInto(dataPointer.tableOrPath))
338338
}
339339
}
340340

spark/src/test/scala/ai/chronon/spark/test/GroupByTest.scala

Lines changed: 16 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -170,7 +170,8 @@ class GroupByTest {
170170
val computed = resultDf.select("user", "ts", "listing_view_last30", "listing_view_count")
171171
computed.show()
172172

173-
val expected = eventDf.sqlContext.sql("""
173+
val expected = eventDf.sqlContext.sql(
174+
"""
174175
|SELECT
175176
| events_last_k.user as user,
176177
| queries_last_k.ts as ts,
@@ -351,9 +352,10 @@ class GroupByTest {
351352

352353
val columns = aggregationsMetadata.map(a => a.name -> a.columnType).toMap
353354
assertEquals(Map(
354-
"time_spent_ms" -> LongType,
355-
"price" -> DoubleType
356-
), columns)
355+
"time_spent_ms" -> LongType,
356+
"price" -> DoubleType
357+
),
358+
columns)
357359
}
358360

359361
// test that OrderByLimit and OrderByLimitTimed serialization works well with Spark's data type
@@ -423,8 +425,8 @@ class GroupByTest {
423425
tableUtils.createDatabase(namespace)
424426
DataFrameGen.events(spark, sourceSchema, count = 1000, partitions = 200).save(sourceTable)
425427
val source = Builders.Source.events(
426-
query =
427-
Builders.Query(selects = Builders.Selects("ts", "item", "time_spent_ms", "price"), startPartition = startPartition),
428+
query = Builders.Query(selects = Builders.Selects("ts", "item", "time_spent_ms", "price"),
429+
startPartition = startPartition),
428430
table = sourceTable
429431
)
430432
(source, endPartition)
@@ -560,7 +562,8 @@ class GroupByTest {
560562
val joinSource = TestUtils.getParentJoin(spark, namespace, "parent_join_table", "parent_gb")
561563
val query = Builders.Query(startPartition = today)
562564
val chainingGroupBy = TestUtils.getTestGBWithJoinSource(joinSource, query, namespace, "chaining_gb")
563-
val newGroupBy = GroupBy.replaceJoinSource(chainingGroupBy, PartitionRange(today, today), tableUtils, computeDependency = false)
565+
val newGroupBy =
566+
GroupBy.replaceJoinSource(chainingGroupBy, PartitionRange(today, today), tableUtils, computeDependency = false)
564567

565568
assertEquals(joinSource.metaData.outputTable, newGroupBy.sources.get(0).table)
566569
assertEquals(joinSource.left.topic + Constants.TopicInvalidSuffix, newGroupBy.sources.get(0).topic)
@@ -656,13 +659,13 @@ class GroupByTest {
656659
new Window(15, TimeUnit.DAYS),
657660
new Window(60, TimeUnit.DAYS)
658661
)
659-
),
662+
)
660663
)
661664
backfill(name = "unit_test_group_by_descriptive_stats",
662-
source = source,
663-
endPartition = endPartition,
664-
namespace = namespace,
665-
tableUtils = tableUtils,
666-
additionalAgg = aggs)
665+
source = source,
666+
endPartition = endPartition,
667+
namespace = namespace,
668+
tableUtils = tableUtils,
669+
additionalAgg = aggs)
667670
}
668671
}

0 commit comments

Comments
 (0)