Commit e3d6632

Refactor on comments
1 parent 6affb63 commit e3d6632

File tree

1 file changed: +67 -41 lines changed

spark/src/main/scala/ai/chronon/spark/TableUtils.scala

Lines changed: 67 additions & 41 deletions
@@ -48,6 +48,27 @@ import scala.util.{Failure, Success, Try}
  * retrieve metadata / configure it appropriately at creation time
  */
 trait Format {
+  // Return the primary partitions (based on the 'partitionColumn') filtered down by sub-partition filters if provided
+  // If subpartition filters are supplied and the format doesn't support it, we throw an error
+  def primaryPartitions(tableName: String, partitionColumn: String, subPartitionsFilter: Map[String, String] = Map.empty)(implicit sparkSession: SparkSession): Seq[String] = {
+    if (!supportSubPartitionsFilter && subPartitionsFilter.nonEmpty) {
+      throw new NotImplementedError(s"subPartitionsFilter is not supported on this format")
+    }
+
+    val partitionSeq = partitions(tableName)(sparkSession)
+    partitionSeq.flatMap { partitionMap =>
+      if (
+        subPartitionsFilter.forall {
+          case (k, v) => partitionMap.get(k).contains(v)
+        }
+      ) {
+        partitionMap.get(partitionColumn)
+      } else {
+        None
+      }
+    }
+  }
+
   // Return a sequence for partitions where each partition entry consists of a Map of partition keys to values
   def partitions(tableName: String)(implicit sparkSession: SparkSession): Seq[Map[String, String]]

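For illustration (not part of this commit): a minimal, self-contained sketch of the filtering logic used by the default primaryPartitions above, with the Spark-backed partitions(tableName) listing replaced by an in-memory Seq. The object name and sample data are hypothetical.

// Hypothetical sketch: mirrors the sub-partition filtering of the default primaryPartitions,
// but takes the partition listing as a plain argument instead of calling partitions(tableName).
object PrimaryPartitionsSketch {
  def primaryPartitions(partitionSeq: Seq[Map[String, String]],
                        partitionColumn: String,
                        subPartitionsFilter: Map[String, String] = Map.empty): Seq[String] =
    partitionSeq.flatMap { partitionMap =>
      // keep a partition only if every sub-partition filter entry matches its value
      if (subPartitionsFilter.forall { case (k, v) => partitionMap.get(k).contains(v) })
        partitionMap.get(partitionColumn) // emit the primary partition value, e.g. the 'ds'
      else
        None
    }

  def main(args: Array[String]): Unit = {
    val listing = Seq(
      Map("ds" -> "2024-01-01", "hr" -> "00"),
      Map("ds" -> "2024-01-01", "hr" -> "01"),
      Map("ds" -> "2024-01-02", "hr" -> "00")
    )
    // prints List(2024-01-01, 2024-01-02): only partitions whose 'hr' sub-partition is "00"
    println(primaryPartitions(listing, "ds", Map("hr" -> "00")))
  }
}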
@@ -56,6 +77,9 @@ trait Format {

   // Help specify the appropriate file format to use in the Spark create table DDL query
   def fileFormatString(format: String): String
+
+  // Does this format support sub partitions filters
+  def supportSubPartitionsFilter: Boolean
 }

 /**
@@ -143,6 +167,9 @@ case class DefaultFormatProvider(sparkSession: SparkSession) extends FormatProvider
 }

 case object Hive extends Format {
+  override def primaryPartitions(tableName: String, partitionColumn: String, subPartitionsFilter: Map[String, String])(implicit sparkSession: SparkSession): Seq[String] =
+    super.primaryPartitions(tableName, partitionColumn, subPartitionsFilter)
+
   def parseHivePartition(pstring: String): Map[String, String] = {
     pstring
       .split("/")
@@ -166,24 +193,61 @@ case object Hive extends Format {

   def createTableTypeString: String = ""
   def fileFormatString(format: String): String = s"STORED AS $format"
+
+  override def supportSubPartitionsFilter: Boolean = true
 }

 case object Iceberg extends Format {
+  override def primaryPartitions(tableName: String, partitionColumn: String, subPartitionsFilter: Map[String, String])(implicit sparkSession: SparkSession): Seq[String] = {
+    if (!supportSubPartitionsFilter && subPartitionsFilter.nonEmpty) {
+      throw new NotImplementedError(s"subPartitionsFilter is not supported on this format")
+    }
+
+    getIcebergPartitions(tableName)
+  }
+
   override def partitions(tableName: String)(implicit sparkSession: SparkSession): Seq[Map[String, String]] = {
     throw new NotImplementedError(
       "Multi-partitions retrieval is not supported on Iceberg tables yet." +
         "For single partition retrieval, please use 'partition' method.")
   }

+  private def getIcebergPartitions(tableName: String)(implicit sparkSession: SparkSession): Seq[String] = {
+    val partitionsDf = sparkSession.read.format("iceberg").load(s"$tableName.partitions")
+    val index = partitionsDf.schema.fieldIndex("partition")
+    if (partitionsDf.schema(index).dataType.asInstanceOf[StructType].fieldNames.contains("hr")) {
+      // Hour filter is currently buggy in iceberg. https://github.com/apache/iceberg/issues/4718
+      // so we collect and then filter.
+      partitionsDf
+        .select("partition.ds", "partition.hr")
+        .collect()
+        .filter(_.get(1) == null)
+        .map(_.getString(0))
+        .toSeq
+    } else {
+      partitionsDf
+        .select("partition.ds")
+        .collect()
+        .map(_.getString(0))
+        .toSeq
+    }
+  }
+
   def createTableTypeString: String = "USING iceberg"
   def fileFormatString(format: String): String = ""
+
+  override def supportSubPartitionsFilter: Boolean = false
 }

 // The Delta Lake format is compatible with the Delta lake and Spark versions currently supported by the project.
 // Attempting to use newer Delta lake library versions (e.g. 3.2 which works with Spark 3.5) results in errors:
 // java.lang.NoSuchMethodError: 'org.apache.spark.sql.delta.Snapshot org.apache.spark.sql.delta.DeltaLog.update(boolean)'
 // In such cases, you should implement your own FormatProvider built on the newer Delta lake version
 case object DeltaLake extends Format {
+
+  override def primaryPartitions(tableName: String, partitionColumn: String, subPartitionsFilter: Map[String, String])(implicit sparkSession: SparkSession): Seq[String] =
+    super.primaryPartitions(tableName, partitionColumn, subPartitionsFilter)
+
   override def partitions(tableName: String)(implicit sparkSession: SparkSession): Seq[Map[String, String]] = {
     // delta lake doesn't support the `SHOW PARTITIONS <tableName>` syntax - https://github.com/delta-io/delta/issues/996
     // there's alternative ways to retrieve partitions using the DeltaLog abstraction which is what we have to lean into
@@ -200,6 +264,8 @@ case object DeltaLake extends Format {

   def createTableTypeString: String = "USING DELTA"
   def fileFormatString(format: String): String = ""
+
+  override def supportSubPartitionsFilter: Boolean = true
 }

 case class TableUtils(sparkSession: SparkSession) {
@@ -315,47 +381,7 @@ case class TableUtils(sparkSession: SparkSession) {
   def partitions(tableName: String, subPartitionsFilter: Map[String, String] = Map.empty): Seq[String] = {
     if (!tableExists(tableName)) return Seq.empty[String]
     val format = tableReadFormat(tableName)
-
-    if (format == Iceberg) {
-      if (subPartitionsFilter.nonEmpty) {
-        throw new NotImplementedError("subPartitionsFilter is not supported on Iceberg tables yet.")
-      }
-      return getIcebergPartitions(tableName)
-    }
-
-    val partitionSeq = format.partitions(tableName)(sparkSession)
-    partitionSeq.flatMap { partitionMap =>
-      if (
-        subPartitionsFilter.forall {
-          case (k, v) => partitionMap.get(k).contains(v)
-        }
-      ) {
-        partitionMap.get(partitionColumn)
-      } else {
-        None
-      }
-    }
-  }
-
-  private def getIcebergPartitions(tableName: String): Seq[String] = {
-    val partitionsDf = sparkSession.read.format("iceberg").load(s"$tableName.partitions")
-    val index = partitionsDf.schema.fieldIndex("partition")
-    if (partitionsDf.schema(index).dataType.asInstanceOf[StructType].fieldNames.contains("hr")) {
-      // Hour filter is currently buggy in iceberg. https://github.com/apache/iceberg/issues/4718
-      // so we collect and then filter.
-      partitionsDf
-        .select("partition.ds", "partition.hr")
-        .collect()
-        .filter(_.get(1) == null)
-        .map(_.getString(0))
-        .toSeq
-    } else {
-      partitionsDf
-        .select("partition.ds")
-        .collect()
-        .map(_.getString(0))
-        .toSeq
-    }
+    format.primaryPartitions(tableName, partitionColumn, subPartitionsFilter)(sparkSession)
   }

   // Given a table and a query extract the schema of the columns involved as input.

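For illustration only (not in this commit): a hypothetical format showing how an implementation plugs into the refactored Format trait. It supplies partitions, the DDL strings, and supportSubPartitionsFilter, and inherits the default primaryPartitions. The SHOW PARTITIONS parsing below mirrors the Hive case and is an assumption, not project code.

import org.apache.spark.sql.SparkSession

// Hypothetical example format; assumes the Format trait defined in TableUtils.scala is in scope.
case object HypotheticalShowPartitionsFormat extends Format {

  // List partitions via SHOW PARTITIONS and parse 'k1=v1/k2=v2' strings into maps,
  // similar to what Hive.parseHivePartition does.
  override def partitions(tableName: String)(implicit sparkSession: SparkSession): Seq[Map[String, String]] =
    sparkSession
      .sql(s"SHOW PARTITIONS $tableName")
      .collect()
      .map { row =>
        row
          .getString(0)
          .split("/")
          .map { kv =>
            val Array(k, v) = kv.split("=", 2)
            k -> v
          }
          .toMap
      }
      .toSeq

  def createTableTypeString: String = ""
  def fileFormatString(format: String): String = s"STORED AS $format"

  // Because this returns true, the inherited primaryPartitions applies subPartitionsFilter
  // instead of throwing NotImplementedError.
  override def supportSubPartitionsFilter: Boolean = true
}

With a format like this returned by the FormatProvider, TableUtils.partitions simply delegates to format.primaryPartitions, as the last hunk of the diff shows.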