
Commit a190873

anishm-db authored and sryza committed
[SPARK-52591][SDP] Validate streaming-ness of DFs returned by SDP table and standalone flow definitions
### What changes were proposed in this pull request?

Validate that streaming flows are actually backed by streaming sources, and that batch flows are actually backed by batch sources. Also make the SDP test harnesses explicit about whether a streaming table or a materialized view is being created, to match the Python/SQL API.

### Why are the changes needed?

This change helps prevent incorrect usage of streaming/batch flows, such as directly reading from a batch source in a streaming table's flow. In that case, for example, the `STREAM` keyword should be used in SQL to mark a batch source as streaming, or `readStream` should be used in Python to stream-read from an otherwise non-streaming file source.

### Does this PR introduce _any_ user-facing change?

No, as this impacts SDP, which is not released in any Spark version yet.

### How was this patch tested?

Existing suites and added tests in `ConnectInvalidPipelineSuite`.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #51208 from AnishMahto/sdp-validate-flow-streamingness.

Lead-authored-by: anishm-db <[email protected]>
Co-authored-by: Anish Mahto <[email protected]>
Signed-off-by: Sandy Ryza <[email protected]>
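
For illustration, a minimal Python sketch of the usage this validation enforces. It assumes the SDP decorator API exercised in the test suites below (`@sdp.table`, `@sdp.materialized_view`) and a `spark` session provided by the pipeline runtime; the import path shown is a placeholder, not confirmed by this commit.

```python
# Hypothetical pipeline definition file; `sdp` stands in for the SDP decorator module,
# and `spark` is assumed to be supplied by the pipeline runtime (as in the test snippets below).
from pyspark import pipelines as sdp  # assumed import path

@sdp.table  # streaming table: its flow must return a streaming DataFrame
def events():
    return spark.readStream.format("rate").load()

@sdp.materialized_view  # materialized view: its flow must return a batch DataFrame
def events_summary():
    return spark.read.table("spark_catalog.default.src")
```
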
1 parent af5632f commit a190873

File tree

15 files changed: +359 −157 lines changed

common/utils/src/main/resources/error/error-conditions.json

Lines changed: 28 additions & 0 deletions
@@ -2743,6 +2743,34 @@
     ],
     "sqlState" : "42000"
   },
+  "INVALID_FLOW_QUERY_TYPE" : {
+    "message" : [
+      "Flow <flowIdentifier> returns an invalid relation type."
+    ],
+    "subClass" : {
+      "BATCH_RELATION_FOR_STREAMING_TABLE" : {
+        "message" : [
+          "Streaming tables may only be defined by streaming relations, but the flow <flowIdentifier> attempts to write a batch relation to the streaming table <tableIdentifier>. Consider using the STREAM operator in Spark-SQL to convert the batch relation into a streaming relation, or populating the streaming table with an append once-flow instead."
+        ]
+      },
+      "STREAMING_RELATION_FOR_MATERIALIZED_VIEW" : {
+        "message" : [
+          "Materialized views may only be defined by a batch relation, but the flow <flowIdentifier> attempts to write a streaming relation to the materialized view <tableIdentifier>."
+        ]
+      },
+      "STREAMING_RELATION_FOR_ONCE_FLOW" : {
+        "message" : [
+          "<flowIdentifier> is an append once-flow that is defined by a streaming relation. Append once-flows may only be defined by or return a batch relation."
+        ]
+      },
+      "STREAMING_RELATION_FOR_PERSISTED_VIEW" : {
+        "message" : [
+          "Persisted views may only be defined by a batch relation, but the flow <flowIdentifier> attempts to write a streaming relation to the persisted view <viewIdentifier>."
+        ]
+      }
+    },
+    "sqlState" : "42000"
+  },
   "INVALID_FORMAT" : {
     "message" : [
       "The format is invalid: <format>."

sql/connect/server/src/main/scala/org/apache/spark/sql/connect/pipelines/PipelinesHandler.scala

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ private[connect] object PipelinesHandler extends Logging {
           language = Option(Python())),
         format = Option.when(dataset.hasFormat)(dataset.getFormat),
         normalizedPath = None,
-        isStreamingTableOpt = None))
+        isStreamingTable = dataset.getDatasetType == proto.DatasetType.TABLE))
     case proto.DatasetType.TEMPORARY_VIEW =>
       val viewIdentifier =
         GraphIdentifierManager.parseTableIdentifier(dataset.getDatasetName, sparkSession)

sql/connect/server/src/test/scala/org/apache/spark/sql/connect/pipelines/PythonPipelineSuite.scala

Lines changed: 7 additions & 7 deletions
@@ -93,7 +93,7 @@ class PythonPipelineSuite
     val graph = buildGraph("""
        |@sdp.table
        |def table1():
-       |  return spark.range(10)
+       |  return spark.readStream.format("rate").load()
        |""".stripMargin)
       .resolve()
       .validate()
@@ -112,11 +112,11 @@ class PythonPipelineSuite
        |def c():
        |  return spark.readStream.table("a")
        |
-       |@sdp.table()
+       |@sdp.materialized_view()
        |def d():
        |  return spark.read.table("a")
        |
-       |@sdp.table()
+       |@sdp.materialized_view()
        |def a():
        |  return spark.range(5)
        |""".stripMargin)
@@ -177,11 +177,11 @@ class PythonPipelineSuite
   test("referencing external datasets") {
     sql("CREATE TABLE spark_catalog.default.src AS SELECT * FROM RANGE(5)")
     val graph = buildGraph("""
-       |@sdp.table
+       |@sdp.materialized_view
        |def a():
        |  return spark.read.table("spark_catalog.default.src")
        |
-       |@sdp.table
+       |@sdp.materialized_view
        |def b():
        |  return spark.table("spark_catalog.default.src")
        |
@@ -230,11 +230,11 @@ class PythonPipelineSuite
        |def a():
        |  return spark.read.table("spark_catalog.default.src")
        |
-       |@sdp.table
+       |@sdp.materialized_view
        |def b():
        |  return spark.table("spark_catalog.default.src")
        |
-       |@sdp.table
+       |@sdp.materialized_view
        |def c():
        |  return spark.readStream.table("spark_catalog.default.src")
        |""".stripMargin).resolve()

sql/connect/server/src/test/scala/org/apache/spark/sql/connect/pipelines/SparkDeclarativePipelinesServerSuite.scala

Lines changed: 2 additions & 2 deletions
@@ -206,7 +206,7 @@ class SparkDeclarativePipelinesServerSuite
         sql = Some("SELECT * FROM STREAM tableA"))
       createTable(
         name = "tableC",
-        datasetType = DatasetType.TABLE,
+        datasetType = DatasetType.MATERIALIZED_VIEW,
         sql = Some("SELECT * FROM tableB"))
     }

@@ -238,7 +238,7 @@ class SparkDeclarativePipelinesServerSuite
       createView(name = "viewC", sql = "SELECT * FROM curr.tableB")
       createTable(
         name = "other.tableD",
-        datasetType = proto.DatasetType.TABLE,
+        datasetType = proto.DatasetType.MATERIALIZED_VIEW,
         sql = Some("SELECT * FROM viewC"))
     }

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/CoreDataflowNodeProcessor.scala

Lines changed: 1 addition & 7 deletions
@@ -80,12 +80,6 @@ class CoreDataflowNodeProcessor(rawGraph: DataflowGraph) {
         val resolvedFlowsToTable = flowsToTable.map { flow =>
           resolvedFlowNodesMap.get(flow.identifier)
         }
-
-        // Assign isStreamingTable (MV or ST) to the table based on the resolvedFlowsToTable
-        val tableWithType = table.copy(
-          isStreamingTableOpt = Option(resolvedFlowsToTable.exists(f => f.df.isStreaming))
-        )
-
         // We mark all tables as virtual to ensure resolution uses incoming flows
         // rather than previously materialized tables.
         val virtualTableInput = VirtualTableInput(
@@ -95,7 +89,7 @@ class CoreDataflowNodeProcessor(rawGraph: DataflowGraph) {
           availableFlows = resolvedFlowsToTable
         )
         resolvedInputs.put(table.identifier, virtualTableInput)
-        Seq(tableWithType)
+        Seq(table)
       case view: View =>
         // For view, add the flow to resolvedInputs and return empty.
        require(upstreamNodes.size == 1, "Found multiple flows to view")

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DataflowGraph.scala

Lines changed: 1 addition & 0 deletions
@@ -191,6 +191,7 @@ case class DataflowGraph(flows: Seq[Flow], tables: Seq[Table], views: Seq[View])
       validatePersistedViewSources()
       validateEveryDatasetHasFlow()
       validateTablesAreResettable()
+      validateFlowStreamingness()
       inferredSchema
     }.failed

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/DatasetManager.scala

Lines changed: 2 additions & 2 deletions
@@ -178,15 +178,15 @@ object DatasetManager extends Logging {
     }

     // Wipe the data if we need to
-    if ((isFullRefresh || !table.isStreamingTableOpt.get) && existingTableOpt.isDefined) {
+    if ((isFullRefresh || !table.isStreamingTable) && existingTableOpt.isDefined) {
       context.spark.sql(s"TRUNCATE TABLE ${table.identifier.quotedString}")
     }

     // Alter the table if we need to
     if (existingTableOpt.isDefined) {
       val existingSchema = existingTableOpt.get.schema()

-      val targetSchema = if (table.isStreamingTableOpt.get && !isFullRefresh) {
+      val targetSchema = if (table.isStreamingTable && !isFullRefresh) {
         SchemaMergingUtils.mergeSchemas(existingSchema, outputSchema)
       } else {
         outputSchema

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/GraphValidations.scala

Lines changed: 76 additions & 0 deletions
@@ -55,6 +55,82 @@ trait GraphValidations extends Logging {
     multiQueryTables
   }

+  /**
+   * Validate that each resolved flow is correctly either a streaming flow or non-streaming flow,
+   * depending on the flow type (ex. once flow vs non-once flow) and the dataset type the flow
+   * writes to (ex. streaming table vs materialized view).
+   */
+  protected[graph] def validateFlowStreamingness(): Unit = {
+    flowsTo.foreach { case (destTableIdentifier, flowsToDataset) =>
+      // The identifier should correspond to exactly one of a table or view
+      val destTableOpt = table.get(destTableIdentifier)
+      val destViewOpt = view.get(destTableIdentifier)
+
+      val resolvedFlowsToDataset: Seq[ResolvedFlow] = flowsToDataset.collect {
+        case rf: ResolvedFlow => rf
+      }
+
+      resolvedFlowsToDataset.foreach { resolvedFlow: ResolvedFlow =>
+        // A flow must be successfully analyzed, thus resolved, in order to determine if it is
+        // streaming or not. Unresolved flows will throw an exception anyway via
+        // [[validateSuccessfulFlowAnalysis]], so don't check them here.
+        if (resolvedFlow.once) {
+          // Once flows by definition should be batch flows, not streaming.
+          if (resolvedFlow.df.isStreaming) {
+            throw new AnalysisException(
+              errorClass = "INVALID_FLOW_QUERY_TYPE.STREAMING_RELATION_FOR_ONCE_FLOW",
+              messageParameters = Map(
+                "flowIdentifier" -> resolvedFlow.identifier.quotedString
+              )
+            )
+          }
+        } else {
+          destTableOpt.foreach { destTable =>
+            if (destTable.isStreamingTable) {
+              if (!resolvedFlow.df.isStreaming) {
+                throw new AnalysisException(
+                  errorClass = "INVALID_FLOW_QUERY_TYPE.BATCH_RELATION_FOR_STREAMING_TABLE",
+                  messageParameters = Map(
+                    "flowIdentifier" -> resolvedFlow.identifier.quotedString,
+                    "tableIdentifier" -> destTableIdentifier.quotedString
+                  )
+                )
+              }
+            } else {
+              if (resolvedFlow.df.isStreaming) {
+                // This check intentionally does NOT prevent materialized views from reading from
+                // a streaming table using a _batch_ read, which is still considered valid.
+                throw new AnalysisException(
+                  errorClass = "INVALID_FLOW_QUERY_TYPE.STREAMING_RELATION_FOR_MATERIALIZED_VIEW",
+                  messageParameters = Map(
+                    "flowIdentifier" -> resolvedFlow.identifier.quotedString,
+                    "tableIdentifier" -> destTableIdentifier.quotedString
+                  )
+                )
+              }
+            }
+          }
+
+          destViewOpt.foreach {
+            case _: PersistedView =>
+              if (resolvedFlow.df.isStreaming) {
+                throw new AnalysisException(
+                  errorClass = "INVALID_FLOW_QUERY_TYPE.STREAMING_RELATION_FOR_PERSISTED_VIEW",
+                  messageParameters = Map(
+                    "flowIdentifier" -> resolvedFlow.identifier.quotedString,
+                    "viewIdentifier" -> destTableIdentifier.quotedString
+                  )
+                )
+              }
+            case _: TemporaryView =>
+              // Temporary views' flows are allowed to be either streaming or batch, so no
+              // validation needs to be done for them
+          }
+        }
+      }
+    }
+  }
+
   /** Throws an exception if the flows in this graph are not topologically sorted. */
   protected[graph] def validateGraphIsTopologicallySorted(): Unit = {
     val visitedNodes = mutable.Set.empty[TableIdentifier] // Set of visited nodes
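
A short sketch of the materialized-view side of this check, again assuming the Python decorator API from the test suites and a runtime-provided `spark` session; as the comment in the validation above notes, a batch read of a streaming table remains valid:

```python
@sdp.materialized_view
def mv_bad():
    # Streaming relation -> INVALID_FLOW_QUERY_TYPE.STREAMING_RELATION_FOR_MATERIALIZED_VIEW
    return spark.readStream.table("a")

@sdp.materialized_view
def mv_ok():
    # Batch read of table "a" (even if "a" is a streaming table) is still allowed
    return spark.read.table("a")
```
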

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/SqlGraphRegistrationContext.scala

Lines changed: 3 additions & 3 deletions
@@ -199,7 +199,7 @@ class SqlGraphRegistrationContext(
           ),
           format = cst.tableSpec.provider,
           normalizedPath = None,
-          isStreamingTableOpt = None
+          isStreamingTable = true
         )
       )
     }
@@ -230,7 +230,7 @@ class SqlGraphRegistrationContext(
           ),
           format = cst.tableSpec.provider,
           normalizedPath = None,
-          isStreamingTableOpt = None
+          isStreamingTable = true
         )
       )

@@ -281,7 +281,7 @@ class SqlGraphRegistrationContext(
           ),
           format = cmv.tableSpec.provider,
           normalizedPath = None,
-          isStreamingTableOpt = None
+          isStreamingTable = false
         )
       )

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/elements.scala

Lines changed: 2 additions & 14 deletions
@@ -114,8 +114,7 @@ sealed trait TableInput extends Input {
  *             path (if not defined, we will normalize a managed storage path for it).
  * @param properties Table Properties to set in table metadata.
  * @param comment User-specified comment that can be placed on the table.
- * @param isStreamingTableOpt if the table is a streaming table, will be None until we have resolved
- *                            flows into table
+ * @param isStreamingTable if the table is a streaming table, as defined by the source code.
  */
 case class Table(
     identifier: TableIdentifier,
@@ -125,7 +124,7 @@ case class Table(
     properties: Map[String, String] = Map.empty,
     comment: Option[String],
     baseOrigin: QueryOrigin,
-    isStreamingTableOpt: Option[Boolean],
+    isStreamingTable: Boolean,
     format: Option[String]
 ) extends TableInput
   with Output {
@@ -163,17 +162,6 @@ case class Table(
       normalizedPath.get
     }

-  /**
-   * Tell if a table is a streaming table or not. This property is not set until we have resolved
-   * the flows into the table. The exception reminds engineers that they cant call at random time.
-   */
-  def isStreamingTable: Boolean = isStreamingTableOpt.getOrElse {
-    throw new IllegalStateException(
-      "Cannot identify whether the table is streaming table or not. You may need to resolve the " +
-      "flows into table."
-    )
-  }
-
   /**
    * Get the DatasetType of the table
    */