fix

mihailotim-db · mihailotim-db · commit bfd67d01bc2f · 2025-07-07T13:09:55.000+02:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -504,6 +504,7 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor
       ResolveEncodersInUDF),
     Batch("Subquery", Once,
       UpdateOuterReferences),
+    Batch("Strip __is_duplicate metadata", Once, StripIsDuplicateMetadata),
     Batch("Cleanup", fixedPoint,
       CleanupAliases),
     Batch("HandleSpecialCommand", Once,
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StripIsDuplicateMetadata.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StripIsDuplicateMetadata.scala
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.analysis
+
+import org.apache.spark.sql.catalyst.SQLConfHelper
+import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute}
+import org.apache.spark.sql.catalyst.plans.logical.{AnalysisHelper, LogicalPlan}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.catalyst.trees.CurrentOrigin
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.MetadataBuilder
+
+/**
+ * Strips is duplicate metadata from all attributes and aliases as it is no longer needed after
+ * resolution.
+ */
+object StripIsDuplicateMetadata extends Rule[LogicalPlan] with SQLConfHelper {
+  def apply(plan: LogicalPlan): LogicalPlan = AnalysisHelper.allowInvokingTransformsInAnalyzer {
+    if (!conf.getConf(SQLConf.STRIP_IS_DUPLICATE_METADATA)) {
+      plan
+    } else {
+      plan transformAllExpressions {
+        case alias: Alias if alias.metadata.contains("__is_duplicate") =>
+          val newMetadata = new MetadataBuilder()
+            .withMetadata(alias.metadata)
+            .remove("__is_duplicate")
+            .build()
+
+          val newAlias = CurrentOrigin.withOrigin(alias.origin) {
+            Alias(child = alias.child, name = alias.name)(
+              exprId = alias.exprId,
+              qualifier = alias.qualifier,
+              explicitMetadata = Some(newMetadata),
+              nonInheritableMetadataKeys = alias.nonInheritableMetadataKeys
+            )
+          }
+          newAlias.copyTagsFrom(alias)
+
+          newAlias
+        case attribute: Attribute if attribute.metadata.contains("__is_duplicate") =>
+          val newMetadata = new MetadataBuilder()
+            .withMetadata(attribute.metadata)
+            .remove("__is_duplicate")
+            .build()
+
+          val newAttribute = CurrentOrigin.withOrigin(attribute.origin) {
+            attribute.withMetadata(newMetadata = newMetadata)
+          }
+          newAttribute.copyTagsFrom(attribute)
+
+          newAttribute
+      }
+    }
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolverRunner.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolverRunner.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst.analysis.resolver
 
 import org.apache.spark.sql.catalyst.{QueryPlanningTracker, SQLConfHelper}
-import org.apache.spark.sql.catalyst.analysis.{AnalysisContext, CleanupAliases}
+import org.apache.spark.sql.catalyst.analysis.{AnalysisContext, CleanupAliases, StripIsDuplicateMetadata}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.internal.SQLConf
@@ -41,6 +41,7 @@ class ResolverRunner(
    */
   private val planRewriteRules: Seq[Rule[LogicalPlan]] = Seq(
     PruneMetadataColumns,
+    StripIsDuplicateMetadata,
     CleanupAliases
   )
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -241,6 +241,14 @@ object SQLConf {
     }
   }
 
+  val STRIP_IS_DUPLICATE_METADATA =
+    buildConf("spark.sql.analyzer.stripIsDuplicateMetadata")
+    .internal()
+    .doc("When true, strip __is_duplicate metadata after resolution batch in analysis since " +
+      "it is no longer needed.")
+    .booleanConf
+    .createWithDefault(true)
+
   val ONLY_NECESSARY_AND_UNIQUE_METADATA_COLUMNS =
     buildConf("spark.sql.analyzer.uniqueNecessaryMetadataColumns")
     .internal()
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StripIsDuplicateMetadataSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StripIsDuplicateMetadataSuite.scala
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import java.util.ArrayList
+
+import org.apache.spark.sql.catalyst.expressions.NamedExpression
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.test.SharedSparkSession
+
+class StripIsDuplicateMetadataSuite extends QueryTest with SharedSparkSession {
+
+  test("Strip __is_duplicate from project list") {
+    withTable("t1") {
+      sql("CREATE TABLE t1(col1 INT, col2 INT)")
+      val query =
+        """SELECT * FROM (
+          | SELECT col1, col1 FROM t1
+          | UNION
+          | SELECT col1, col2 FROM t1
+          |)""".stripMargin
+
+      checkMetadata(query)
+    }
+  }
+
+  test("Strip __is_duplicate from Union output") {
+    withTable("t1") {
+      sql("CREATE TABLE t1(col1 INT, col2 INT)")
+      val query =
+        """SELECT col1, col1 FROM t1
+          |UNION
+          |SELECT col1, col2 FROM t1""".stripMargin
+
+      checkMetadata(query)
+    }
+  }
+
+  test("Strip __is_duplicate from CTEs") {
+    withTable("t1") {
+      sql("CREATE TABLE t1(col1 INT, col2 INT)")
+      val query =
+        """WITH cte1 AS (
+          | SELECT col1, col1 FROM t1
+          |),
+          |cte2 AS (
+          | SELECT col1, col2 FROM t1
+          |)
+          |SELECT * FROM cte1
+          |UNION
+          |SELECT * FROM cte2""".stripMargin
+
+      checkMetadata(query)
+    }
+  }
+
+  private def checkMetadata(query: String): Unit = {
+    for (stripMetadata <- Seq(true, false)) {
+      withSQLConf(SQLConf.STRIP_IS_DUPLICATE_METADATA.key -> stripMetadata.toString) {
+        val analyzedPlan = sql(query).queryExecution.analyzed
+        val duplicateAttributes = new ArrayList[NamedExpression]
+        analyzedPlan.foreach {
+          case plan: LogicalPlan => plan.expressions.foreach {
+            case namedExpression: NamedExpression
+                if namedExpression.metadata.contains("__is_duplicate") =>
+              duplicateAttributes.add(namedExpression)
+            case _ =>
+          }
+        }
+        assert(duplicateAttributes.isEmpty == stripMetadata)
+      }
+    }
+  }
+}

Original file line number	Diff line number	Diff line change
`@@ -241,6 +241,14 @@ object SQLConf {`
`241`	`241`	`}`
`242`	`242`	`}`
`243`	`243`
	`244`	`+ val STRIP_IS_DUPLICATE_METADATA =`
	`245`	`+ buildConf("spark.sql.analyzer.stripIsDuplicateMetadata")`
	`246`	`+ .internal()`
	`247`	`+ .doc("When true, strip __is_duplicate metadata after resolution batch in analysis since " +`
	`248`	`+ "it is no longer needed.")`
	`249`	`+ .booleanConf`
	`250`	`+ .createWithDefault(true)`
	`251`	`+`
`244`	`252`	`val ONLY_NECESSARY_AND_UNIQUE_METADATA_COLUMNS =`
`245`	`253`	`buildConf("spark.sql.analyzer.uniqueNecessaryMetadataColumns")`
`246`	`254`	`.internal()`