[BugFix] fix wrong order by scope for distinct query (backport StarRocks#37910)

packy92 · packy92 · commit 1442a1fb0b83 · 2023-12-29T10:34:08.000+08:00
Signed-off-by: packy92 &lt;wangchao@starrocks.com&gt;
diff --git a/fe/fe-core/src/main/java/com/starrocks/qe/SessionVariable.java b/fe/fe-core/src/main/java/com/starrocks/qe/SessionVariable.java
@@ -423,6 +423,8 @@ public class SessionVariable implements Serializable, Writable, Cloneable {
     public static final String ENABLE_ICEBERG_IDENTITY_COLUMN_OPTIMIZE = "enable_iceberg_identity_column_optimize";
     public static final String ENABLE_PIPELINE_LEVEL_SHUFFLE = "enable_pipeline_level_shuffle";
 
+    public static final String ENABLE_STRICT_ORDER_BY = "enable_strict_order_by";
+
     // Flag to control whether to proxy follower's query statement to leader/follower.
     public enum FollowerQueryForwardMode {
         DEFAULT,    // proxy queries by the follower's replay progress (default)
@@ -1452,6 +1454,8 @@ public SessionVariableConstants.ChooseInstancesMode getChooseExecuteInstancesMod
     @VarAttr(name = CBO_EQ_BASE_TYPE, flag = VariableMgr.INVISIBLE)
     private String cboEqBaseType = SessionVariableConstants.VARCHAR;
 
+
+
     public boolean isCboDecimalCastStringStrict() {
         return cboDecimalCastStringStrict;
     }
@@ -1489,6 +1493,9 @@ public void setCboDeriveRangeJoinPredicate(boolean cboDeriveRangeJoinPredicate)
     @VarAttr(name = FOLLOWER_QUERY_FORWARD_MODE, flag = VariableMgr.INVISIBLE | VariableMgr.DISABLE_FORWARD_TO_LEADER)
     private String followerForwardMode = "";
 
+    @VarAttr(name = ENABLE_STRICT_ORDER_BY)
+    private boolean enableStrictOrderBy = true;
+
     public void setFollowerQueryForwardMode(String mode) {
         this.followerForwardMode = mode;
     }
@@ -2810,6 +2817,14 @@ public void setCrossJoinCostPenalty(long crossJoinCostPenalty) {
         this.crossJoinCostPenalty = crossJoinCostPenalty;
     }
 
+    public boolean isEnableStrictOrderBy() {
+        return enableStrictOrderBy;
+    }
+
+    public void setEnableStrictOrderBy(boolean enableStrictOrderBy) {
+        this.enableStrictOrderBy = enableStrictOrderBy;
+    }
+
     // Serialize to thrift object
     // used for rest api
     public TQueryOptions toThrift() {
diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/analyzer/AggregationAnalyzer.java b/fe/fe-core/src/main/java/com/starrocks/sql/analyzer/AggregationAnalyzer.java
@@ -46,6 +46,7 @@
 import com.starrocks.qe.SqlModeHelper;
 import com.starrocks.sql.ast.ArrayExpr;
 import com.starrocks.sql.ast.AstVisitor;
+import com.starrocks.sql.ast.FieldReference;
 import com.starrocks.sql.ast.LambdaFunctionExpr;
 import com.starrocks.sql.ast.QueryStatement;
 
@@ -115,9 +116,16 @@ public Boolean visit(ParseNode expr) {
             return super.visit(expr);
         }
 
+        @Override
+        public Boolean visitFieldReference(FieldReference node, Void context) {
+            String colInfo = node.getTblName() == null ? "column" : "column of " + node.getTblName().toString();
+            throw new SemanticException(colInfo + " must appear in the GROUP BY clause or be used in an aggregate function",
+                    node.getPos());
+        }
+
         @Override
         public Boolean visitExpression(Expr node, Void context) {
-            throw new SemanticException(PARSER_ERROR_MSG.unsupportedExprWithInfo(node.toSql(), "GROUP BY"),
+            throw new SemanticException(node.toSql() + " must appear in the GROUP BY clause or be used in an aggregate function",
                     node.getPos());
         }
 
diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/analyzer/SelectAnalyzer.java b/fe/fe-core/src/main/java/com/starrocks/sql/analyzer/SelectAnalyzer.java
@@ -14,6 +14,7 @@
 
 package com.starrocks.sql.analyzer;
 
+import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Sets;
@@ -86,10 +87,11 @@ public void analyze(AnalyzeState analyzeState,
         analyzeHaving(havingClause, analyzeState, sourceScope, outputScope, outputExpressions);
 
         // Construct sourceAndOutputScope with sourceScope and outputScope
-        Scope sourceAndOutputScope = computeAndAssignOrderScope(analyzeState, sourceScope, outputScope);
+        Scope sourceAndOutputScope = computeAndAssignOrderScope(analyzeState, sourceScope, outputScope,
+                selectList.isDistinct());
 
         List<OrderByElement> orderByElements =
-                analyzeOrderBy(sortClause, analyzeState, sourceAndOutputScope, outputExpressions);
+                analyzeOrderBy(sortClause, analyzeState, sourceAndOutputScope, outputExpressions, selectList.isDistinct());
         List<Expr> orderByExpressions =
                 orderByElements.stream().map(OrderByElement::getExpr).collect(Collectors.toList());
 
@@ -183,7 +185,7 @@ public void analyze(AnalyzeState analyzeState,
                     .collect(Collectors.toList());
 
             Scope sourceScopeForOrder = new Scope(RelationId.anonymous(), new RelationFields(sourceForOrderFields));
-            computeAndAssignOrderScope(analyzeState, sourceScopeForOrder, outputScope);
+            computeAndAssignOrderScope(analyzeState, sourceScopeForOrder, outputScope, selectList.isDistinct());
             analyzeState.setOrderSourceExpressions(orderSourceExpressions);
         }
 
@@ -317,7 +319,8 @@ private List<Expr> analyzeSelect(SelectList selectList, Relation fromRelation, b
 
     private List<OrderByElement> analyzeOrderBy(List<OrderByElement> orderByElements, AnalyzeState analyzeState,
                                                 Scope orderByScope,
-                                                List<Expr> outputExpressions) {
+                                                List<Expr> outputExpressions,
+                                                boolean isDistinct) {
         if (orderByElements == null) {
             analyzeState.setOrderBy(Collections.emptyList());
             return Collections.emptyList();
@@ -333,20 +336,33 @@ private List<OrderByElement> analyzeOrderBy(List<OrderByElement> orderByElements
                 if (ordinal < 1 || ordinal > outputExpressions.size()) {
                     throw new SemanticException("ORDER BY position %s is not in select list", ordinal);
                 }
+                // index can ensure no ambiguous, we don't need to re-analyze this output expression
                 expression = outputExpressions.get((int) ordinal - 1);
-            }
-
-            if (expression instanceof FieldReference) {
-                // If the expression of order by is a FieldReference, it means that the type of sql is
+            } else if (expression instanceof FieldReference) {
+                // If the expression of order by is a FieldReference, and it's not a distinct select,
+                // it means that the type of sql is
                 // "select * from t order by 1", then this FieldReference cannot be parsed in OrderByScope,
                 // but should be parsed in sourceScope
-                analyzeExpression(expression, analyzeState, orderByScope.getParent());
+                if (isDistinct) {
+                    analyzeExpression(expression, analyzeState, orderByScope);
+                } else {
+                    analyzeExpression(expression, analyzeState, orderByScope.getParent());
+                }
             } else {
                 ExpressionAnalyzer expressionAnalyzer = new ExpressionAnalyzer(session);
                 expressionAnalyzer.analyzeWithoutUpdateState(expression, analyzeState, orderByScope);
                 List<Expr> aggregations = Lists.newArrayList();
                 expression.collectAll(e -> e.isAggregate(), aggregations);
-                aggregations.forEach(e -> analyzeExpression(e, analyzeState, orderByScope.getParent()));
+                if (isDistinct && !aggregations.isEmpty()) {
+                    throw new SemanticException("for SELECT DISTINCT, ORDER BY expressions must appear in select list",
+                            expression.getPos());
+                }
+
+                if (!aggregations.isEmpty()) {
+                    // use parent scope to analyze agg func firstly
+                    Preconditions.checkState(orderByScope.getParent() != null, "parent scope not be set");
+                    aggregations.forEach(e -> analyzeExpression(e, analyzeState, orderByScope.getParent()));
+                }
                 analyzeExpression(expression, analyzeState, orderByScope);
             }
 
@@ -661,23 +677,24 @@ public Expr visitSlot(SlotRef slotRef, Void context) {
         }
     }
 
-    private Scope computeAndAssignOrderScope(AnalyzeState analyzeState, Scope sourceScope, Scope outputScope) {
-        // The Scope used by order by allows parsing of the same column,
-        // such as 'select v1 as v, v1 as v from t0 order by v'
-        // but normal parsing does not allow it. So add a de-duplication operation here.
+    private Scope computeAndAssignOrderScope(AnalyzeState analyzeState, Scope sourceScope, Scope outputScope,
+                                             boolean isDistinct) {
+
+        List<Field> allFields = Lists.newArrayList();
+        // order by can only "see" fields from distinct output
+        if (isDistinct) {
+            allFields = removeDuplicateField(outputScope.getRelationFields().getAllFields());
+            Scope orderScope = new Scope(outputScope.getRelationId(), new RelationFields(allFields));
+            analyzeState.setOrderScope(orderScope);
+            return orderScope;
+        }
 
-        List<Field> allFields = new ArrayList<>();
         for (int i = 0; i < analyzeState.getOutputExprInOrderByScope().size(); ++i) {
             Field field = outputScope.getRelationFields()
                     .getFieldByIndex(analyzeState.getOutputExprInOrderByScope().get(i));
-            if (field.getName() != null && field.getOriginExpression() != null &&
-                    allFields.stream().anyMatch(f -> f.getOriginExpression() != null
-                            && f.getName() != null && field.getName().equals(f.getName())
-                            && field.getOriginExpression().equals(f.getOriginExpression()))) {
-                continue;
-            }
             allFields.add(field);
         }
+        allFields = removeDuplicateField(allFields);
 
         Scope orderScope = new Scope(outputScope.getRelationId(), new RelationFields(allFields));
 
@@ -694,4 +711,29 @@ private Scope computeAndAssignOrderScope(AnalyzeState analyzeState, Scope source
     private void analyzeExpression(Expr expr, AnalyzeState analyzeState, Scope scope) {
         ExpressionAnalyzer.analyzeExpression(expr, analyzeState, scope, session);
     }
+
+
+    // The Scope used by order by allows parsing of the same column,
+    // such as 'select v1 as v, v1 as v from t0 order by v'
+    // but normal parsing does not allow it. So add a de-duplication operation here.
+    private List<Field> removeDuplicateField(List<Field> originalFields) {
+        List<Field> allFields = Lists.newArrayList();
+        for (Field field : originalFields) {
+            if (session.getSessionVariable().isEnableStrictOrderBy()) {
+                if (field.getName() != null && field.getOriginExpression() != null &&
+                        allFields.stream().anyMatch(f -> f.getOriginExpression() != null
+                                && f.getName() != null && field.getName().equals(f.getName())
+                                && field.getOriginExpression().equals(f.getOriginExpression()))) {
+                    continue;
+                }
+            } else {
+                if (field.getName() != null &&
+                        allFields.stream().anyMatch(f -> f.getName() != null && field.getName().equals(f.getName()))) {
+                    continue;
+                }
+            }
+            allFields.add(field);
+        }
+        return allFields;
+    }
 }
diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/ast/FieldReference.java b/fe/fe-core/src/main/java/com/starrocks/sql/ast/FieldReference.java
@@ -48,6 +48,10 @@ public int getFieldIndex() {
         return fieldIndex;
     }
 
+    public TableName getTblName() {
+        return tblName;
+    }
+
     @Override
     public boolean equals(Object o) {
         if (this == o) {
diff --git a/fe/fe-core/src/test/java/com/starrocks/sql/analyzer/AnalyzeAggregateTest.java b/fe/fe-core/src/test/java/com/starrocks/sql/analyzer/AnalyzeAggregateTest.java
@@ -131,9 +131,11 @@ public void testDistinct() {
         analyzeSuccess("select distinct v1, v2 as v from t0 order by v");
         analyzeSuccess("select distinct abs(v1) as v from t0 order by v");
         analyzeFail("select distinct v1 from t0 order by v2",
-                "must be an aggregate expression or appear in GROUP BY clause");
+                "Column 'v2' cannot be resolved");
         analyzeFail("select distinct v1 as v from t0 order by v2",
-                "must be an aggregate expression or appear in GROUP BY clause");
+                "Column 'v2' cannot be resolved");
+        analyzeFail("select * from t0 order by max(v2)",
+                "column must appear in the GROUP BY clause or be used in an aggregate function.");
 
         analyzeSuccess("select distinct v1 as v from t0 having v = 1");
         analyzeFail("select distinct v1 as v from t0 having v2 = 2",
diff --git a/fe/fe-core/src/test/java/com/starrocks/sql/parser/ParserTest.java b/fe/fe-core/src/test/java/com/starrocks/sql/parser/ParserTest.java
@@ -404,7 +404,7 @@ void testWrongVariableName() {
         assertContains(res, "{'pipeline_dop', 'pipeline_sink_dop', 'pipeline_profile_level'}");
 
         res = VariableMgr.findSimilarVarNames("disable_joinreorder");
-        assertContains(res, "{'disable_join_reorder', 'disable_colocate_join', 'enable_predicate_reorder'}");
+        assertContains(res, "{'disable_join_reorder', 'disable_colocate_join'");
     }
 
     @Test
diff --git a/fe/fe-core/src/test/java/com/starrocks/sql/plan/OrderByTest.java b/fe/fe-core/src/test/java/com/starrocks/sql/plan/OrderByTest.java

Original file line number	Diff line number	Diff line change
`@@ -404,7 +404,7 @@ void testWrongVariableName() {`
`404`	`404`	`assertContains(res, "{'pipeline_dop', 'pipeline_sink_dop', 'pipeline_profile_level'}");`
`405`	`405`
`406`	`406`	`res = VariableMgr.findSimilarVarNames("disable_joinreorder");`
`407`		`- assertContains(res, "{'disable_join_reorder', 'disable_colocate_join', 'enable_predicate_reorder'}");`
	`407`	`+ assertContains(res, "{'disable_join_reorder', 'disable_colocate_join'");`
`408`	`408`	`}`
`409`	`409`
`410`	`410`	`@Test`