Skip to content

[BugFix] fix wrong order by scope for distinct query #37910

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Dec 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions fe/fe-core/src/main/java/com/starrocks/qe/SessionVariable.java
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,8 @@ public class SessionVariable implements Serializable, Writable, Cloneable {

public static final String ENABLE_PLAN_SERIALIZE_CONCURRENTLY = "enable_plan_serialize_concurrently";

public static final String ENABLE_STRICT_ORDER_BY = "enable_strict_order_by";

// Flag to control whether to proxy follower's query statement to leader/follower.
public enum FollowerQueryForwardMode {
DEFAULT, // proxy queries by the follower's replay progress (default)
Expand Down Expand Up @@ -1597,6 +1599,9 @@ public boolean isCboPredicateSubfieldPath() {
@VarAttr(name = FOLLOWER_QUERY_FORWARD_MODE, flag = VariableMgr.INVISIBLE | VariableMgr.DISABLE_FORWARD_TO_LEADER)
private String followerForwardMode = "";

@VarAttr(name = ENABLE_STRICT_ORDER_BY)
private boolean enableStrictOrderBy = true;

public void setFollowerQueryForwardMode(String mode) {
this.followerForwardMode = mode;
}
Expand Down Expand Up @@ -3006,6 +3011,14 @@ public void setSkewJoinRandRange(int skewJoinRandRange) {
this.skewJoinRandRange = skewJoinRandRange;
}

public boolean isEnableStrictOrderBy() {
return enableStrictOrderBy;
}

public void setEnableStrictOrderBy(boolean enableStrictOrderBy) {
this.enableStrictOrderBy = enableStrictOrderBy;
}

// Serialize to thrift object
// used for rest api
public TQueryOptions toThrift() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
import com.starrocks.sql.ast.ArrayExpr;
import com.starrocks.sql.ast.AstVisitor;
import com.starrocks.sql.ast.DictionaryGetExpr;
import com.starrocks.sql.ast.FieldReference;
import com.starrocks.sql.ast.LambdaFunctionExpr;
import com.starrocks.sql.ast.QueryStatement;

Expand Down Expand Up @@ -117,9 +118,16 @@ public Boolean visit(ParseNode expr) {
return super.visit(expr);
}

@Override
public Boolean visitFieldReference(FieldReference node, Void context) {
String colInfo = node.getTblName() == null ? "column" : "column of " + node.getTblName().toString();
throw new SemanticException(colInfo + " must appear in the GROUP BY clause or be used in an aggregate function",
node.getPos());
}

@Override
public Boolean visitExpression(Expr node, Void context) {
throw new SemanticException(PARSER_ERROR_MSG.unsupportedExprWithInfo(node.toSql(), "GROUP BY"),
throw new SemanticException(node.toSql() + " must appear in the GROUP BY clause or be used in an aggregate function",
node.getPos());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

package com.starrocks.sql.analyzer;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
Expand Down Expand Up @@ -86,10 +87,11 @@ public void analyze(AnalyzeState analyzeState,
analyzeHaving(havingClause, analyzeState, sourceScope, outputScope, outputExpressions);

// Construct sourceAndOutputScope with sourceScope and outputScope
Scope sourceAndOutputScope = computeAndAssignOrderScope(analyzeState, sourceScope, outputScope);
Scope sourceAndOutputScope = computeAndAssignOrderScope(analyzeState, sourceScope, outputScope,
selectList.isDistinct());

List<OrderByElement> orderByElements =
analyzeOrderBy(sortClause, analyzeState, sourceAndOutputScope, outputExpressions);
analyzeOrderBy(sortClause, analyzeState, sourceAndOutputScope, outputExpressions, selectList.isDistinct());
List<Expr> orderByExpressions =
orderByElements.stream().map(OrderByElement::getExpr).collect(Collectors.toList());

Expand Down Expand Up @@ -183,7 +185,7 @@ public void analyze(AnalyzeState analyzeState,
.collect(Collectors.toList());

Scope sourceScopeForOrder = new Scope(RelationId.anonymous(), new RelationFields(sourceForOrderFields));
computeAndAssignOrderScope(analyzeState, sourceScopeForOrder, outputScope);
computeAndAssignOrderScope(analyzeState, sourceScopeForOrder, outputScope, selectList.isDistinct());
analyzeState.setOrderSourceExpressions(orderSourceExpressions);
}

Expand Down Expand Up @@ -311,7 +313,8 @@ private List<Expr> analyzeSelect(SelectList selectList, Relation fromRelation, b

private List<OrderByElement> analyzeOrderBy(List<OrderByElement> orderByElements, AnalyzeState analyzeState,
Scope orderByScope,
List<Expr> outputExpressions) {
List<Expr> outputExpressions,
boolean isDistinct) {
if (orderByElements == null) {
analyzeState.setOrderBy(Collections.emptyList());
return Collections.emptyList();
Expand All @@ -327,20 +330,33 @@ private List<OrderByElement> analyzeOrderBy(List<OrderByElement> orderByElements
if (ordinal < 1 || ordinal > outputExpressions.size()) {
throw new SemanticException("ORDER BY position %s is not in select list", ordinal);
}
// index can ensure no ambiguous, we don't need to re-analyze this output expression
expression = outputExpressions.get((int) ordinal - 1);
}

if (expression instanceof FieldReference) {
// If the expression of order by is a FieldReference, it means that the type of sql is
} else if (expression instanceof FieldReference) {
// If the expression of order by is a FieldReference, and it's not a distinct select,
// it means that the type of sql is
// "select * from t order by 1", then this FieldReference cannot be parsed in OrderByScope,
// but should be parsed in sourceScope
analyzeExpression(expression, analyzeState, orderByScope.getParent());
if (isDistinct) {
analyzeExpression(expression, analyzeState, orderByScope);
} else {
analyzeExpression(expression, analyzeState, orderByScope.getParent());
}
} else {
ExpressionAnalyzer expressionAnalyzer = new ExpressionAnalyzer(session);
expressionAnalyzer.analyzeWithoutUpdateState(expression, analyzeState, orderByScope);
List<Expr> aggregations = Lists.newArrayList();
expression.collectAll(e -> e.isAggregate(), aggregations);
aggregations.forEach(e -> analyzeExpression(e, analyzeState, orderByScope.getParent()));
if (isDistinct && !aggregations.isEmpty()) {
throw new SemanticException("for SELECT DISTINCT, ORDER BY expressions must appear in select list",
expression.getPos());
}

if (!aggregations.isEmpty()) {
// use parent scope to analyze agg func firstly
Preconditions.checkState(orderByScope.getParent() != null, "parent scope not be set");
aggregations.forEach(e -> analyzeExpression(e, analyzeState, orderByScope.getParent()));
}
analyzeExpression(expression, analyzeState, orderByScope);
}

Expand Down Expand Up @@ -655,23 +671,24 @@ public Expr visitSlot(SlotRef slotRef, Void context) {
}
}

private Scope computeAndAssignOrderScope(AnalyzeState analyzeState, Scope sourceScope, Scope outputScope) {
// The Scope used by order by allows parsing of the same column,
// such as 'select v1 as v, v1 as v from t0 order by v'
// but normal parsing does not allow it. So add a de-duplication operation here.
private Scope computeAndAssignOrderScope(AnalyzeState analyzeState, Scope sourceScope, Scope outputScope,
boolean isDistinct) {

List<Field> allFields = Lists.newArrayList();
// order by can only "see" fields from distinct output
if (isDistinct) {
allFields = removeDuplicateField(outputScope.getRelationFields().getAllFields());
Scope orderScope = new Scope(outputScope.getRelationId(), new RelationFields(allFields));
analyzeState.setOrderScope(orderScope);
return orderScope;
}

List<Field> allFields = new ArrayList<>();
for (int i = 0; i < analyzeState.getOutputExprInOrderByScope().size(); ++i) {
Field field = outputScope.getRelationFields()
.getFieldByIndex(analyzeState.getOutputExprInOrderByScope().get(i));
if (field.getName() != null && field.getOriginExpression() != null &&
allFields.stream().anyMatch(f -> f.getOriginExpression() != null
&& f.getName() != null && field.getName().equals(f.getName())
&& field.getOriginExpression().equals(f.getOriginExpression()))) {
continue;
}
allFields.add(field);
}
allFields = removeDuplicateField(allFields);

Scope orderScope = new Scope(outputScope.getRelationId(), new RelationFields(allFields));

Expand All @@ -688,4 +705,29 @@ private Scope computeAndAssignOrderScope(AnalyzeState analyzeState, Scope source
private void analyzeExpression(Expr expr, AnalyzeState analyzeState, Scope scope) {
ExpressionAnalyzer.analyzeExpression(expr, analyzeState, scope, session);
}


// The Scope used by order by allows parsing of the same column,
// such as 'select v1 as v, v1 as v from t0 order by v'
// but normal parsing does not allow it. So add a de-duplication operation here.
private List<Field> removeDuplicateField(List<Field> originalFields) {
List<Field> allFields = Lists.newArrayList();
for (Field field : originalFields) {
if (session.getSessionVariable().isEnableStrictOrderBy()) {
if (field.getName() != null && field.getOriginExpression() != null &&
allFields.stream().anyMatch(f -> f.getOriginExpression() != null
&& f.getName() != null && field.getName().equals(f.getName())
&& field.getOriginExpression().equals(f.getOriginExpression()))) {
continue;
}
} else {
if (field.getName() != null &&
allFields.stream().anyMatch(f -> f.getName() != null && field.getName().equals(f.getName()))) {
continue;
}
}
allFields.add(field);
}
return allFields;
}
}
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The most risky bug in this code is:
Inconsistent behavior when parsing ORDER BY expression for queries with SELECT DISTINCT.

You can modify the code like this:

// In analyzeOrderBy method, adjust the part where aggregations are checked within a SELECT DISTINCT context

if (isDistinct && !aggregations.isEmpty()) {
    // Here we need to not only throw an exception if there are aggregations,
    // but also ensure that columns in the ORDER BY clause are present in the SELECT list.
    boolean orderByNotInSelectList = !outputExpressions.containsAll(expression.toExprList());
    if(orderByNotInSelectList) {
        throw new SemanticException("for SELECT DISTINCT, ORDER BY expressions must appear in the select list",
                expression.getPos());
    }
}

// Later in the same method, we also need to ensure that the ORDER BY fields are consistently validated
// against the correct scope when SELECT DISTINCT is used.

if (!isDistinct) {
    analyzeExpression(expression, analyzeState, orderByScope);
} else {
    analyzeExpression(expression, analyzeState, orderByScope.getParent());
}

This addresses the issue where the ORDER BY clause might reference expressions not in the SELECT list when using DISTINCT, which should result in an error as implemented in this proposed fix. Additionally, when SELECT DISTINCT is true, the ORDER BY expressions would correctly be analyzed against the parent scope rather than the possibly incorrect orderByScope.

Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ public int getFieldIndex() {
return fieldIndex;
}

public TableName getTblName() {
return tblName;
}

@Override
public boolean equals(Object o) {
if (this == o) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,11 @@ public void testDistinct() {
analyzeSuccess("select distinct v1, v2 as v from t0 order by v");
analyzeSuccess("select distinct abs(v1) as v from t0 order by v");
analyzeFail("select distinct v1 from t0 order by v2",
"must be an aggregate expression or appear in GROUP BY clause");
"Column 'v2' cannot be resolved");
analyzeFail("select distinct v1 as v from t0 order by v2",
"must be an aggregate expression or appear in GROUP BY clause");
"Column 'v2' cannot be resolved");
analyzeFail("select * from t0 order by max(v2)",
"column must appear in the GROUP BY clause or be used in an aggregate function.");

analyzeSuccess("select distinct v1 as v from t0 having v = 1");
analyzeFail("select distinct v1 as v from t0 having v2 = 2",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ void testWrongVariableName() {
assertContains(res, "{'pipeline_dop', 'pipeline_sink_dop', 'pipeline_profile_level'}");

res = VariableMgr.findSimilarVarNames("disable_joinreorder");
assertContains(res, "{'disable_join_reorder', 'disable_colocate_join', 'enable_predicate_reorder'}");
assertContains(res, "{'disable_join_reorder', 'disable_colocate_join'");
}

@Test
Expand Down
Loading