Skip to content

Commit 3065dd9

Browse files
dtenedorcloud-fan
authored andcommitted
[SPARK-49561][SQL] Add SQL pipe syntax for the PIVOT and UNPIVOT operators
### What changes were proposed in this pull request? This PR adds SQL pipe syntax support for the PIVOT and UNPIVOT operators. For example: ``` CREATE TEMPORARY VIEW courseSales AS SELECT * FROM VALUES ("dotNET", 2012, 10000), ("Java", 2012, 20000), ("dotNET", 2012, 5000), ("dotNET", 2013, 48000), ("Java", 2013, 30000) as courseSales(course, year, earnings); TABLE courseSales |> SELECT `year`, course, earnings |> PIVOT ( SUM(earnings) FOR course IN ('dotNET', 'Java') ); 2012 15000 20000 2013 48000 30000 ``` ### Why are the changes needed? The SQL pipe operator syntax will let users compose queries in a more flexible fashion. ### Does this PR introduce _any_ user-facing change? Yes, see above. ### How was this patch tested? This PR adds a few unit test cases, but mostly relies on golden file test coverage. I did this to make sure the answers are correct as this feature is implemented and also so we can look at the analyzer output plans to ensure they look right as well. ### Was this patch authored or co-authored using generative AI tooling? No Closes #48093 from dtenedor/pipe-pivot. Authored-by: Daniel Tenedorio <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent dbfa909 commit 3065dd9

File tree

6 files changed

+849
-9
lines changed

6 files changed

+849
-9
lines changed

sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1499,6 +1499,11 @@ version
14991499
operatorPipeRightSide
15001500
: selectClause
15011501
| whereClause
1502+
// The following two cases match the PIVOT or UNPIVOT clause, respectively.
1503+
// For each one, we add the other clause as an option in order to return high-quality error
1504+
// messages in the event that both are present (this is not allowed).
1505+
| pivotClause unpivotClause?
1506+
| unpivotClause pivotClause?
15021507
;
15031508

15041509
// When `SQL_standard_keyword_behavior=true`, there are 2 kinds of keywords in Spark SQL.

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5893,7 +5893,17 @@ class AstBuilder extends DataTypeAstBuilder
58935893
SubqueryAlias(SubqueryAlias.generateSubqueryName(), left)
58945894
}
58955895
withWhereClause(c, withSubqueryAlias)
5896-
}.get)
5896+
}.getOrElse(Option(ctx.pivotClause()).map { c =>
5897+
if (ctx.unpivotClause() != null) {
5898+
throw QueryParsingErrors.unpivotWithPivotInFromClauseNotAllowedError(ctx)
5899+
}
5900+
withPivot(c, left)
5901+
}.getOrElse(Option(ctx.unpivotClause()).map { c =>
5902+
if (ctx.pivotClause() != null) {
5903+
throw QueryParsingErrors.unpivotWithPivotInFromClauseNotAllowedError(ctx)
5904+
}
5905+
withUnpivot(c, left)
5906+
}.get)))
58975907
}
58985908

58995909
/**

sql/core/src/test/resources/sql-tests/analyzer-results/pipe-operators.sql.out

Lines changed: 352 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,74 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d
6262
+- LocalRelation [col1#x, col2#x]
6363

6464

65+
-- !query
66+
create temporary view courseSales as select * from values
67+
("dotNET", 2012, 10000),
68+
("Java", 2012, 20000),
69+
("dotNET", 2012, 5000),
70+
("dotNET", 2013, 48000),
71+
("Java", 2013, 30000)
72+
as courseSales(course, year, earnings)
73+
-- !query analysis
74+
CreateViewCommand `courseSales`, select * from values
75+
("dotNET", 2012, 10000),
76+
("Java", 2012, 20000),
77+
("dotNET", 2012, 5000),
78+
("dotNET", 2013, 48000),
79+
("Java", 2013, 30000)
80+
as courseSales(course, year, earnings), false, false, LocalTempView, UNSUPPORTED, true
81+
+- Project [course#x, year#x, earnings#x]
82+
+- SubqueryAlias courseSales
83+
+- LocalRelation [course#x, year#x, earnings#x]
84+
85+
86+
-- !query
87+
create temporary view courseEarnings as select * from values
88+
("dotNET", 15000, 48000, 22500),
89+
("Java", 20000, 30000, NULL)
90+
as courseEarnings(course, `2012`, `2013`, `2014`)
91+
-- !query analysis
92+
CreateViewCommand `courseEarnings`, select * from values
93+
("dotNET", 15000, 48000, 22500),
94+
("Java", 20000, 30000, NULL)
95+
as courseEarnings(course, `2012`, `2013`, `2014`), false, false, LocalTempView, UNSUPPORTED, true
96+
+- Project [course#x, 2012#x, 2013#x, 2014#x]
97+
+- SubqueryAlias courseEarnings
98+
+- LocalRelation [course#x, 2012#x, 2013#x, 2014#x]
99+
100+
101+
-- !query
102+
create temporary view courseEarningsAndSales as select * from values
103+
("dotNET", 15000, NULL, 48000, 1, 22500, 1),
104+
("Java", 20000, 1, 30000, 2, NULL, NULL)
105+
as courseEarningsAndSales(
106+
course, earnings2012, sales2012, earnings2013, sales2013, earnings2014, sales2014)
107+
-- !query analysis
108+
CreateViewCommand `courseEarningsAndSales`, select * from values
109+
("dotNET", 15000, NULL, 48000, 1, 22500, 1),
110+
("Java", 20000, 1, 30000, 2, NULL, NULL)
111+
as courseEarningsAndSales(
112+
course, earnings2012, sales2012, earnings2013, sales2013, earnings2014, sales2014), false, false, LocalTempView, UNSUPPORTED, true
113+
+- Project [course#x, earnings2012#x, sales2012#x, earnings2013#x, sales2013#x, earnings2014#x, sales2014#x]
114+
+- SubqueryAlias courseEarningsAndSales
115+
+- LocalRelation [course#x, earnings2012#x, sales2012#x, earnings2013#x, sales2013#x, earnings2014#x, sales2014#x]
116+
117+
118+
-- !query
119+
create temporary view yearsWithComplexTypes as select * from values
120+
(2012, array(1, 1), map('1', 1), struct(1, 'a')),
121+
(2013, array(2, 2), map('2', 2), struct(2, 'b'))
122+
as yearsWithComplexTypes(y, a, m, s)
123+
-- !query analysis
124+
CreateViewCommand `yearsWithComplexTypes`, select * from values
125+
(2012, array(1, 1), map('1', 1), struct(1, 'a')),
126+
(2013, array(2, 2), map('2', 2), struct(2, 'b'))
127+
as yearsWithComplexTypes(y, a, m, s), false, false, LocalTempView, UNSUPPORTED, true
128+
+- Project [y#x, a#x, m#x, s#x]
129+
+- SubqueryAlias yearsWithComplexTypes
130+
+- LocalRelation [y#x, a#x, m#x, s#x]
131+
132+
65133
-- !query
66134
table t
67135
|> select 1 as x
@@ -569,6 +637,290 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
569637
}
570638

571639

640+
-- !query
641+
table courseSales
642+
|> select `year`, course, earnings
643+
|> pivot (
644+
sum(earnings)
645+
for course in ('dotNET', 'Java')
646+
)
647+
-- !query analysis
648+
Project [year#x, __pivot_sum(coursesales.earnings) AS `sum(coursesales.earnings)`#x[0] AS dotNET#xL, __pivot_sum(coursesales.earnings) AS `sum(coursesales.earnings)`#x[1] AS Java#xL]
649+
+- Aggregate [year#x], [year#x, pivotfirst(course#x, sum(coursesales.earnings)#xL, dotNET, Java, 0, 0) AS __pivot_sum(coursesales.earnings) AS `sum(coursesales.earnings)`#x]
650+
+- Aggregate [year#x, course#x], [year#x, course#x, sum(earnings#x) AS sum(coursesales.earnings)#xL]
651+
+- Project [year#x, course#x, earnings#x]
652+
+- SubqueryAlias coursesales
653+
+- View (`courseSales`, [course#x, year#x, earnings#x])
654+
+- Project [cast(course#x as string) AS course#x, cast(year#x as int) AS year#x, cast(earnings#x as int) AS earnings#x]
655+
+- Project [course#x, year#x, earnings#x]
656+
+- SubqueryAlias courseSales
657+
+- LocalRelation [course#x, year#x, earnings#x]
658+
659+
660+
-- !query
661+
table courseSales
662+
|> select `year` as y, course as c, earnings as e
663+
|> pivot (
664+
sum(e) as s, avg(e) as a
665+
for y in (2012 as firstYear, 2013 as secondYear)
666+
)
667+
-- !query analysis
668+
Project [c#x, __pivot_sum(e) AS s AS `sum(e) AS s`#x[0] AS firstYear_s#xL, __pivot_avg(e) AS a AS `avg(e) AS a`#x[0] AS firstYear_a#x, __pivot_sum(e) AS s AS `sum(e) AS s`#x[1] AS secondYear_s#xL, __pivot_avg(e) AS a AS `avg(e) AS a`#x[1] AS secondYear_a#x]
669+
+- Aggregate [c#x], [c#x, pivotfirst(y#x, sum(e) AS s#xL, 2012, 2013, 0, 0) AS __pivot_sum(e) AS s AS `sum(e) AS s`#x, pivotfirst(y#x, avg(e) AS a#x, 2012, 2013, 0, 0) AS __pivot_avg(e) AS a AS `avg(e) AS a`#x]
670+
+- Aggregate [c#x, y#x], [c#x, y#x, sum(e#x) AS sum(e) AS s#xL, avg(e#x) AS avg(e) AS a#x]
671+
+- Project [pipeselect(year#x) AS y#x, pipeselect(course#x) AS c#x, pipeselect(earnings#x) AS e#x]
672+
+- SubqueryAlias coursesales
673+
+- View (`courseSales`, [course#x, year#x, earnings#x])
674+
+- Project [cast(course#x as string) AS course#x, cast(year#x as int) AS year#x, cast(earnings#x as int) AS earnings#x]
675+
+- Project [course#x, year#x, earnings#x]
676+
+- SubqueryAlias courseSales
677+
+- LocalRelation [course#x, year#x, earnings#x]
678+
679+
680+
-- !query
681+
select course, `year`, y, a
682+
from courseSales
683+
join yearsWithComplexTypes on `year` = y
684+
|> pivot (
685+
max(a)
686+
for (y, course) in ((2012, 'dotNET'), (2013, 'Java'))
687+
)
688+
-- !query analysis
689+
Aggregate [year#x], [year#x, max(if ((named_struct(y, y#x, course, course#x) <=> cast(named_struct(col1, 2012, col2, dotNET) as struct<y:int,course:string>))) a#x else cast(null as array<int>)) AS {2012, dotNET}#x, max(if ((named_struct(y, y#x, course, course#x) <=> cast(named_struct(col1, 2013, col2, Java) as struct<y:int,course:string>))) a#x else cast(null as array<int>)) AS {2013, Java}#x]
690+
+- Project [course#x, year#x, y#x, a#x]
691+
+- Join Inner, (year#x = y#x)
692+
:- SubqueryAlias coursesales
693+
: +- View (`courseSales`, [course#x, year#x, earnings#x])
694+
: +- Project [cast(course#x as string) AS course#x, cast(year#x as int) AS year#x, cast(earnings#x as int) AS earnings#x]
695+
: +- Project [course#x, year#x, earnings#x]
696+
: +- SubqueryAlias courseSales
697+
: +- LocalRelation [course#x, year#x, earnings#x]
698+
+- SubqueryAlias yearswithcomplextypes
699+
+- View (`yearsWithComplexTypes`, [y#x, a#x, m#x, s#x])
700+
+- Project [cast(y#x as int) AS y#x, cast(a#x as array<int>) AS a#x, cast(m#x as map<string,int>) AS m#x, cast(s#x as struct<col1:int,col2:string>) AS s#x]
701+
+- Project [y#x, a#x, m#x, s#x]
702+
+- SubqueryAlias yearsWithComplexTypes
703+
+- LocalRelation [y#x, a#x, m#x, s#x]
704+
705+
706+
-- !query
707+
select earnings, `year`, s
708+
from courseSales
709+
join yearsWithComplexTypes on `year` = y
710+
|> pivot (
711+
sum(earnings)
712+
for s in ((1, 'a'), (2, 'b'))
713+
)
714+
-- !query analysis
715+
Project [year#x, __pivot_sum(coursesales.earnings) AS `sum(coursesales.earnings)`#x[0] AS {1, a}#xL, __pivot_sum(coursesales.earnings) AS `sum(coursesales.earnings)`#x[1] AS {2, b}#xL]
716+
+- Aggregate [year#x], [year#x, pivotfirst(s#x, sum(coursesales.earnings)#xL, [1,a], [2,b], 0, 0) AS __pivot_sum(coursesales.earnings) AS `sum(coursesales.earnings)`#x]
717+
+- Aggregate [year#x, s#x], [year#x, s#x, sum(earnings#x) AS sum(coursesales.earnings)#xL]
718+
+- Project [earnings#x, year#x, s#x]
719+
+- Join Inner, (year#x = y#x)
720+
:- SubqueryAlias coursesales
721+
: +- View (`courseSales`, [course#x, year#x, earnings#x])
722+
: +- Project [cast(course#x as string) AS course#x, cast(year#x as int) AS year#x, cast(earnings#x as int) AS earnings#x]
723+
: +- Project [course#x, year#x, earnings#x]
724+
: +- SubqueryAlias courseSales
725+
: +- LocalRelation [course#x, year#x, earnings#x]
726+
+- SubqueryAlias yearswithcomplextypes
727+
+- View (`yearsWithComplexTypes`, [y#x, a#x, m#x, s#x])
728+
+- Project [cast(y#x as int) AS y#x, cast(a#x as array<int>) AS a#x, cast(m#x as map<string,int>) AS m#x, cast(s#x as struct<col1:int,col2:string>) AS s#x]
729+
+- Project [y#x, a#x, m#x, s#x]
730+
+- SubqueryAlias yearsWithComplexTypes
731+
+- LocalRelation [y#x, a#x, m#x, s#x]
732+
733+
734+
-- !query
735+
table courseEarnings
736+
|> unpivot (
737+
earningsYear for `year` in (`2012`, `2013`, `2014`)
738+
)
739+
-- !query analysis
740+
Filter isnotnull(coalesce(earningsYear#x))
741+
+- Expand [[course#x, 2012, 2012#x], [course#x, 2013, 2013#x], [course#x, 2014, 2014#x]], [course#x, year#x, earningsYear#x]
742+
+- SubqueryAlias courseearnings
743+
+- View (`courseEarnings`, [course#x, 2012#x, 2013#x, 2014#x])
744+
+- Project [cast(course#x as string) AS course#x, cast(2012#x as int) AS 2012#x, cast(2013#x as int) AS 2013#x, cast(2014#x as int) AS 2014#x]
745+
+- Project [course#x, 2012#x, 2013#x, 2014#x]
746+
+- SubqueryAlias courseEarnings
747+
+- LocalRelation [course#x, 2012#x, 2013#x, 2014#x]
748+
749+
750+
-- !query
751+
table courseEarnings
752+
|> unpivot include nulls (
753+
earningsYear for `year` in (`2012`, `2013`, `2014`)
754+
)
755+
-- !query analysis
756+
Expand [[course#x, 2012, 2012#x], [course#x, 2013, 2013#x], [course#x, 2014, 2014#x]], [course#x, year#x, earningsYear#x]
757+
+- SubqueryAlias courseearnings
758+
+- View (`courseEarnings`, [course#x, 2012#x, 2013#x, 2014#x])
759+
+- Project [cast(course#x as string) AS course#x, cast(2012#x as int) AS 2012#x, cast(2013#x as int) AS 2013#x, cast(2014#x as int) AS 2014#x]
760+
+- Project [course#x, 2012#x, 2013#x, 2014#x]
761+
+- SubqueryAlias courseEarnings
762+
+- LocalRelation [course#x, 2012#x, 2013#x, 2014#x]
763+
764+
765+
-- !query
766+
table courseEarningsAndSales
767+
|> unpivot include nulls (
768+
(earnings, sales) for `year` in (
769+
(earnings2012, sales2012) as `2012`,
770+
(earnings2013, sales2013) as `2013`,
771+
(earnings2014, sales2014) as `2014`)
772+
)
773+
-- !query analysis
774+
Expand [[course#x, 2012, earnings2012#x, sales2012#x], [course#x, 2013, earnings2013#x, sales2013#x], [course#x, 2014, earnings2014#x, sales2014#x]], [course#x, year#x, earnings#x, sales#x]
775+
+- SubqueryAlias courseearningsandsales
776+
+- View (`courseEarningsAndSales`, [course#x, earnings2012#x, sales2012#x, earnings2013#x, sales2013#x, earnings2014#x, sales2014#x])
777+
+- Project [cast(course#x as string) AS course#x, cast(earnings2012#x as int) AS earnings2012#x, cast(sales2012#x as int) AS sales2012#x, cast(earnings2013#x as int) AS earnings2013#x, cast(sales2013#x as int) AS sales2013#x, cast(earnings2014#x as int) AS earnings2014#x, cast(sales2014#x as int) AS sales2014#x]
778+
+- Project [course#x, earnings2012#x, sales2012#x, earnings2013#x, sales2013#x, earnings2014#x, sales2014#x]
779+
+- SubqueryAlias courseEarningsAndSales
780+
+- LocalRelation [course#x, earnings2012#x, sales2012#x, earnings2013#x, sales2013#x, earnings2014#x, sales2014#x]
781+
782+
783+
-- !query
784+
table courseSales
785+
|> select course, earnings
786+
|> pivot (
787+
sum(earnings)
788+
for `year` in (2012, 2013)
789+
)
790+
-- !query analysis
791+
org.apache.spark.sql.catalyst.ExtendedAnalysisException
792+
{
793+
"errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION",
794+
"sqlState" : "42703",
795+
"messageParameters" : {
796+
"objectName" : "`year`",
797+
"proposal" : "`course`, `earnings`"
798+
},
799+
"queryContext" : [ {
800+
"objectType" : "",
801+
"objectName" : "",
802+
"startIndex" : 49,
803+
"stopIndex" : 111,
804+
"fragment" : "pivot (\n sum(earnings)\n for `year` in (2012, 2013)\n )"
805+
} ]
806+
}
807+
808+
809+
-- !query
810+
table courseSales
811+
|> pivot (
812+
sum(earnings)
813+
for `year` in (course, 2013)
814+
)
815+
-- !query analysis
816+
org.apache.spark.sql.AnalysisException
817+
{
818+
"errorClass" : "NON_LITERAL_PIVOT_VALUES",
819+
"sqlState" : "42K08",
820+
"messageParameters" : {
821+
"expression" : "\"course\""
822+
}
823+
}
824+
825+
826+
-- !query
827+
table courseSales
828+
|> select course, earnings
829+
|> pivot (
830+
sum(earnings)
831+
for `year` in (2012, 2013)
832+
)
833+
unpivot (
834+
earningsYear for `year` in (`2012`, `2013`, `2014`)
835+
)
836+
-- !query analysis
837+
org.apache.spark.sql.catalyst.parser.ParseException
838+
{
839+
"errorClass" : "NOT_ALLOWED_IN_FROM.UNPIVOT_WITH_PIVOT",
840+
"sqlState" : "42601",
841+
"queryContext" : [ {
842+
"objectType" : "",
843+
"objectName" : "",
844+
"startIndex" : 1,
845+
"stopIndex" : 186,
846+
"fragment" : "table courseSales\n|> select course, earnings\n|> pivot (\n sum(earnings)\n for `year` in (2012, 2013)\n )\n unpivot (\n earningsYear for `year` in (`2012`, `2013`, `2014`)\n )"
847+
} ]
848+
}
849+
850+
851+
-- !query
852+
table courseSales
853+
|> select course, earnings
854+
|> unpivot (
855+
earningsYear for `year` in (`2012`, `2013`, `2014`)
856+
)
857+
pivot (
858+
sum(earnings)
859+
for `year` in (2012, 2013)
860+
)
861+
-- !query analysis
862+
org.apache.spark.sql.catalyst.parser.ParseException
863+
{
864+
"errorClass" : "NOT_ALLOWED_IN_FROM.UNPIVOT_WITH_PIVOT",
865+
"sqlState" : "42601",
866+
"queryContext" : [ {
867+
"objectType" : "",
868+
"objectName" : "",
869+
"startIndex" : 1,
870+
"stopIndex" : 186,
871+
"fragment" : "table courseSales\n|> select course, earnings\n|> unpivot (\n earningsYear for `year` in (`2012`, `2013`, `2014`)\n )\n pivot (\n sum(earnings)\n for `year` in (2012, 2013)\n )"
872+
} ]
873+
}
874+
875+
876+
-- !query
877+
table courseSales
878+
|> select course, earnings
879+
|> pivot (
880+
sum(earnings)
881+
for `year` in (2012, 2013)
882+
)
883+
pivot (
884+
sum(earnings)
885+
for `year` in (2012, 2013)
886+
)
887+
-- !query analysis
888+
org.apache.spark.sql.catalyst.parser.ParseException
889+
{
890+
"errorClass" : "PARSE_SYNTAX_ERROR",
891+
"sqlState" : "42601",
892+
"messageParameters" : {
893+
"error" : "'pivot'",
894+
"hint" : ""
895+
}
896+
}
897+
898+
899+
-- !query
900+
table courseSales
901+
|> select course, earnings
902+
|> unpivot (
903+
earningsYear for `year` in (`2012`, `2013`, `2014`)
904+
)
905+
unpivot (
906+
earningsYear for `year` in (`2012`, `2013`, `2014`)
907+
)
908+
pivot (
909+
sum(earnings)
910+
for `year` in (2012, 2013)
911+
)
912+
-- !query analysis
913+
org.apache.spark.sql.catalyst.parser.ParseException
914+
{
915+
"errorClass" : "PARSE_SYNTAX_ERROR",
916+
"sqlState" : "42601",
917+
"messageParameters" : {
918+
"error" : "'unpivot'",
919+
"hint" : ""
920+
}
921+
}
922+
923+
572924
-- !query
573925
drop table t
574926
-- !query analysis

0 commit comments

Comments
 (0)