@@ -40,6 +40,36 @@ trait Row {
40
40
}
41
41
}
42
42
43
/**
  * SchemaTraverser aids in the traversal of the given SchemaType.
  * In some cases (eg avro), it is more performant to create the
  * top-level schema once and then traverse it top-to-bottom, rather
  * than recreating at each node.
  *
  * This helper trait allows the Row.to function to traverse SchemaType
  * without leaking details of the SchemaType structure.
  */
trait SchemaTraverser[SchemaType] {

  /** The SchemaType node this traverser currently points at. */
  def currentNode: SchemaType

  /** Returns the equivalent SchemaType representation of the given field. */
  def getField(field: StructField): SchemaTraverser[SchemaType]

  /** Returns the inner type of the current collection field type.
    * Throws if the current type is not a collection.
    */
  def getCollectionType: SchemaTraverser[SchemaType]

  /** Returns the key type of the current map field type.
    * Throws if the current type is not a map.
    */
  def getMapKeyType: SchemaTraverser[SchemaType]

  /** Returns the value type of the current map field type.
    * Throws if the current type is not a map.
    */
  def getMapValueType: SchemaTraverser[SchemaType]

}
72
+
43
73
object Row {
44
74
// recursively traverse a logical struct, and convert it chronon's row type
45
75
def from [CompositeType , BinaryType , ArrayType , StringType ](
@@ -95,49 +125,71 @@ object Row {
95
125
}
96
126
97
127
// recursively traverse a chronon dataType value, and convert it to an external type
98
- def to [StructType , BinaryType , ListType , MapType ](value : Any ,
99
- dataType : DataType ,
100
- composer : (Iterator [Any ], DataType ) => StructType ,
101
- binarizer : Array [Byte ] => BinaryType ,
102
- collector : (Iterator [Any ], Int ) => ListType ,
103
- mapper : (util.Map [Any , Any ] => MapType ),
104
- extraneousRecord : Any => Array [Any ] = null ): Any = {
128
+ def to [StructType , BinaryType , ListType , MapType , OutputSchema ](
129
+ value : Any ,
130
+ dataType : DataType ,
131
+ composer : (Iterator [Any ], DataType , Option [OutputSchema ]) => StructType ,
132
+ binarizer : Array [Byte ] => BinaryType ,
133
+ collector : (Iterator [Any ], Int ) => ListType ,
134
+ mapper : (util.Map [Any , Any ] => MapType ),
135
+ extraneousRecord : Any => Array [Any ] = null ,
136
+ schemaTraverser : Option [SchemaTraverser [OutputSchema ]] = None ): Any = {
105
137
106
138
if (value == null ) return null
107
- def edit (value : Any , dataType : DataType ): Any =
108
- to(value, dataType, composer, binarizer, collector, mapper, extraneousRecord)
139
+
140
+ def getFieldSchema (f : StructField ) = schemaTraverser.map(_.getField(f))
141
+
142
+ def edit (value : Any , dataType : DataType , subTreeTraverser : Option [SchemaTraverser [OutputSchema ]]): Any =
143
+ to(value, dataType, composer, binarizer, collector, mapper, extraneousRecord, subTreeTraverser)
144
+
109
145
dataType match {
110
146
case StructType (_, fields) =>
111
147
value match {
112
148
case arr : Array [Any ] =>
113
- composer(arr.iterator.zipWithIndex.map { case (value, idx) => edit(value, fields(idx).fieldType) },
114
- dataType)
149
+ composer(
150
+ arr.iterator.zipWithIndex.map {
151
+ case (value, idx) => edit(value, fields(idx).fieldType, getFieldSchema(fields(idx)))
152
+ },
153
+ dataType,
154
+ schemaTraverser.map(_.currentNode)
155
+ )
115
156
case list : util.ArrayList [Any ] =>
116
- composer(list
117
- .iterator()
118
- .asScala
119
- .zipWithIndex
120
- .map { case (value, idx) => edit(value, fields(idx).fieldType) },
121
- dataType)
122
- case list : List [Any ] =>
123
- composer(list.iterator.zipWithIndex
124
- .map { case (value, idx) => edit(value, fields(idx).fieldType) },
125
- dataType)
157
+ composer(
158
+ list
159
+ .iterator()
160
+ .asScala
161
+ .zipWithIndex
162
+ .map { case (value, idx) => edit(value, fields(idx).fieldType, getFieldSchema(fields(idx))) },
163
+ dataType,
164
+ schemaTraverser.map(_.currentNode)
165
+ )
126
166
case value : Any =>
127
167
assert(extraneousRecord != null , s " No handler for $value of class ${value.getClass}" )
128
- composer(extraneousRecord(value).iterator.zipWithIndex.map {
129
- case (value, idx) => edit(value, fields(idx).fieldType)
130
- },
131
- dataType)
168
+ composer(
169
+ extraneousRecord(value).iterator.zipWithIndex.map {
170
+ case (value, idx) => edit(value, fields(idx).fieldType, getFieldSchema(fields(idx)))
171
+ },
172
+ dataType,
173
+ schemaTraverser.map(_.currentNode)
174
+ )
132
175
}
133
176
case ListType (elemType) =>
134
177
value match {
135
178
case list : util.ArrayList [Any ] =>
136
- collector(list.iterator().asScala.map(edit(_, elemType)), list.size())
179
+ collector(
180
+ list.iterator().asScala.map(edit(_, elemType, schemaTraverser.map(_.getCollectionType))),
181
+ list.size()
182
+ )
137
183
case arr : Array [_] => // avro only recognizes arrayList for its ArrayType/ListType
138
- collector(arr.iterator.map(edit(_, elemType)), arr.length)
184
+ collector(
185
+ arr.iterator.map(edit(_, elemType, schemaTraverser.map(_.getCollectionType))),
186
+ arr.length
187
+ )
139
188
case arr : mutable.WrappedArray [Any ] => // handles the wrapped array type from transform function in spark sql
140
- collector(arr.iterator.map(edit(_, elemType)), arr.length)
189
+ collector(
190
+ arr.iterator.map(edit(_, elemType, schemaTraverser.map(_.getCollectionType))),
191
+ arr.length
192
+ )
141
193
}
142
194
case MapType (keyType, valueType) =>
143
195
value match {
@@ -147,12 +199,38 @@ object Row {
147
199
.entrySet()
148
200
.iterator()
149
201
.asScala
150
- .foreach { entry => newMap.put(edit(entry.getKey, keyType), edit(entry.getValue, valueType)) }
202
+ .foreach { entry =>
203
+ newMap.put(
204
+ edit(
205
+ entry.getKey,
206
+ keyType,
207
+ schemaTraverser.map(_.getMapKeyType)
208
+ ),
209
+ edit(
210
+ entry.getValue,
211
+ valueType,
212
+ schemaTraverser.map(_.getMapValueType)
213
+ )
214
+ )
215
+ }
151
216
mapper(newMap)
152
217
case map : collection.immutable.Map [Any , Any ] =>
153
218
val newMap = new util.HashMap [Any , Any ](map.size)
154
219
map
155
- .foreach { entry => newMap.put(edit(entry._1, keyType), edit(entry._2, valueType)) }
220
+ .foreach { entry =>
221
+ newMap.put(
222
+ edit(
223
+ entry._1,
224
+ keyType,
225
+ schemaTraverser.map(_.getMapKeyType)
226
+ ),
227
+ edit(
228
+ entry._2,
229
+ valueType,
230
+ schemaTraverser.map(_.getMapValueType)
231
+ )
232
+ )
233
+ }
156
234
mapper(newMap)
157
235
}
158
236
case BinaryType => binarizer(value.asInstanceOf [Array [Byte ]])
0 commit comments