[ https://issues.apache.org/jira/browse/DRILL-5323?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15972111#comment-15972111 ]
ASF GitHub Bot commented on DRILL-5323: --------------------------------------- Github user paul-rogers commented on a diff in the pull request: https://github.com/apache/drill/pull/785#discussion_r111866420 --- Diff: exec/java-exec/src/test/java/org/apache/drill/test/rowSet/HyperRowSetImpl.java --- @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.drill.test.rowSet; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.drill.common.types.TypeProtos.MinorType; +import org.apache.drill.exec.memory.BufferAllocator; +import org.apache.drill.exec.record.BatchSchema.SelectionVectorMode; +import org.apache.drill.exec.record.HyperVectorWrapper; +import org.apache.drill.exec.record.VectorContainer; +import org.apache.drill.exec.record.VectorWrapper; +import org.apache.drill.exec.record.selection.SelectionVector4; +import org.apache.drill.exec.vector.ValueVector; +import org.apache.drill.exec.vector.accessor.AccessorUtilities; +import org.apache.drill.exec.vector.complex.AbstractMapVector; +import org.apache.drill.test.rowSet.AbstractRowSetAccessor.BoundedRowIndex; +import org.apache.drill.test.rowSet.RowSet.HyperRowSet; +import org.apache.drill.test.rowSet.RowSetSchema.LogicalColumn; +import org.apache.drill.test.rowSet.RowSetSchema.PhysicalSchema; + +public class HyperRowSetImpl extends AbstractRowSet implements HyperRowSet { + + public static class HyperRowIndex extends BoundedRowIndex { + + private final SelectionVector4 sv4; + + public HyperRowIndex(SelectionVector4 sv4) { + super(sv4.getCount()); + this.sv4 = sv4; + } + + @Override + public int index() { + return AccessorUtilities.sv4Index(sv4.get(rowIndex)); + } + + @Override + public int batch( ) { + return AccessorUtilities.sv4Batch(sv4.get(rowIndex)); + } + } + + /** + * Build a hyper row set by restructuring a hyper vector bundle into a uniform + * shape. 
Consider this schema: <pre><code> + * { a: 10, b: { c: 20, d: { e: 30 } } }</code></pre> + * <p> + * The hyper container, with two batches, has this structure: + * <table border="1"> + * <tr><th>Batch</th><th>a</th><th>b</th></tr> + * <tr><td>0</td><td>Int vector</td><td>Map Vector(Int vector, Map Vector(Int vector))</td></tr> + * <tr><td>1</td><td>Int vector</td><td>Map Vector(Int vector, Map Vector(Int vector))</td></tr> + * </table> + * <p> + * The above table shows that top-level scalar vectors (such as the Int Vector for column + * a) appear "end-to-end" as a hyper-vector. Maps also appear end-to-end. But the + * contents of the map (column c) do not appear end-to-end. Instead, they appear as + * contents in the map vector. To get to c, one indexes into the map vector, steps inside + * the map to find c and indexes to the right row. + * <p> + * Similarly, the maps for d do not appear end-to-end; one must step to the right batch + * in b, then step to d. + * <p> + * Finally, to get to e, one must step + * into the hyper vector for b, then step to the proper batch, step to d, step to e, + * and finally step to the row within e. This is a very complex, costly indexing scheme + * that differs depending on map nesting depth. + * <p> + * To simplify access, this class restructures the maps to flatten the scalar vectors + * into end-to-end hyper vectors. For example, for the above: + * <p> + * <table border="1"> + * <tr><th>Batch</th><th>a</th><th>c</th><th>e</th></tr> + * <tr><td>0</td><td>Int vector</td><td>Int vector</td><td>Int vector</td></tr> + * <tr><td>1</td><td>Int vector</td><td>Int vector</td><td>Int vector</td></tr> + * </table> + * + * The maps are still available as hyper vectors, but separated into map fields. + * (Scalar access no longer needs to access the maps.) The result is a uniform + * addressing scheme for both top-level and nested vectors. 
+ */ + + public static class HyperVectorBuilder { + + protected final HyperVectorWrapper<?> valueVectors[]; + protected final HyperVectorWrapper<AbstractMapVector> mapVectors[]; + private final List<ValueVector> nestedScalars[]; + private int vectorIndex; + private int mapIndex; + private final PhysicalSchema physicalSchema; + + @SuppressWarnings("unchecked") + public HyperVectorBuilder(RowSetSchema schema) { + physicalSchema = schema.physical(); + valueVectors = new HyperVectorWrapper<?>[schema.access().count()]; + if (schema.access().mapCount() == 0) { + mapVectors = null; + nestedScalars = null; + } else { + mapVectors = (HyperVectorWrapper<AbstractMapVector>[]) + new HyperVectorWrapper<?>[schema.access().mapCount()]; + nestedScalars = new ArrayList[schema.access().count()]; + } + } + + @SuppressWarnings("unchecked") + public HyperVectorWrapper<ValueVector>[] mapContainer(VectorContainer container) { + int i = 0; + for (VectorWrapper<?> w : container) { + HyperVectorWrapper<?> hvw = (HyperVectorWrapper<?>) w; + if (w.getField().getType().getMinorType() == MinorType.MAP) { + HyperVectorWrapper<AbstractMapVector> mw = (HyperVectorWrapper<AbstractMapVector>) hvw; + mapVectors[mapIndex++] = mw; + buildHyperMap(physicalSchema.column(i).mapSchema(), mw); --- End diff -- Also, we assume that the caller either knows the number of columns (because the caller created the schema), or the caller checked the column count. Accessing a column out of range will throw an exception somewhere; there did not seem to be a burning need to add an extra check on top of those provided "naturally." 
> Provide test tools to create, populate and compare row sets > ----------------------------------------------------------- > > Key: DRILL-5323 > URL: https://issues.apache.org/jira/browse/DRILL-5323 > Project: Apache Drill > Issue Type: Sub-task > Components: Tools, Build & Test > Affects Versions: 1.11.0 > Reporter: Paul Rogers > Assignee: Paul Rogers > Fix For: 1.11.0 > > > Operators work with individual row sets. A row set is a collection of records > stored as column vectors. (Drill uses various terms for this concept. A > record batch is a row set with an operator implementation wrapped around it. > A vector container is a row set, but with much functionality left as an > exercise for the developer. And so on.) > To simplify tests, we need a {{TestRowSet}} concept that wraps a > {{VectorContainer}} and provides easy ways to: > * Define a schema for the row set. > * Create a set of vectors that implement the schema. > * Populate the row set with test data via code. > * Add an SV2 to the row set. > * Pass the row set to operator components (such as generated code blocks.) > * Compare the results of the operation with an expected result set. > * Dispose of the underlying direct memory when work is done. -- This message was sent by Atlassian JIRA (v6.3.15#6346)