[GitHub] spark pull request: [SPARK-12635][SQL] Add ColumnarBatch, an in me...

davies Wed, 06 Jan 2016 22:05:05 -0800

Github user davies commented on a diff in the pull request:

    https://github.com/apache/spark/pull/10628#discussion_r49044395
  
    --- Diff: 
sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java
 ---
    @@ -0,0 +1,181 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.spark.sql.execution.vectorized;
    +
    +import org.apache.spark.sql.types.DataType;
    +
    +/**
    + * This class represents a column of values and provides the main APIs to 
access the data
    + * values. It supports all the types and contains get/put APIs as well as 
their batched versions.
    + * The batched versions are preferable whenever possible.
    + *
    + * Most of the APIs take the rowId as a parameter. This is the local 
0-based row id for values
    + * in the current RowBatch.
    + *
    + * A ColumnVector should be considered immutable once originally created. 
In other words, it is not
    + * valid to call put APIs after reads until reset() is called.
    + */
    +public abstract class ColumnVector {
    +  /**
    +   * Allocates a column that can hold `capacity` values of `type`, either on or off 
heap.
    +   */
    +  public static ColumnVector allocate(int capacity, DataType type, boolean 
offHeap) {
    +    if (offHeap) {
    +      return new OffHeapColumnVector(capacity, type);
    +    } else {
    +      return new OnHeapColumnVector(capacity, type);
    +    }
    +  }
    +
    +  public final DataType dataType() { return type; }
    +
    +  /**
    +   * Resets this column for writing. The currently stored values are no 
longer accessible.
    +   */
    +  public void reset() {
    +    numNulls = 0;
    +    if (anyNullsSet) {
    +      putNotNulls(0, capacity);
    +      anyNullsSet = false;
    +    }
    +  }
    +
    +  /**
    +   * Cleans up memory for this column. The column is not usable after this.
    +   * TODO: this should probably have ref-counted semantics.
    +   */
    +  public abstract void close();
    +
    +  /**
    +   * Returns the number of nulls in this column.
    +   */
    +  public final int numNulls() { return numNulls; }
    +
    +  /**
    +   * Returns true if any of the null indicators are set for this column. 
This can be used
    +   * as an optimization to prevent setting nulls.
    +   */
    +  public final boolean anyNullsSet() { return anyNullsSet; }
    +
    +  /**
    +   * Returns the off heap ptr for the arrays backing the NULLs and values 
buffer. Only valid
    +   * to call for off heap columns.
    +   */
    +  public abstract long nullsNativeAddress();
    +  public abstract long valuesNativeAddress();
    +
    +  /**
    +   * Sets the value at rowId to null/not null.
    +   */
    +  public abstract void putNotNull(int rowId);
    +  public abstract void putNull(int rowId);
    +
    +  /**
    +   * Sets the values from [rowId, rowId + count) to null/not null.
    +   */
    +  public abstract void putNulls(int rowId, int count);
    +  public abstract void putNotNulls(int rowId, int count);
    +
    +  /**
    +   * Returns whether the value at rowId is NULL.
    +   */
    +  public abstract boolean getIsNull(int rowId);
    +
    +  /**
    +   * Sets the value at rowId to `value`.
    +   */
    +  public abstract void putInt(int rowId, int value);
    +
    +  /**
    +   * Sets values from [rowId, rowId + count) to value.
    +   */
    +  public abstract void putInts(int rowId, int count, int value);
    +
    +  /**
    +   * Sets values from [rowId, rowId + count) to [src + srcIndex, src + 
srcIndex + count)
    +   */
    +  public abstract void putInts(int rowId, int count, int[] src, int 
srcIndex);
    +
    +  /**
    +   * Sets values from [rowId, rowId + count) to [src + srcIndex, src + 
srcIndex + count)
    +   * The data in src must be 4-byte little endian ints.
    +   */
    +  public abstract void putIntsLittleEndian(int rowId, int count, byte[] 
src, int srcIndex);
    +
    +  /**
    +   * Returns the integer for rowId.
    +   */
    +  public abstract int getInt(int rowId);
    +
    +  /**
    +   * Sets the value at rowId to `value`.
    +   */
    +  public abstract void putDouble(int rowId, double value);
    +
    +  /**
    +   * Sets values from [rowId, rowId + count) to value.
    +   */
    +  public abstract void putDoubles(int rowId, int count, double value);
    +
    +  /**
    +   * Sets values from [rowId, rowId + count) to [src + srcIndex, src + 
srcIndex + count)
    +   * src should contain `count` doubles written in IEEE format.
    +   */
    +  public abstract void putDoubles(int rowId, int count, double[] src, int 
srcIndex);
    +
    +  /**
    +   * Sets values from [rowId, rowId + count) to [src + srcIndex, src + 
srcIndex + count)
    +   * The data in src must be IEEE-formatted doubles.
    +   */
    +  public abstract void putDoubles(int rowId, int count, byte[] src, int 
srcIndex);
    +
    +  /**
    +   * Returns the double for rowId.
    +   */
    +  public abstract double getDouble(int rowId);
    +
    +  /**
    +   * Maximum number of rows that can be stored in this column.
    +   */
    +  protected final int capacity;
    +
    +  /**
    +   * Byte width of this column.
    +   */
    +  protected final int width;
    +
    +  /**
    +   * Number of nulls in this column.
    +   */
    +  protected int numNulls;
    +
    +  /**
    +   * True if there is at least one NULL byte set.
    +   */
    +  protected boolean anyNullsSet;
    --- End diff --
    
    Is this equal to `numNulls == 0`?



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request: [SPARK-12635][SQL] Add ColumnarBatch, an in me...

Reply via email to