Repository: arrow Updated Branches: refs/heads/master 670612e6f -> 22c738cc7
ARROW-874: [JS] Read dictionary-encoded vectors Author: Brian Hulette <brian.hule...@ccri.com> Author: Emilio Lahr-Vivaz <elahrvi...@ccri.com> Author: Brian Hulette <hulet...@gmail.com> Closes #655 from TheNeuralBit/js_dictionary and squashes the following commits: 4fbaf9d [Brian Hulette] add unit tests, fix errors in file format dictionary reading d89c84b [Brian Hulette] Add dictionary support for file format 5bdf8a1 [Emilio Lahr-Vivaz] dictionary encoding c6eff38 [Brian Hulette] Updated API documenation 4c84362 [Brian Hulette] added struct_example tests (Struct type, and multiple record batches) 5a2efe6 [Brian Hulette] add tests for streaming format 9aed94b [Brian Hulette] Fix file format, unit tests e2e0d4d [Emilio Lahr-Vivaz] renaming reader method 844b5e9 [Emilio Lahr-Vivaz] fix for file format bfb7754 [Emilio Lahr-Vivaz] cleanup 162c9be [Emilio Lahr-Vivaz] working for streaming 290497f [Emilio Lahr-Vivaz] js support multiple record batches 4b3b412 [Emilio Lahr-Vivaz] initial support for streaming file format, added FixedSizeList c9d705d [Brian Hulette] Created npm build script 53db587 [Brian Hulette] Fixes to make tests pass 304c669 [Brian Hulette] Added basic js unit tests Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/22c738cc Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/22c738cc Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/22c738cc Branch: refs/heads/master Commit: 22c738cc7b26bff9e7319d438dd3fef1238d46ad Parents: 670612e Author: Brian Hulette <brian.hule...@ccri.com> Authored: Tue May 9 15:55:27 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Tue May 9 15:55:27 2017 -0400 ---------------------------------------------------------------------- js/.gitignore | 2 + js/README.md | 17 +- js/bin/arrow2csv.js | 9 +- js/bin/arrow_schema.js | 8 +- js/examples/read_file.html | 6 +- js/flatbuffers.sh | 19 + js/lib/arrow.ts | 440 
+++++++++++++++++++---- js/lib/bitarray.ts | 17 +- js/lib/types.ts | 581 ++++++++++++++++++++++--------- js/package.json | 12 +- js/postinstall.sh | 18 - js/spec/arrow.js | 179 ++++++++++ js/spec/dictionary-stream.arrow | Bin 0 -> 1776 bytes js/spec/dictionary.arrow | Bin 0 -> 2522 bytes js/spec/simple-stream.arrow | Bin 0 -> 1188 bytes js/spec/simple.arrow | Bin 0 -> 1642 bytes js/spec/struct_example-stream.arrow | Bin 0 -> 1884 bytes js/spec/struct_example.arrow | Bin 0 -> 2354 bytes 18 files changed, 1026 insertions(+), 282 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/.gitignore ---------------------------------------------------------------------- diff --git a/js/.gitignore b/js/.gitignore index 3b97e3a..f67c1cc 100644 --- a/js/.gitignore +++ b/js/.gitignore @@ -2,3 +2,5 @@ lib/*_generated.js dist node_modules typings +.idea +*.iml http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/README.md ---------------------------------------------------------------------- diff --git a/js/README.md b/js/README.md index 98ef756..cdabf54 100644 --- a/js/README.md +++ b/js/README.md @@ -20,6 +20,7 @@ From this directory, run: $ npm install # pull dependencies $ tsc # build typescript $ webpack # bundle for the browser +$ npm test # run unit tests ``` ### Usage @@ -42,9 +43,19 @@ Include `dist/arrow-bundle.js` in a `<script />` tag: See [examples/read_file.html](examples/read_file.html) for a usage example - or try it out now at [theneuralbit.github.io/arrow](http://theneuralbit.github.io/arrow) ### API -##### `arrow.loadSchema(buffer)` +##### `arrow.getReader(buffer)` +Returns an `ArrowReader` object representing the Arrow file or stream contained in +the `buffer`. + +##### `ArrowReader.loadNextBatch()` +Loads the next record batch and returns its length. + +##### `ArrowReader.getSchema()` Returns a JSON representation of the file's Arrow schema. 
-##### `arrow.loadVectors(buffer)` -Returns a dictionary of `Vector` objects, one for each column, indexed by the column's name. +##### `ArrowReader.getVectors()` +Returns a list of `Vector` objects, one for each column. Vector objects have, at minimum, a `get(i)` method and a `length` attribute. + +##### `ArrowReader.getVector(name: String)` +Return a Vector object for column `name` http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/bin/arrow2csv.js ---------------------------------------------------------------------- diff --git a/js/bin/arrow2csv.js b/js/bin/arrow2csv.js index 48df2f9..8122e95 100755 --- a/js/bin/arrow2csv.js +++ b/js/bin/arrow2csv.js @@ -19,7 +19,7 @@ var fs = require('fs') var process = require('process'); -var loadVectors = require('../dist/arrow.js').loadVectors; +var arrow = require('../dist/arrow.js'); var program = require('commander'); function list (val) { @@ -38,10 +38,11 @@ if (!program.schema) { } var buf = fs.readFileSync(process.argv[process.argv.length - 1]); -var vectors = loadVectors(buf); +var reader = arrow.getReader(buf); +reader.loadNextBatch(); -for (var i = 0; i < vectors[program.schema[0]].length; i += 1|0) { +for (var i = 0; i < reader.getVector(program.schema[0]).length; i += 1|0) { console.log(program.schema.map(function (field) { - return '' + vectors[field].get(i); + return '' + reader.getVector(field).get(i); }).join(',')); } http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/bin/arrow_schema.js ---------------------------------------------------------------------- diff --git a/js/bin/arrow_schema.js b/js/bin/arrow_schema.js index 7044778..44dabb4 100755 --- a/js/bin/arrow_schema.js +++ b/js/bin/arrow_schema.js @@ -19,7 +19,11 @@ var fs = require('fs'); var process = require('process'); -var loadSchema = require('../dist/arrow.js').loadSchema; +var arrow = require('../dist/arrow.js'); var buf = fs.readFileSync(process.argv[process.argv.length - 1]); -console.log(JSON.stringify(loadSchema(buf), 
null, '\t')); +var reader = arrow.getReader(buf); +console.log(JSON.stringify(reader.getSchema(), null, '\t')); +//console.log(JSON.stringify(reader.getVectors(), null, '\t')); +console.log('block count: ' + reader.getBatchCount()); + http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/examples/read_file.html ---------------------------------------------------------------------- diff --git a/js/examples/read_file.html b/js/examples/read_file.html index 02b6f08..933b142 100644 --- a/js/examples/read_file.html +++ b/js/examples/read_file.html @@ -40,9 +40,11 @@ function addCell (tr, type, name) { } reader.onload = function (evt) { var buf = new Uint8Array(evt.target.result); - var schema = arrow.loadSchema(buf); - var vectors = arrow.loadVectors(buf); + var schema = arrow.loadSchemaFromStream(buf); + var vectors = arrow.loadVectorsFromStream(buf); var length = vectors[schema[0].name].length; +console.log(JSON.stringify(schema, null, '\t')); +console.log(JSON.stringify(vectors, null, '\t')); var thead = document.getElementById("thead"); var tbody = document.getElementById("tbody"); http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/flatbuffers.sh ---------------------------------------------------------------------- diff --git a/js/flatbuffers.sh b/js/flatbuffers.sh new file mode 100755 index 0000000..99d2815 --- /dev/null +++ b/js/flatbuffers.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
See accompanying LICENSE file. + +echo "Compiling flatbuffer schemas..." +#flatc -o lib --js ../format/Message.fbs ../format/File.fbs +flatc -o lib --js ../format/*.fbs +rm -f lib/Arrow_generated.js +cat lib/*_generated.js > lib/Arrow_generated.js http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/lib/arrow.ts ---------------------------------------------------------------------- diff --git a/js/lib/arrow.ts b/js/lib/arrow.ts index 0762885..74def4d 100644 --- a/js/lib/arrow.ts +++ b/js/lib/arrow.ts @@ -17,48 +17,186 @@ import { flatbuffers } from 'flatbuffers'; import { org } from './Arrow_generated'; -var arrow = org.apache.arrow; import { vectorFromField, Vector } from './types'; -export function loadVectors(buf) { - var fileLength = buf.length, bb, footerLengthOffset, footerLength, - footerOffset, footer, schema, field, type, type_str, i, - len, rb_metas, rb_meta, rtrn, recordBatchBlock, recordBatchBlocks = []; - var vectors : Vector[] = []; +import ByteBuffer = flatbuffers.ByteBuffer; +var Footer = org.apache.arrow.flatbuf.Footer; +var Message = org.apache.arrow.flatbuf.Message; +var MessageHeader = org.apache.arrow.flatbuf.MessageHeader; +var RecordBatch = org.apache.arrow.flatbuf.RecordBatch; +var DictionaryBatch = org.apache.arrow.flatbuf.DictionaryBatch; +var Schema = org.apache.arrow.flatbuf.Schema; +var Type = org.apache.arrow.flatbuf.Type; +var VectorType = org.apache.arrow.flatbuf.VectorType; + +export class ArrowReader { + + private bb; + private schema: any = []; + private vectors: Vector[]; + private vectorMap: any = {}; + private dictionaries: any = {}; + private batches: any = []; + private batchIndex: number = 0; + + constructor(bb, schema, vectors: Vector[], batches, dictionaries) { + this.bb = bb; + this.schema = schema; + this.vectors = vectors; + for (var i = 0; i < vectors.length; i += 1|0) { + this.vectorMap[vectors[i].name] = vectors[i] + } + this.batches = batches; + this.dictionaries = dictionaries; + } + + loadNextBatch() { + 
if (this.batchIndex < this.batches.length) { + var batch = this.batches[this.batchIndex]; + this.batchIndex += 1; + loadVectors(this.bb, this.vectors, batch); + return batch.length; + } else { + return 0; + } + } + + getSchema() { + return this.schema; + } + + getVectors() { + return this.vectors; + } + + getVector(name) { + return this.vectorMap[name]; + } - bb = new flatbuffers.ByteBuffer(buf); + getBatchCount() { + return this.batches.length; + } + + // the index of the next batch to be loaded + getBatchIndex() { + return this.batchIndex; + } - footer = _loadFooter(bb); + // set the index of the next batch to be loaded + setBatchIndex(i: number) { + this.batchIndex = i; + } +} - schema = footer.schema(); +export function getSchema(buf) { return getReader(buf).getSchema(); } + +export function getReader(buf) : ArrowReader { + if (_checkMagic(buf, 0)) { + return getFileReader(buf); + } else { + return getStreamReader(buf); + } +} + +export function getStreamReader(buf) : ArrowReader { + var bb = new ByteBuffer(buf); + + var schema = _loadSchema(bb), + field, + vectors: Vector[] = [], + i,j, + iLen,jLen, + batch, + recordBatches = [], + dictionaryBatches = [], + dictionaries = {}; + + for (i = 0, iLen = schema.fieldsLength(); i < iLen; i += 1|0) { + field = schema.fields(i); + _createDictionaryVectors(field, dictionaries); + vectors.push(vectorFromField(field, dictionaries)); + } + + while (bb.position() < bb.capacity()) { + batch = _loadBatch(bb); + if (batch == null) { + break; + } else if (batch.type == MessageHeader.DictionaryBatch) { + dictionaryBatches.push(batch); + } else if (batch.type == MessageHeader.RecordBatch) { + recordBatches.push(batch) + } else { + console.error("Expected batch type" + MessageHeader.RecordBatch + " or " + + MessageHeader.DictionaryBatch + " but got " + batch.type); + } + } + + // load dictionary vectors + for (i = 0; i < dictionaryBatches.length; i += 1|0) { + batch = dictionaryBatches[i]; + loadVectors(bb, 
[dictionaries[batch.id]], batch); + } + + return new ArrowReader(bb, parseSchema(schema), vectors, recordBatches, dictionaries); +} + +export function getFileReader (buf) : ArrowReader { + var bb = new ByteBuffer(buf); + + var footer = _loadFooter(bb); + + var schema = footer.schema(); + var i, len, field, + vectors: Vector[] = [], + block, + batch, + recordBatchBlocks = [], + dictionaryBatchBlocks = [], + dictionaries = {}; for (i = 0, len = schema.fieldsLength(); i < len; i += 1|0) { field = schema.fields(i); - vectors.push(vectorFromField(field)); + _createDictionaryVectors(field, dictionaries); + vectors.push(vectorFromField(field, dictionaries)); + } + + for (i = 0; i < footer.dictionariesLength(); i += 1|0) { + block = footer.dictionaries(i); + dictionaryBatchBlocks.push({ + offset: block.offset().low, + metaDataLength: block.metaDataLength(), + bodyLength: block.bodyLength().low, + }) } for (i = 0; i < footer.recordBatchesLength(); i += 1|0) { - recordBatchBlock = footer.recordBatches(i); + block = footer.recordBatches(i); recordBatchBlocks.push({ - offset: recordBatchBlock.offset(), - metaDataLength: recordBatchBlock.metaDataLength(), - bodyLength: recordBatchBlock.bodyLength(), + offset: block.offset().low, + metaDataLength: block.metaDataLength(), + bodyLength: block.bodyLength().low, }) } - loadBuffersIntoVectors(recordBatchBlocks, bb, vectors); - var rtrn : any = {}; - for (var i : any = 0; i < vectors.length; i += 1|0) { - rtrn[vectors[i].name] = vectors[i] + var dictionaryBatches = dictionaryBatchBlocks.map(function (block) { + bb.setPosition(block.offset); + // TODO: Make sure this is a dictionary batch + return _loadBatch(bb); + }); + + var recordBatches = recordBatchBlocks.map(function (block) { + bb.setPosition(block.offset); + // TODO: Make sure this is a record batch + return _loadBatch(bb); + }); + + // load dictionary vectors + for (i = 0; i < dictionaryBatches.length; i += 1|0) { + batch = dictionaryBatches[i]; + loadVectors(bb, 
[dictionaries[batch.id]], batch); } - return rtrn; -} -export function loadSchema(buf) { - var footer = _loadFooter(new flatbuffers.ByteBuffer(buf)); - var schema = footer.schema(); - - return parseSchema(schema); + return new ArrowReader(bb, parseSchema(schema), vectors, recordBatches, dictionaries); } function _loadFooter(bb) { @@ -81,7 +219,7 @@ function _loadFooter(bb) { var footerLengthOffset: number = fileLength - MAGIC.length - 4; bb.setPosition(footerLengthOffset); - var footerLength: number = Int64FromByteBuffer(bb, footerLengthOffset) + var footerLength: number = Int32FromByteBuffer(bb, footerLengthOffset) if (footerLength <= 0 || footerLength + MAGIC.length*2 + 4 > fileLength) { console.log("Invalid footer length: " + footerLength) @@ -89,19 +227,166 @@ function _loadFooter(bb) { var footerOffset: number = footerLengthOffset - footerLength; bb.setPosition(footerOffset); - var footer = arrow.flatbuf.Footer.getRootAsFooter(bb); + var footer = Footer.getRootAsFooter(bb); return footer; } -function Int64FromByteBuffer(bb, offset) { +function _loadSchema(bb) { + var message =_loadMessage(bb); + if (message.headerType() != MessageHeader.Schema) { + console.error("Expected header type " + MessageHeader.Schema + " but got " + message.headerType()); + return; + } + return message.header(new Schema()); +} + +function _loadBatch(bb) { + var message = _loadMessage(bb); + if (message == null) { + return; + } else if (message.headerType() == MessageHeader.RecordBatch) { + var batch = { header: message.header(new RecordBatch()), length: message.bodyLength().low } + return _loadRecordBatch(bb, batch); + } else if (message.headerType() == MessageHeader.DictionaryBatch) { + var batch = { header: message.header(new DictionaryBatch()), length: message.bodyLength().low } + return _loadDictionaryBatch(bb, batch); + } else { + console.error("Expected header type " + MessageHeader.RecordBatch + " or " + MessageHeader.DictionaryBatch + + " but got " + message.headerType()); + 
return; + } +} + +function _loadRecordBatch(bb, batch) { + var data = batch.header; + var i, nodes_ = [], nodesLength = data.nodesLength(); + var buffer, buffers_ = [], buffersLength = data.buffersLength(); + + for (i = 0; i < nodesLength; i += 1) { + nodes_.push(data.nodes(i)); + } + for (i = 0; i < buffersLength; i += 1) { + buffer = data.buffers(i); + buffers_.push({ offset: bb.position() + buffer.offset().low, length: buffer.length().low }); + } + // position the buffer after the body to read the next message + bb.setPosition(bb.position() + batch.length); + + return { nodes: nodes_, buffers: buffers_, length: data.length().low, type: MessageHeader.RecordBatch }; +} + +function _loadDictionaryBatch(bb, batch) { + var id_ = batch.header.id().toFloat64().toString(), data = batch.header.data(); + var i, nodes_ = [], nodesLength = data.nodesLength(); + var buffer, buffers_ = [], buffersLength = data.buffersLength(); + + for (i = 0; i < nodesLength; i += 1) { + nodes_.push(data.nodes(i)); + } + for (i = 0; i < buffersLength; i += 1) { + buffer = data.buffers(i); + buffers_.push({ offset: bb.position() + buffer.offset().low, length: buffer.length().low }); + } + // position the buffer after the body to read the next message + bb.setPosition(bb.position() + batch.length); + + return { id: id_, nodes: nodes_, buffers: buffers_, length: data.length().low, type: MessageHeader.DictionaryBatch }; +} + +function _loadMessage(bb) { + var messageLength: number = Int32FromByteBuffer(bb, bb.position()); + if (messageLength == 0) { + return; + } + bb.setPosition(bb.position() + 4); + var message = Message.getRootAsMessage(bb); + // position the buffer at the end of the message so it's ready to read further + bb.setPosition(bb.position() + messageLength); + + return message; +} + +function _createDictionaryVectors(field, dictionaries) { + var encoding = field.dictionary(); + if (encoding != null) { + var id = encoding.id().toFloat64().toString(); + if (dictionaries[id] == null) { 
+ // create a field for the dictionary + var dictionaryField = _createDictionaryField(id, field); + dictionaries[id] = vectorFromField(dictionaryField, null); + } + } + + // recursively examine child fields + for (var i = 0, len = field.childrenLength(); i < len; i += 1|0) { + _createDictionaryVectors(field.children(i), dictionaries); + } +} + +function _createDictionaryField(id, field) { + var builder = new flatbuffers.Builder(); + var nameOffset = builder.createString("dict-" + id); + + var typeType = field.typeType(); + var typeOffset; + if (typeType === Type.Int) { + var type = field.type(new org.apache.arrow.flatbuf.Int()); + org.apache.arrow.flatbuf.Int.startInt(builder); + org.apache.arrow.flatbuf.Int.addBitWidth(builder, type.bitWidth()); + org.apache.arrow.flatbuf.Int.addIsSigned(builder, type.isSigned()); + typeOffset = org.apache.arrow.flatbuf.Int.endInt(builder); + } else if (typeType === Type.FloatingPoint) { + var type = field.type(new org.apache.arrow.flatbuf.FloatingPoint()); + org.apache.arrow.flatbuf.FloatingPoint.startFloatingPoint(builder); + org.apache.arrow.flatbuf.FloatingPoint.addPrecision(builder, type.precision()); + typeOffset = org.apache.arrow.flatbuf.FloatingPoint.endFloatingPoint(builder); + } else if (typeType === Type.Utf8) { + org.apache.arrow.flatbuf.Utf8.startUtf8(builder); + typeOffset = org.apache.arrow.flatbuf.Utf8.endUtf8(builder); + } else if (typeType === Type.Date) { + var type = field.type(new org.apache.arrow.flatbuf.Date()); + org.apache.arrow.flatbuf.Date.startDate(builder); + org.apache.arrow.flatbuf.Date.addUnit(builder, type.unit()); + typeOffset = org.apache.arrow.flatbuf.Date.endDate(builder); + } else { + throw "Unimplemented dictionary type " + typeType; + } + if (field.childrenLength() > 0) { + throw "Dictionary encoded fields can't have children" + } + var childrenOffset = org.apache.arrow.flatbuf.Field.createChildrenVector(builder, []); + + var layout, layoutOffsets = []; + for (var i = 0, len = 
field.layoutLength(); i < len; i += 1|0) { + layout = field.layout(i); + org.apache.arrow.flatbuf.VectorLayout.startVectorLayout(builder); + org.apache.arrow.flatbuf.VectorLayout.addBitWidth(builder, layout.bitWidth()); + org.apache.arrow.flatbuf.VectorLayout.addType(builder, layout.type()); + layoutOffsets.push(org.apache.arrow.flatbuf.VectorLayout.endVectorLayout(builder)); + } + var layoutOffset = org.apache.arrow.flatbuf.Field.createLayoutVector(builder, layoutOffsets); + + org.apache.arrow.flatbuf.Field.startField(builder); + org.apache.arrow.flatbuf.Field.addName(builder, nameOffset); + org.apache.arrow.flatbuf.Field.addNullable(builder, field.nullable()); + org.apache.arrow.flatbuf.Field.addTypeType(builder, typeType); + org.apache.arrow.flatbuf.Field.addType(builder, typeOffset); + org.apache.arrow.flatbuf.Field.addChildren(builder, childrenOffset); + org.apache.arrow.flatbuf.Field.addLayout(builder, layoutOffset); + var offset = org.apache.arrow.flatbuf.Field.endField(builder); + builder.finish(offset); + + return org.apache.arrow.flatbuf.Field.getRootAsField(builder.bb); +} + +function Int32FromByteBuffer(bb, offset) { return ((bb.bytes_[offset + 3] & 255) << 24) | ((bb.bytes_[offset + 2] & 255) << 16) | ((bb.bytes_[offset + 1] & 255) << 8) | ((bb.bytes_[offset] & 255)); } - var MAGIC_STR = "ARROW1"; var MAGIC = new Uint8Array(MAGIC_STR.length); for (var i = 0; i < MAGIC_STR.length; i += 1|0) { @@ -118,27 +403,28 @@ function _checkMagic(buf, index) { } var TYPEMAP = {} -TYPEMAP[arrow.flatbuf.Type.NONE] = "NONE"; -TYPEMAP[arrow.flatbuf.Type.Null] = "Null"; -TYPEMAP[arrow.flatbuf.Type.Int] = "Int"; -TYPEMAP[arrow.flatbuf.Type.FloatingPoint] = "FloatingPoint"; -TYPEMAP[arrow.flatbuf.Type.Binary] = "Binary"; -TYPEMAP[arrow.flatbuf.Type.Utf8] = "Utf8"; -TYPEMAP[arrow.flatbuf.Type.Bool] = "Bool"; -TYPEMAP[arrow.flatbuf.Type.Decimal] = "Decimal"; -TYPEMAP[arrow.flatbuf.Type.Date] = "Date"; -TYPEMAP[arrow.flatbuf.Type.Time] = "Time"; 
-TYPEMAP[arrow.flatbuf.Type.Timestamp] = "Timestamp"; -TYPEMAP[arrow.flatbuf.Type.Interval] = "Interval"; -TYPEMAP[arrow.flatbuf.Type.List] = "List"; -TYPEMAP[arrow.flatbuf.Type.Struct_] = "Struct"; -TYPEMAP[arrow.flatbuf.Type.Union] = "Union"; +TYPEMAP[Type.NONE] = "NONE"; +TYPEMAP[Type.Null] = "Null"; +TYPEMAP[Type.Int] = "Int"; +TYPEMAP[Type.FloatingPoint] = "FloatingPoint"; +TYPEMAP[Type.Binary] = "Binary"; +TYPEMAP[Type.Utf8] = "Utf8"; +TYPEMAP[Type.Bool] = "Bool"; +TYPEMAP[Type.Decimal] = "Decimal"; +TYPEMAP[Type.Date] = "Date"; +TYPEMAP[Type.Time] = "Time"; +TYPEMAP[Type.Timestamp] = "Timestamp"; +TYPEMAP[Type.Interval] = "Interval"; +TYPEMAP[Type.List] = "List"; +TYPEMAP[Type.FixedSizeList] = "FixedSizeList"; +TYPEMAP[Type.Struct_] = "Struct"; +TYPEMAP[Type.Union] = "Union"; var VECTORTYPEMAP = {}; -VECTORTYPEMAP[arrow.flatbuf.VectorType.OFFSET] = 'OFFSET'; -VECTORTYPEMAP[arrow.flatbuf.VectorType.DATA] = 'DATA'; -VECTORTYPEMAP[arrow.flatbuf.VectorType.VALIDITY] = 'VALIDITY'; -VECTORTYPEMAP[arrow.flatbuf.VectorType.TYPE] = 'TYPE'; +VECTORTYPEMAP[VectorType.OFFSET] = 'OFFSET'; +VECTORTYPEMAP[VectorType.DATA] = 'DATA'; +VECTORTYPEMAP[VectorType.VALIDITY] = 'VALIDITY'; +VECTORTYPEMAP[VectorType.TYPE] = 'TYPE'; function parseField(field) { var children = []; @@ -149,7 +435,6 @@ function parseField(field) { var layouts = []; for (var i = 0; i < field.layoutLength(); i += 1|0) { layouts.push(VECTORTYPEMAP[field.layout(i).type()]); - } return { @@ -170,32 +455,39 @@ function parseSchema(schema) { return result; } -function parseBuffer(buffer) { - return { - offset: buffer.offset(), - length: buffer.length() - }; +function loadVectors(bb, vectors: Vector[], recordBatch) { + var indices = { bufferIndex: 0, nodeIndex: 0 }, i; + for (i = 0; i < vectors.length; i += 1) { + loadVector(bb, vectors[i], recordBatch, indices); + } } -function loadBuffersIntoVectors(recordBatchBlocks, bb, vectors : Vector[]) { - var fieldNode, recordBatchBlock, recordBatch, numBuffers, 
bufReader = {index: 0, node_index: 1}, field_ctr = 0; - var buffer = bb.bytes_.buffer; - var baseOffset = bb.bytes_.byteOffset; - for (var i = recordBatchBlocks.length - 1; i >= 0; i -= 1|0) { - recordBatchBlock = recordBatchBlocks[i]; - bb.setPosition(recordBatchBlock.offset.low); - recordBatch = arrow.flatbuf.RecordBatch.getRootAsRecordBatch(bb); - bufReader.index = 0; - bufReader.node_index = 0; - numBuffers = recordBatch.buffersLength(); - - //console.log('num buffers: ' + recordBatch.buffersLength()); - //console.log('num nodes: ' + recordBatch.nodesLength()); - - while (bufReader.index < numBuffers) { - //console.log('Allocating buffers starting at ' + bufReader.index + '/' + numBuffers + ' to field ' + field_ctr); - vectors[field_ctr].loadData(recordBatch, buffer, bufReader, baseOffset + recordBatchBlock.offset.low + recordBatchBlock.metaDataLength) - field_ctr += 1; - } +/** + * Loads a vector with data from a batch + * recordBatch: { nodes: org.apache.arrow.flatbuf.FieldNode[], buffers: { offset: number, length: number }[] } + */ +function loadVector(bb, vector: Vector, recordBatch, indices) { + var node = recordBatch.nodes[indices.nodeIndex], ownBuffersLength, ownBuffers = [], i; + indices.nodeIndex += 1; + + // dictionary vectors are always ints, so will have a data vector plus optional null vector + if (vector.field.dictionary() == null) { + ownBuffersLength = vector.field.layoutLength(); + } else if (vector.field.nullable()) { + ownBuffersLength = 2; + } else { + ownBuffersLength = 1; + } + + for (i = 0; i < ownBuffersLength; i += 1) { + ownBuffers.push(recordBatch.buffers[indices.bufferIndex + i]); + } + indices.bufferIndex += ownBuffersLength; + + vector.loadData(bb, node, ownBuffers); + + var children = vector.getChildVectors(); + for (i = 0; i < children.length; i++) { + loadVector(bb, children[i], recordBatch, indices); } } http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/lib/bitarray.ts 
---------------------------------------------------------------------- diff --git a/js/lib/bitarray.ts b/js/lib/bitarray.ts index 82fff32..fc3c091 100644 --- a/js/lib/bitarray.ts +++ b/js/lib/bitarray.ts @@ -17,22 +17,9 @@ export class BitArray { private view: Uint8Array; - constructor(buffer: ArrayBuffer, offset: number, length: number) { - //if (ArrayBuffer.isView(buffer)) { - // var og_view = buffer; - // buffer = buffer.buffer; - // offset = og_view.offset; - // length = og_view.length/og_view.BYTES_PER_ELEMENT*8; - //} else if (buffer instanceof ArrayBuffer) { - var offset = offset || 0; - var length = length;// || buffer.length*8; - //} else if (buffer instanceof Number) { - // length = buffer; - // buffer = new ArrayBuffer(Math.ceil(length/8)); - // offset = 0; - //} - this.view = new Uint8Array(buffer, offset, Math.ceil(length/8)); + constructor(buffer: ArrayBuffer, offset: number, length: number) { + this.view = new Uint8Array(buffer, offset || 0, Math.ceil(length / 8)); } get(i) { http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/lib/types.ts ---------------------------------------------------------------------- diff --git a/js/lib/types.ts b/js/lib/types.ts index bbc7558..d656c6a 100644 --- a/js/lib/types.ts +++ b/js/lib/types.ts @@ -18,7 +18,8 @@ import { BitArray } from './bitarray'; import { TextDecoder } from 'text-encoding'; import { org } from './Arrow_generated'; -var arrow = org.apache.arrow; + +var Type = org.apache.arrow.flatbuf.Type; interface ArrayView { slice(start: number, end: number) : ArrayView @@ -26,72 +27,90 @@ interface ArrayView { } export abstract class Vector { + field: any; name: string; length: number; null_count: number; - constructor(name: string) { - this.name = name; + + constructor(field) { + this.field = field; + this.name = field.name(); } + /* Access datum at index i */ abstract get(i); /* Return array representing data in the range [start, end) */ abstract slice(start: number, end: number); - - /* Use 
recordBatch fieldNodes and Buffers to construct this Vector */ - public loadData(recordBatch: any, buffer: any, bufReader: any, baseOffset: any) { - var fieldNode = recordBatch.nodes(bufReader.node_index); - this.length = fieldNode.length(); - this.null_count = fieldNode.length(); - bufReader.node_index += 1|0; - - this.loadBuffers(recordBatch, buffer, bufReader, baseOffset); - } - - protected abstract loadBuffers(recordBatch: any, buffer: any, bufReader: any, baseOffset: any); - - /* Helper function for loading a VALIDITY buffer (for Nullable types) */ - static loadValidityBuffer(recordBatch, buffer, bufReader, baseOffset) : BitArray { - var buf_meta = recordBatch.buffers(bufReader.index); - var offset = baseOffset + buf_meta.offset().low; - var length = buf_meta.length().low; - bufReader.index += 1|0; - return new BitArray(buffer, offset, length*8); - } - - /* Helper function for loading an OFFSET buffer */ - static loadOffsetBuffer(recordBatch, buffer, bufReader, baseOffset) : Int32Array { - var buf_meta = recordBatch.buffers(bufReader.index); - var offset = baseOffset + buf_meta.offset().low; - var length = buf_meta.length().low/Int32Array.BYTES_PER_ELEMENT; - bufReader.index += 1|0; - return new Int32Array(buffer, offset, length); + /* Return array of child vectors, for container types */ + abstract getChildVectors(); + + /** + * Use recordBatch fieldNodes and Buffers to construct this Vector + * bb: flatbuffers.ByteBuffer + * node: org.apache.arrow.flatbuf.FieldNode + * buffers: { offset: number, length: number }[] + */ + public loadData(bb, node, buffers) { + this.length = node.length().low; + this.null_count = node.nullCount().low; + this.loadBuffers(bb, node, buffers); + } + + protected abstract loadBuffers(bb, node, buffers); + + /** + * Helper function for loading a VALIDITY buffer (for Nullable types) + * bb: flatbuffers.ByteBuffer + * buffer: org.apache.arrow.flatbuf.Buffer + */ + static loadValidityBuffer(bb, buffer) : BitArray { + var arrayBuffer = 
bb.bytes_.buffer; + var offset = bb.bytes_.byteOffset + buffer.offset; + return new BitArray(arrayBuffer, offset, buffer.length * 8); + } + + /** + * Helper function for loading an OFFSET buffer + * buffer: org.apache.arrow.flatbuf.Buffer + */ + static loadOffsetBuffer(bb, buffer) : Int32Array { + var arrayBuffer = bb.bytes_.buffer; + var offset = bb.bytes_.byteOffset + buffer.offset; + var length = buffer.length / Int32Array.BYTES_PER_ELEMENT; + return new Int32Array(arrayBuffer, offset, length); } } class SimpleVector<T extends ArrayView> extends Vector { protected dataView: T; - private TypedArray: {new(buffer: any, offset: number, length: number) : T, BYTES_PER_ELEMENT: number}; + private TypedArray: { new(buffer: any, offset: number, length: number): T, BYTES_PER_ELEMENT: number }; - constructor (TypedArray: {new(buffer: any, offset: number, length: number): T, BYTES_PER_ELEMENT: number}, name: string) { - super(name); + constructor (field, TypedArray: { new(buffer: any, offset: number, length: number): T, BYTES_PER_ELEMENT: number }) { + super(field); this.TypedArray = TypedArray; } + getChildVectors() { + return []; + } + get(i) { return this.dataView[i]; } - loadBuffers(recordBatch, buffer, bufReader, baseOffset) { - this.dataView = this.loadDataBuffer(recordBatch, buffer, bufReader, baseOffset); + loadBuffers(bb, node, buffers) { + this.loadDataBuffer(bb, buffers[0]); } - loadDataBuffer(recordBatch, buffer, bufReader, baseOffset) : T { - var buf_meta = recordBatch.buffers(bufReader.index); - var offset = baseOffset + buf_meta.offset().low; - var length = buf_meta.length().low/this.TypedArray.BYTES_PER_ELEMENT; - bufReader.index += 1|0; - return new this.TypedArray(buffer, offset, length); + /** + * buffer: org.apache.arrow.flatbuf.Buffer + */ + protected loadDataBuffer(bb, buffer) { + var arrayBuffer = bb.bytes_.buffer; + var offset = bb.bytes_.byteOffset + buffer.offset; + var length = buffer.length / this.TypedArray.BYTES_PER_ELEMENT; + this.dataView = 
new this.TypedArray(arrayBuffer, offset, length); } getDataView() { @@ -108,77 +127,173 @@ class SimpleVector<T extends ArrayView> extends Vector { } class NullableSimpleVector<T extends ArrayView> extends SimpleVector<T> { - private validityView: BitArray; + + protected validityView: BitArray; + + get(i: number) { + if (this.validityView.get(i)) { + return this.dataView[i]; + } else { + return null; + } + } + + loadBuffers(bb, node, buffers) { + this.validityView = Vector.loadValidityBuffer(bb, buffers[0]); + this.loadDataBuffer(bb, buffers[1]); + } + + getValidityVector() { + return this.validityView; + } +} + +class Uint8Vector extends SimpleVector<Uint8Array> { constructor(field) { super(field, Uint8Array); }; } +class Uint16Vector extends SimpleVector<Uint16Array> { constructor(field) { super(field, Uint16Array); }; } +class Uint32Vector extends SimpleVector<Uint32Array> { constructor(field) { super(field, Uint32Array); }; } +class Int8Vector extends SimpleVector<Uint8Array> { constructor(field) { super(field, Uint8Array); }; } +class Int16Vector extends SimpleVector<Uint16Array> { constructor(field) { super(field, Uint16Array); }; } +class Int32Vector extends SimpleVector<Uint32Array> { constructor(field) { super(field, Uint32Array); }; } +class Float32Vector extends SimpleVector<Float32Array> { constructor(field) { super(field, Float32Array); }; } +class Float64Vector extends SimpleVector<Float64Array> { constructor(field) { super(field, Float64Array); }; } + +class NullableUint8Vector extends NullableSimpleVector<Uint8Array> { constructor(field) { super(field, Uint8Array); }; } +class NullableUint16Vector extends NullableSimpleVector<Uint16Array> { constructor(field) { super(field, Uint16Array); }; } +class NullableUint32Vector extends NullableSimpleVector<Uint32Array> { constructor(field) { super(field, Uint32Array); }; } +class NullableInt8Vector extends NullableSimpleVector<Uint8Array> { constructor(field) { super(field, Uint8Array); }; } +class 
NullableInt16Vector extends NullableSimpleVector<Uint16Array> { constructor(field) { super(field, Uint16Array); }; } +class NullableInt32Vector extends NullableSimpleVector<Uint32Array> { constructor(field) { super(field, Uint32Array); }; } +class NullableFloat32Vector extends NullableSimpleVector<Float32Array> { constructor(field) { super(field, Float32Array); }; } +class NullableFloat64Vector extends NullableSimpleVector<Float64Array> { constructor(field) { super(field, Float64Array); }; } + +class Uint64Vector extends SimpleVector<Uint32Array> { + constructor(field) { + super(field, Uint32Array); + } get(i: number) { - if (this.validityView.get(i)) return this.dataView[i]; - else return null + return { low: this.dataView[i * 2], high: this.dataView[(i * 2) + 1] }; } +} - loadBuffers(recordBatch, buffer, bufReader, baseOffset) { - this.validityView = Vector.loadValidityBuffer(recordBatch, buffer, bufReader, baseOffset); - this.dataView = this.loadDataBuffer(recordBatch, buffer, bufReader, baseOffset); +class NullableUint64Vector extends NullableSimpleVector<Uint32Array> { + constructor(field) { + super(field, Uint32Array); } + get(i: number) { + if (this.validityView.get(i)) { + return { low: this.dataView[i * 2], high: this.dataView[(i * 2) + 1] }; + } else { + return null; + } + } } -class Uint8Vector extends SimpleVector<Uint8Array> { constructor(name: string) { super(Uint8Array, name); }; } -class Uint16Vector extends SimpleVector<Uint16Array> { constructor(name: string) { super(Uint16Array, name); }; } -class Uint32Vector extends SimpleVector<Uint32Array> { constructor(name: string) { super(Uint32Array, name); }; } -class Int8Vector extends SimpleVector<Uint8Array> { constructor(name: string) { super(Uint8Array, name); }; } -class Int16Vector extends SimpleVector<Uint16Array> { constructor(name: string) { super(Uint16Array, name); }; } -class Int32Vector extends SimpleVector<Uint32Array> { constructor(name: string) { super(Uint32Array, name); }; } -class 
Float32Vector extends SimpleVector<Float32Array> { constructor(name: string) { super(Float32Array, name); }; } -class Float64Vector extends SimpleVector<Float64Array> { constructor(name: string) { super(Float64Array, name); }; } - -class NullableUint8Vector extends NullableSimpleVector<Uint8Array> { constructor(name: string) { super(Uint8Array, name); }; } -class NullableUint16Vector extends NullableSimpleVector<Uint16Array> { constructor(name: string) { super(Uint16Array, name); }; } -class NullableUint32Vector extends NullableSimpleVector<Uint32Array> { constructor(name: string) { super(Uint32Array, name); }; } -class NullableInt8Vector extends NullableSimpleVector<Uint8Array> { constructor(name: string) { super(Uint8Array, name); }; } -class NullableInt16Vector extends NullableSimpleVector<Uint16Array> { constructor(name: string) { super(Uint16Array, name); }; } -class NullableInt32Vector extends NullableSimpleVector<Uint32Array> { constructor(name: string) { super(Uint32Array, name); }; } -class NullableFloat32Vector extends NullableSimpleVector<Float32Array> { constructor(name: string) { super(Float32Array, name); }; } -class NullableFloat64Vector extends NullableSimpleVector<Float64Array> { constructor(name: string) { super(Float64Array, name); }; } +class Int64Vector extends NullableSimpleVector<Uint32Array> { + constructor(field) { + super(field, Uint32Array); + } + + get(i: number) { + return { low: this.dataView[i * 2], high: this.dataView[(i * 2) + 1] }; + } +} + +class NullableInt64Vector extends NullableSimpleVector<Uint32Array> { + constructor(field) { + super(field, Uint32Array); + } + + get(i: number) { + if (this.validityView.get(i)) { + return { low: this.dataView[i * 2], high: this.dataView[(i * 2) + 1] }; + } else { + return null; + } + } +} + +class DateVector extends SimpleVector<Uint32Array> { + constructor(field) { + super(field, Uint32Array); + } + + get (i) { + return new Date(super.get(2*i+1)*Math.pow(2,32) + super.get(2*i)); + } +} + 
+class NullableDateVector extends DateVector { + private validityView: BitArray; + + loadBuffers(bb, node, buffers) { + this.validityView = Vector.loadValidityBuffer(bb, buffers[0]); + this.loadDataBuffer(bb, buffers[1]); + } + + get (i) { + if (this.validityView.get(i)) { + return super.get(i); + } else { + return null; + } + } + + getValidityVector() { + return this.validityView; + } +} class Utf8Vector extends SimpleVector<Uint8Array> { protected offsetView: Int32Array; static decoder: TextDecoder = new TextDecoder('utf8'); - constructor(name: string) { - super(Uint8Array, name); + constructor(field) { + super(field, Uint8Array); } - loadBuffers(recordBatch, buffer, bufReader, baseOffset) { - this.offsetView = Vector.loadOffsetBuffer(recordBatch, buffer, bufReader, baseOffset); - this.dataView = this.loadDataBuffer(recordBatch, buffer, bufReader, baseOffset); + loadBuffers(bb, node, buffers) { + this.offsetView = Vector.loadOffsetBuffer(bb, buffers[0]); + this.loadDataBuffer(bb, buffers[1]); } get(i) { - return Utf8Vector.decoder.decode - (this.dataView.slice(this.offsetView[i], this.offsetView[i + 1])); + return Utf8Vector.decoder.decode(this.dataView.slice(this.offsetView[i], this.offsetView[i + 1])); } slice(start: number, end: number) { - var rtrn: string[] = []; + var result: string[] = []; for (var i: number = start; i < end; i += 1|0) { - rtrn.push(this.get(i)); + result.push(this.get(i)); } - return rtrn; + return result; + } + + getOffsetView() { + return this.offsetView; } } class NullableUtf8Vector extends Utf8Vector { private validityView: BitArray; - loadBuffers(recordBatch, buffer, bufReader, baseOffset) { - this.validityView = Vector.loadValidityBuffer(recordBatch, buffer, bufReader, baseOffset); - this.offsetView = Vector.loadOffsetBuffer(recordBatch, buffer, bufReader, baseOffset); - this.dataView = this.loadDataBuffer(recordBatch, buffer, bufReader, baseOffset); + loadBuffers(bb, node, buffers) { + this.validityView = 
Vector.loadValidityBuffer(bb, buffers[0]); + this.offsetView = Vector.loadOffsetBuffer(bb, buffers[1]); + this.loadDataBuffer(bb, buffers[2]); } get(i) { - if (!this.validityView.get(i)) return null; - return super.get(i); + if (this.validityView.get(i)) { + return super.get(i); + } else { + return null; + } + } + + getValidityVector() { + return this.validityView; } } @@ -186,14 +301,17 @@ class NullableUtf8Vector extends Utf8Vector { class ListVector extends Uint32Vector { private dataVector: Vector; - constructor(name, dataVector : Vector) { - super(name); + constructor(field, dataVector: Vector) { + super(field); this.dataVector = dataVector; } - loadBuffers(recordBatch, buffer, bufReader, baseOffset) { - super.loadBuffers(recordBatch, buffer, bufReader, baseOffset); - this.dataVector.loadData(recordBatch, buffer, bufReader, baseOffset); + getChildVectors() { + return [this.dataVector]; + } + + loadBuffers(bb, node, buffers) { + super.loadBuffers(bb, node, buffers); this.length -= 1; } @@ -210,119 +328,262 @@ class ListVector extends Uint32Vector { return "length: " + (this.length); } - slice(start : number, end : number) { return []; }; + slice(start: number, end: number) { + var result = []; + for (var i = start; i < end; i += 1|0) { + result.push(this.get(i)); + } + return result; + } } class NullableListVector extends ListVector { private validityView: BitArray; - loadBuffers(recordBatch, buffer, bufReader, baseOffset) { - this.validityView = Vector.loadValidityBuffer(recordBatch, buffer, bufReader, baseOffset); - super.loadBuffers(recordBatch, buffer, bufReader, baseOffset); + loadBuffers(bb, node, buffers) { + this.validityView = Vector.loadValidityBuffer(bb, buffers[0]); + this.loadDataBuffer(bb, buffers[1]); + this.length -= 1; } get(i) { - if (!this.validityView.get(i)) return null; - return super.get(i); + if (this.validityView.get(i)) { + return super.get(i); + } else { + return null; + } + } + + getValidityVector() { + return this.validityView; + } 
+} + +class FixedSizeListVector extends Vector { + private size: number + private dataVector: Vector; + + constructor(field, size: number, dataVector: Vector) { + super(field); + this.size = size; + this.dataVector = dataVector; + } + + getChildVectors() { + return [this.dataVector]; + } + + loadBuffers(bb, node, buffers) { + // no buffers to load + } + + get(i: number) { + return this.dataVector.slice(i * this.size, (i + 1) * this.size); + } + + slice(start : number, end : number) { + var result = []; + for (var i = start; i < end; i += 1|0) { + result.push(this.get(i)); + } + return result; + } + + getListSize() { + return this.size; + } +} + +class NullableFixedSizeListVector extends FixedSizeListVector { + private validityView: BitArray; + + loadBuffers(bb, node, buffers) { + this.validityView = Vector.loadValidityBuffer(bb, buffers[0]); + } + + get(i: number) { + if (this.validityView.get(i)) { + return super.get(i); + } else { + return null; + } + } + + getValidityVector() { + return this.validityView; } } class StructVector extends Vector { private validityView: BitArray; - private vectors : Vector[]; - constructor(name: string, vectors: Vector[]) { - super(name); + private vectors: Vector[]; + + constructor(field, vectors: Vector[]) { + super(field); this.vectors = vectors; } - loadBuffers(recordBatch, buffer, bufReader, baseOffset) { - this.validityView = Vector.loadValidityBuffer(recordBatch, buffer, bufReader, baseOffset); - this.vectors.forEach((v: Vector) => v.loadData(recordBatch, buffer, bufReader, baseOffset)); + getChildVectors() { + return this.vectors; + } + + loadBuffers(bb, node, buffers) { + this.validityView = Vector.loadValidityBuffer(bb, buffers[0]); } get(i : number) { - if (!this.validityView.get(i)) return null; - return this.vectors.map((v: Vector) => v.get(i)); + if (this.validityView.get(i)) { + return this.vectors.map((v: Vector) => v.get(i)); + } else { + return null; + } } slice(start : number, end : number) { - var rtrn = []; - 
for (var i: number = start; i < end; i += 1|0) { - rtrn.push(this.get(i)); + var result = []; + for (var i = start; i < end; i += 1|0) { + result.push(this.get(i)); } - return rtrn; + return result; + } + + getValidityVector() { + return this.validityView; } } -class DateVector extends SimpleVector<Uint32Array> { - constructor (name: string) { - super(Uint32Array, name); +class DictionaryVector extends Vector { + + private indices: Vector; + private dictionary: Vector; + + constructor (field, indices: Vector, dictionary: Vector) { + super(field); + this.indices = indices; + this.dictionary = dictionary; } - get (i) { - return new Date(super.get(2*i+1)*Math.pow(2,32) + super.get(2*i)); + get(i) { + var encoded = this.indices.get(i); + if (encoded == null) { + return null; + } else { + return this.dictionary.get(encoded); + } } -} -class NullableDateVector extends DateVector { - private validityView: BitArray; + /** Get the dictionary encoded value */ + public getEncoded(i) { + return this.indices.get(i); + } - loadBuffers(recordBatch, buffer, bufReader, baseOffset) { - this.validityView = Vector.loadValidityBuffer(recordBatch, buffer, bufReader, baseOffset); - super.loadBuffers(recordBatch, buffer, bufReader, baseOffset); + slice(start, end) { + return this.indices.slice(start, end); // TODO decode } - get (i) { - if (!this.validityView.get(i)) return null; - return super.get(i); + getChildVectors() { + return this.indices.getChildVectors(); + } + + loadBuffers(bb, node, buffers) { + this.indices.loadData(bb, node, buffers); + } + + /** Get the index (encoded) vector */ + public getIndexVector() { + return this.indices; + } + + /** Get the dictionary vector */ + public getDictionaryVector() { + return this.dictionary; + } + + toString() { + return this.indices.toString(); } } -var BASIC_TYPES = [arrow.flatbuf.Type.Int, arrow.flatbuf.Type.FloatingPoint, arrow.flatbuf.Type.Utf8, arrow.flatbuf.Type.Date]; - -export function vectorFromField(field) : Vector { - var 
typeType = field.typeType(); - if (BASIC_TYPES.indexOf(typeType) >= 0) { - var type = field.typeType(); - if (type === arrow.flatbuf.Type.Int) { - type = field.type(new arrow.flatbuf.Int()); - var VectorConstructor : {new(string) : Vector}; - if (type.isSigned()) { - if (type.bitWidth() == 32) - VectorConstructor = field.nullable() ? NullableInt32Vector : Int32Vector; - else if (type.bitWidth() == 16) - VectorConstructor = field.nullable() ? NullableInt16Vector : Int16Vector; - else if (type.bitWidth() == 8) - VectorConstructor = field.nullable() ? NullableInt8Vector : Int8Vector; +export function vectorFromField(field, dictionaries) : Vector { + var dictionary = field.dictionary(), nullable = field.nullable(); + if (dictionary == null) { + var typeType = field.typeType(); + if (typeType === Type.List) { + var dataVector = vectorFromField(field.children(0), dictionaries); + return nullable ? new NullableListVector(field, dataVector) : new ListVector(field, dataVector); + } else if (typeType === Type.FixedSizeList) { + var dataVector = vectorFromField(field.children(0), dictionaries); + var size = field.type(new org.apache.arrow.flatbuf.FixedSizeList()).listSize(); + if (nullable) { + return new NullableFixedSizeListVector(field, size, dataVector); } else { - if (type.bitWidth() == 32) - VectorConstructor = field.nullable() ? NullableUint32Vector : Uint32Vector; - else if (type.bitWidth() == 16) - VectorConstructor = field.nullable() ? NullableUint16Vector : Uint16Vector; - else if (type.bitWidth() == 8) - VectorConstructor = field.nullable() ? 
NullableUint8Vector : Uint8Vector; + return new FixedSizeListVector(field, size, dataVector); + } + } else if (typeType === Type.Struct_) { + var vectors : Vector[] = []; + for (var i : number = 0; i < field.childrenLength(); i += 1|0) { + vectors.push(vectorFromField(field.children(i), dictionaries)); + } + return new StructVector(field, vectors); + } else { + if (typeType === Type.Int) { + var type = field.type(new org.apache.arrow.flatbuf.Int()); + return _createIntVector(field, type.bitWidth(), type.isSigned(), nullable) + } else if (typeType === Type.FloatingPoint) { + var precision = field.type(new org.apache.arrow.flatbuf.FloatingPoint()).precision(); + if (precision == org.apache.arrow.flatbuf.Precision.SINGLE) { + return nullable ? new NullableFloat32Vector(field) : new Float32Vector(field); + } else if (precision == org.apache.arrow.flatbuf.Precision.DOUBLE) { + return nullable ? new NullableFloat64Vector(field) : new Float64Vector(field); + } else { + throw "Unimplemented FloatingPoint precision " + precision; + } + } else if (typeType === Type.Utf8) { + return nullable ? new NullableUtf8Vector(field) : new Utf8Vector(field); + } else if (typeType === Type.Date) { + return nullable ? new NullableDateVector(field) : new DateVector(field); + } else { + throw "Unimplemented type " + typeType; } - } else if (type === arrow.flatbuf.Type.FloatingPoint) { - type = field.type(new arrow.flatbuf.FloatingPoint()); - if (type.precision() == arrow.flatbuf.Precision.SINGLE) - VectorConstructor = field.nullable() ? NullableFloat32Vector : Float32Vector; - else if (type.precision() == arrow.flatbuf.Precision.DOUBLE) - VectorConstructor = field.nullable() ? NullableFloat64Vector : Float64Vector; - } else if (type === arrow.flatbuf.Type.Utf8) { - VectorConstructor = field.nullable() ? NullableUtf8Vector : Utf8Vector; - } else if (type === arrow.flatbuf.Type.Date) { - VectorConstructor = field.nullable() ? 
NullableDateVector : DateVector; } + } else { + // determine arrow type - default is signed 32 bit int + var type = dictionary.indexType(), bitWidth = 32, signed = true; + if (type != null) { + bitWidth = type.bitWidth(); + signed = type.isSigned(); + } + var indices = _createIntVector(field, bitWidth, signed, nullable); + return new DictionaryVector(field, indices, dictionaries[dictionary.id().toFloat64().toString()]); + } +} - return new VectorConstructor(field.name()); - } else if (typeType === arrow.flatbuf.Type.List) { - var dataVector = vectorFromField(field.children(0)); - return field.nullable() ? new NullableListVector(field.name(), dataVector) : new ListVector(field.name(), dataVector); - } else if (typeType === arrow.flatbuf.Type.Struct_) { - var vectors : Vector[] = []; - for (var i : number = 0; i < field.childrenLength(); i += 1|0) { - vectors.push(vectorFromField(field.children(i))); +function _createIntVector(field, bitWidth, signed, nullable) { + if (bitWidth == 64) { + if (signed) { + return nullable ? new NullableInt64Vector(field) : new Int64Vector(field); + } else { + return nullable ? new NullableUint64Vector(field) : new Uint64Vector(field); + } + } else if (bitWidth == 32) { + if (signed) { + return nullable ? new NullableInt32Vector(field) : new Int32Vector(field); + } else { + return nullable ? new NullableUint32Vector(field) : new Uint32Vector(field); + } + } else if (bitWidth == 16) { + if (signed) { + return nullable ? new NullableInt16Vector(field) : new Int16Vector(field); + } else { + return nullable ? new NullableUint16Vector(field) : new Uint16Vector(field); + } + } else if (bitWidth == 8) { + if (signed) { + return nullable ? new NullableInt8Vector(field) : new Int8Vector(field); + } else { + return nullable ? 
new NullableUint8Vector(field) : new Uint8Vector(field); } - return new StructVector(field.name(), vectors); + } else { + throw "Unimplemented Int bit width " + bitWidth; } } http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/package.json ---------------------------------------------------------------------- diff --git a/js/package.json b/js/package.json index b1e583b..8687f50 100644 --- a/js/package.json +++ b/js/package.json @@ -4,16 +4,20 @@ "description": "", "main": "dist/arrow.js", "scripts": { - "postinstall": "./postinstall.sh", - "test": "echo \"Error: no test specified\" && exit 1" + "postinstall": "./flatbuffers.sh", + "build": "./flatbuffers.sh && tsc && webpack", + "test": "./node_modules/mocha/bin/mocha ./spec/arrow.js" }, "author": "", "license": "Apache-2.0", "devDependencies": { - "flatbuffers": "^1.5.0", - "text-encoding": "^0.6.4" + "chai": "^3.5.0", + "mocha": "^3.3.0", + "webpack": "^2.3.3" }, "dependencies": { + "flatbuffers": "^1.5.0", + "text-encoding": "^0.6.4", "commander": "^2.9.0" } } http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/postinstall.sh ---------------------------------------------------------------------- diff --git a/js/postinstall.sh b/js/postinstall.sh deleted file mode 100755 index 1e6622f..0000000 --- a/js/postinstall.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. See accompanying LICENSE file. - -echo "Compiling flatbuffer schemas..." 
-#flatc -o lib --js ../format/Message.fbs ../format/File.fbs -flatc -o lib --js ../format/*.fbs -cat lib/*_generated.js > lib/Arrow_generated.js http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/spec/arrow.js ---------------------------------------------------------------------- diff --git a/js/spec/arrow.js b/js/spec/arrow.js new file mode 100644 index 0000000..61a6f81 --- /dev/null +++ b/js/spec/arrow.js @@ -0,0 +1,179 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +var fs = require('fs'); +var chai = require('chai'); +var assert = chai.assert; +var path= require('path'); +var arrow = require('../dist/arrow.js'); + +test_files = [ + { + name: 'simple', + batches: 1, + fields: [ + { + "name": "foo", + "type": "Int", + "data": [[1, null, 3, 4, 5]] + }, + { + "name": "bar", + "type": "FloatingPoint", + "data": [[1.0, null, null, 4.0, 5.0]] + }, + { + "name": "baz", + "type": "Utf8", + "data": [["aa", null, null, "bbb", "cccc"]] + } + ] + }, + { + name: 'struct_example', + batches: 2, + fields: [ + { + "name": "struct_nullable", + "type": "Struct", + "data": [ + [ + null, + [null, 'MhRNxD4'], + [137773603, '3F9HBxK'], + [410361374, 'aVd88fp'], + null, + [null, '3loZrRf'], + null + ], [ + null, + [null,null], + [null,null], + null, + [null, '78SLiRw'], + null, + null, + [null, '0ilsf82'], + [null, 'LjS9MbU'], + [null, null], + ] + ] + } + ] + }, + { + name: 'dictionary', + batches: 2, + fields: [ + { + "name": "example-csv", + "type": "Struct", + "data": [ + [ + ["Hermione", 25, new Float32Array([-53.235599517822266, 40.231998443603516])], + ["Severus", 30, new Float32Array([-62.22999954223633, 3])], + ], [ + ["Harry", 20, new Float32Array([23, -100.23652648925781])] + ] + ] + } + ] + }, +]; + +var buf; + +function makeSchemaChecks(fields) { + describe('schema', function () { + var schema; + beforeEach(function () { + schema = arrow.getSchema(buf); + }); + + it('should read the number of fields', function () { + assert.lengthOf(schema, fields.length); + }); + + it("should understand fields", function () { + for (i = 0; i < fields.length; i += 1|0) { + assert.equal(schema[i].name, fields[i].name); + assert.equal(schema[i].type, fields[i].type, + 'bad type for field ' + schema[i].name); + } + }); + }); +} + +function makeDataChecks (batches, fields) { + describe('data', function() { + var reader; + beforeEach(function () { + reader = arrow.getReader(buf) + }); + it('should read the correct number of record batches', function () { 
+ assert.equal(reader.getBatchCount(), batches); + }); + fields.forEach(function (field, i) { + it('should read ' + field.type + ' vector ' + field.name, function () { + for (var batch_idx = 0; batch_idx < batches; batch_idx += 1|0) { + reader.loadNextBatch(); + var batch = field.data[batch_idx]; + var vector = reader.getVector(field.name) + assert.isDefined(vector, "vector " + field.name); + assert.lengthOf(vector, batch.length, "vector " + field.name) + for (i = 0; i < vector.length; i += 1|0) { + if (field.type == "Date") { + assert.equal(vector.get(i).getTime(), batch[i].getTime(), + "vector " + field.name + " index " + i); + } else { + assert.deepEqual(vector.get(i), batch[i], + "vector " + field.name + " index " + i); + } + } + } + }); + }); + }); +} + +describe('arrow random-access file', function () { + test_files.forEach(function (test_file) { + describe(test_file.name, function () { + var fields = test_file.fields + beforeEach(function () { + buf = fs.readFileSync(path.resolve(__dirname, test_file.name + '.arrow')); + }); + + makeSchemaChecks(fields); + makeDataChecks(test_file.batches, fields); + }) + }); +}); + +describe('arrow streaming file format', function () { + test_files.forEach(function (test_file) { + describe(test_file.name, function () { + var fields = test_file.fields + beforeEach(function () { + buf = fs.readFileSync(path.resolve(__dirname, test_file.name + '-stream.arrow')); + }); + + makeSchemaChecks(fields); + makeDataChecks(test_file.batches, fields); + }) + }); +}); http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/spec/dictionary-stream.arrow ---------------------------------------------------------------------- diff --git a/js/spec/dictionary-stream.arrow b/js/spec/dictionary-stream.arrow new file mode 100644 index 0000000..17ca48b Binary files /dev/null and b/js/spec/dictionary-stream.arrow differ http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/spec/dictionary.arrow 
---------------------------------------------------------------------- diff --git a/js/spec/dictionary.arrow b/js/spec/dictionary.arrow new file mode 100644 index 0000000..34d41db Binary files /dev/null and b/js/spec/dictionary.arrow differ http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/spec/simple-stream.arrow ---------------------------------------------------------------------- diff --git a/js/spec/simple-stream.arrow b/js/spec/simple-stream.arrow new file mode 100644 index 0000000..2c68c0e Binary files /dev/null and b/js/spec/simple-stream.arrow differ http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/spec/simple.arrow ---------------------------------------------------------------------- diff --git a/js/spec/simple.arrow b/js/spec/simple.arrow new file mode 100644 index 0000000..838db6d Binary files /dev/null and b/js/spec/simple.arrow differ http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/spec/struct_example-stream.arrow ---------------------------------------------------------------------- diff --git a/js/spec/struct_example-stream.arrow b/js/spec/struct_example-stream.arrow new file mode 100644 index 0000000..4e97b70 Binary files /dev/null and b/js/spec/struct_example-stream.arrow differ http://git-wip-us.apache.org/repos/asf/arrow/blob/22c738cc/js/spec/struct_example.arrow ---------------------------------------------------------------------- diff --git a/js/spec/struct_example.arrow b/js/spec/struct_example.arrow new file mode 100644 index 0000000..3d2c018 Binary files /dev/null and b/js/spec/struct_example.arrow differ