[ https://issues.apache.org/jira/browse/ARROW-1693?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16250159#comment-16250159 ]
ASF GitHub Bot commented on ARROW-1693: --------------------------------------- trxcllnt commented on a change in pull request #1294: ARROW-1693: [JS] Fix reading C++ dictionary-encoded vectors URL: https://github.com/apache/arrow/pull/1294#discussion_r150652141 ########## File path: js/src/reader/arrow.ts ########## @@ -15,64 +15,135 @@ // specific language governing permissions and limitations // under the License. +import { Vector } from '../vector/vector'; import { flatbuffers } from 'flatbuffers'; +import { readVector, readValueVector } from './vector'; +import { + readFileFooter, readFileMessages, + readStreamSchema, readStreamMessages +} from './format'; + +import * as File_ from '../format/File_generated'; import * as Schema_ from '../format/Schema_generated'; import * as Message_ from '../format/Message_generated'; -export import Schema = Schema_.org.apache.arrow.flatbuf.Schema; -export import RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch; - -import { readFile } from './file'; -import { readStream } from './stream'; -import { readVector } from './vector'; -import { readDictionary } from './dictionary'; -import { Vector, Column } from '../types/types'; import ByteBuffer = flatbuffers.ByteBuffer; +import Footer = File_.org.apache.arrow.flatbuf.Footer; import Field = Schema_.org.apache.arrow.flatbuf.Field; -export type Dictionaries = { [k: string]: Vector<any> } | null; -export type IteratorState = { nodeIndex: number; bufferIndex: number }; - -export function* readRecords(...bytes: ByteBuffer[]) { - try { - yield* readFile(...bytes); - } catch (e) { - try { - yield* readStream(...bytes); - } catch (e) { - throw new Error('Invalid Arrow buffer'); - } +import Schema = Schema_.org.apache.arrow.flatbuf.Schema; +import Message = Message_.org.apache.arrow.flatbuf.Message; +import RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch; +import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; +import DictionaryBatch = 
Message_.org.apache.arrow.flatbuf.DictionaryBatch; +import DictionaryEncoding = Schema_.org.apache.arrow.flatbuf.DictionaryEncoding; + +export type ArrowReaderContext = { + schema?: Schema; + footer?: Footer | null; + dictionaries: Map<string, Vector>; + dictionaryEncodedFields: Map<string, Field>; + readMessages: (bb: ByteBuffer, footer: Footer) => Iterable<Message>; +}; + +export type VectorReaderContext = { + node: number; + buffer: number; + offset: number; + bytes: Uint8Array; + batch: RecordBatch; + dictionaries: Map<string, Vector>; +}; + +export function* readVectors(buffers: Iterable<Uint8Array | Buffer | string>, context?: ArrowReaderContext) { + const context_ = context || {} as ArrowReaderContext; + for (const buffer of buffers) { + yield* readBuffer(toByteBuffer(buffer), context_); } } -export function* readBuffers(...bytes: Array<Uint8Array | Buffer | string>) { - const dictionaries: Dictionaries = {}; - const byteBuffers = bytes.map(toByteBuffer); - for (let { schema, batch } of readRecords(...byteBuffers)) { - let vectors: Column<any>[] = []; - let state = { nodeIndex: 0, bufferIndex: 0 }; - let fieldsLength = schema.fieldsLength(); - let index = -1, field: Field, vector: Vector<any>; - if (batch.id) { - // A dictionary batch only contain a single vector. Traverse each - // field and its children until we find one that uses this dictionary - while (++index < fieldsLength) { - if (field = schema.fields(index)!) { - if (vector = readDictionary<any>(field, batch, state, dictionaries)!) 
{ - dictionaries[batch.id] = dictionaries[batch.id] && dictionaries[batch.id].concat(vector) || vector; - break; - } +export async function* readVectorsAsync(buffers: AsyncIterable<Uint8Array | Buffer | string>, context?: ArrowReaderContext) { + const context_ = context || {} as ArrowReaderContext; + for await (const buffer of buffers) { + yield* readBuffer(toByteBuffer(buffer), context_); + } +} + +function* readBuffer(bb: ByteBuffer, readerContext: ArrowReaderContext) { Review comment: @wesm anything type-related (type annotations, interfaces, generics, and declarations like `type = { foo: string }`) is TypeScript, the rest is ES. If you're curious about an individual feature, you can run the build (`npm run build`) and compare the transpiled output (in the `targets` directory) with the TS source. We transpile to multiple JS versions and module formats, but it's probably easiest to compare against the `targets/es2015/esm` or `targets/esnext/esm`. TS code-gens/polyfills missing features depending on the target environment. For example, ES5 doesn't have generators, so TS code-gens an iterator state machine into generator functions, but the ES2015 target leaves them as-is. And ES2015 doesn't have async iterator functions, so TS code-gens the async iterator state machine with Promises. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JS] Error reading dictionary-encoded integration test files > ------------------------------------------------------------ > > Key: ARROW-1693 > URL: https://issues.apache.org/jira/browse/ARROW-1693 > Project: Apache Arrow > Issue Type: Bug > Components: JavaScript > Reporter: Brian Hulette > Assignee: Brian Hulette > Labels: pull-request-available > Fix For: 0.8.0 > > Attachments: dictionary-cpp.arrow, dictionary-java.arrow, > dictionary.json > > > The JS implementation crashes when reading the dictionary test case from the > integration tests. > To replicate, first generate the test files with java and cpp impls: > {code} > $ cd ${ARROW_HOME}/integration/ > $ python -c 'from integration_test import generate_dictionary_case; > generate_dictionary_case().write("dictionary.json")' > $ ../cpp/debug/debug/json-integration-test --integration > --json=dictionary.json --arrow=dictionary-cpp.arrow --mode=JSON_TO_ARROW > $ java -cp > ../java/tools/target/arrow-tools-0.8.0-SNAPSHOT-jar-with-dependencies.jar > org.apache.arrow.tools.Integration -c JSON_TO_ARROW -a dictionary-java.arrow > -j dictionary.json > {code} > Attempt to read the files with the JS impl: > {code} > $ cd ${ARROW_HOME}/js/ > $ ./bin/arrow2csv.js -s dict1_0 -f ../integration/dictionary-{java,cpp}.arrow > {code} > Both files result in an error for me on > [a8f51858|https://github.com/apache/arrow/commit/a8f518588fda471b2e3cc8e0f0064e7c4bb99899]: > {{TypeError: Cannot read property 'buffer' of undefined}} -- This message was sent by Atlassian JIRA (v6.4.14#64029)