[ 
https://issues.apache.org/jira/browse/ARROW-1693?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16249119#comment-16249119
 ] 

ASF GitHub Bot commented on ARROW-1693:
---------------------------------------

wesm commented on a change in pull request #1294: ARROW-1693: [JS] Fix reading 
C++ dictionary-encoded vectors
URL: https://github.com/apache/arrow/pull/1294#discussion_r150448976
 
 

 ##########
 File path: js/src/reader/arrow.ts
 ##########
 @@ -15,64 +15,135 @@
 // specific language governing permissions and limitations
 // under the License.
 
+import { Vector } from '../vector/vector';
 import { flatbuffers } from 'flatbuffers';
+import { readVector, readValueVector } from './vector';
+import {
+    readFileFooter, readFileMessages,
+    readStreamSchema, readStreamMessages
+} from './format';
+
+import * as File_ from '../format/File_generated';
 import * as Schema_ from '../format/Schema_generated';
 import * as Message_ from '../format/Message_generated';
-export import Schema = Schema_.org.apache.arrow.flatbuf.Schema;
-export import RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch;
-
-import { readFile } from './file';
-import { readStream } from './stream';
-import { readVector } from './vector';
-import { readDictionary } from './dictionary';
-import { Vector, Column } from '../types/types';
 
 import ByteBuffer = flatbuffers.ByteBuffer;
+import Footer = File_.org.apache.arrow.flatbuf.Footer;
 import Field = Schema_.org.apache.arrow.flatbuf.Field;
-export type Dictionaries = { [k: string]: Vector<any> } | null;
-export type IteratorState = { nodeIndex: number; bufferIndex: number };
-
-export function* readRecords(...bytes: ByteBuffer[]) {
-    try {
-        yield* readFile(...bytes);
-    } catch (e) {
-        try {
-            yield* readStream(...bytes);
-        } catch (e) {
-            throw new Error('Invalid Arrow buffer');
-        }
+import Schema = Schema_.org.apache.arrow.flatbuf.Schema;
+import Message = Message_.org.apache.arrow.flatbuf.Message;
+import RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch;
+import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader;
+import DictionaryBatch = Message_.org.apache.arrow.flatbuf.DictionaryBatch;
+import DictionaryEncoding = 
Schema_.org.apache.arrow.flatbuf.DictionaryEncoding;
+
+export type ArrowReaderContext = {
+    schema?: Schema;
+    footer?: Footer | null;
+    dictionaries: Map<string, Vector>;
+    dictionaryEncodedFields: Map<string, Field>;
+    readMessages: (bb: ByteBuffer, footer: Footer) => Iterable<Message>;
+};
+
+export type VectorReaderContext = {
+    node: number;
+    buffer: number;
+    offset: number;
+    bytes: Uint8Array;
+    batch: RecordBatch;
+    dictionaries: Map<string, Vector>;
+};
+
+export function* readVectors(buffers: Iterable<Uint8Array | Buffer | string>, 
context?: ArrowReaderContext) {
+    const context_ = context || {} as ArrowReaderContext;
+    for (const buffer of buffers) {
+        yield* readBuffer(toByteBuffer(buffer), context_);
     }
 }
 
-export function* readBuffers(...bytes: Array<Uint8Array | Buffer | string>) {
-    const dictionaries: Dictionaries = {};
-    const byteBuffers = bytes.map(toByteBuffer);
-    for (let { schema, batch } of readRecords(...byteBuffers)) {
-        let vectors: Column<any>[] = [];
-        let state = { nodeIndex: 0, bufferIndex: 0 };
-        let fieldsLength = schema.fieldsLength();
-        let index = -1, field: Field, vector: Vector<any>;
-        if (batch.id) {
-            // A dictionary batch only contain a single vector. Traverse each
-            // field and its children until we find one that uses this 
dictionary
-            while (++index < fieldsLength) {
-                if (field = schema.fields(index)!) {
-                    if (vector = readDictionary<any>(field, batch, state, 
dictionaries)!) {
-                        dictionaries[batch.id] = dictionaries[batch.id] && 
dictionaries[batch.id].concat(vector) || vector;
-                        break;
-                    }
+export async function* readVectorsAsync(buffers: AsyncIterable<Uint8Array | 
Buffer | string>, context?: ArrowReaderContext) {
+    const context_ = context || {} as ArrowReaderContext;
+    for await (const buffer of buffers) {
+        yield* readBuffer(toByteBuffer(buffer), context_);
+    }
+}
+
+function* readBuffer(bb: ByteBuffer, readerContext: ArrowReaderContext) {
 
 Review comment:
   What do you recommend as a resource for getting up to speed on TypeScript? 
Where is the line between TypeScript and ES6? 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> [JS] Error reading dictionary-encoded integration test files
> ------------------------------------------------------------
>
>                 Key: ARROW-1693
>                 URL: https://issues.apache.org/jira/browse/ARROW-1693
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: JavaScript
>            Reporter: Brian Hulette
>            Assignee: Brian Hulette
>              Labels: pull-request-available
>             Fix For: 0.8.0
>
>         Attachments: dictionary-cpp.arrow, dictionary-java.arrow, 
> dictionary.json
>
>
> The JS implementation crashes when reading the dictionary test case from the 
> integration tests.
> To replicate, first generate the test files with java and cpp impls:
> {code}
> $ cd ${ARROW_HOME}/integration/
> $ python -c 'from integration_test import generate_dictionary_case; 
> generate_dictionary_case().write("dictionary.json")'
> $ ../cpp/debug/debug/json-integration-test --integration 
> --json=dictionary.json --arrow=dictionary-cpp.arrow --mode=JSON_TO_ARROW
> $ java -cp 
> ../java/tools/target/arrow-tools-0.8.0-SNAPSHOT-jar-with-dependencies.jar 
> org.apache.arrow.tools.Integration -c JSON_TO_ARROW -a dictionary-java.arrow 
> -j dictionary.json
> {code}
> Attempt to read the files with the JS impl:
> {code}
> $ cd ${ARROW_HOME}/js/
> $ ./bin/arrow2csv.js -s dict1_0 -f ../integration/dictionary-{java,cpp}.arrow
> {code}
> Both files result in an error for me on 
> [a8f51858|https://github.com/apache/arrow/commit/a8f518588fda471b2e3cc8e0f0064e7c4bb99899]:
> {{TypeError: Cannot read property 'buffer' of undefined}}



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to