domoritz commented on PR #35780:
URL: https://github.com/apache/arrow/pull/35780#issuecomment-1847413986

   @kylebarron @bmschmidt @FrNecas This pull request is ready for review. I 
will look into actually supporting large value buffers in a follow-up pull 
request. For now, people can chunk vectors as we discussed, and at least we have 
support for the new type. @trxcllnt is currently out, so it would be great if 
you could review this pull request so I can merge it. I will work on 
LargeBinary in a follow-up pull request, but it will be easier to have this 
merged first. 
   
   <details>
   <summary>
   Test scripts I played with to debug this code (just to have it saved 
somewhere).
   </summary>
   ```ts
   import { constants } from 'node:buffer';
   import { vectorFromArray, LargeUtf8, Utf8, makeBuilder, Table, tableToIPC, 
RecordBatchJSONWriter, tableFromIPC, Vector, Data, Int32, makeTable } from 
'./src/Arrow.dom.ts';
   
   // Log the Node buffer/string size limits alongside Number.MAX_SAFE_INTEGER
   // and 2**32 for comparison. Each template literal stays on a single line so
   // the label and its value are printed together, without an embedded newline.
   console.log(`buffer.constants.MAX_LENGTH: ${constants.MAX_LENGTH.toLocaleString()}`)
   console.log(`buffer.constants.MAX_STRING_LENGTH: ${constants.MAX_STRING_LENGTH.toLocaleString()}`)
   console.log(`Number.MAX_SAFE_INTEGER: ${Number.MAX_SAFE_INTEGER.toLocaleString()}`)
   console.log(`2**32: ${(2 ** 32).toLocaleString()}`)
   
   // const roundLengthUpToNearest64Bytes_1 = (len: number, BPE: number) => ((((len * BPE) + 63) & ~63) || 64) / BPE;
   // function roundLengthUpToNearest64Bytes_2(len: number, BPE: number) {
   //     const bytesMinus1 = len * BPE - 1;
   //     return ((bytesMinus1 - bytesMinus1 % 64 + 64) || 64) / BPE;
   // }
   
   // const roundLengthUpToNearest64Bytes = (len: number, BPE: number) => (round64(len * BPE) || 64) / BPE;
   // const roundLengthUpToNearest64Bytes = (len: number, BPE: number) => len + 64 - 1 - (len + 63) % 64;
   
   // const round64_1 = (num: number) => Math.ceil(num / 64) * 64
   // const round64_2 = (num: number) => ((num) + 63) & ~63
   // const round64_3 = (num: number) => num + 63 - (num + 63) % 64;
   // const round64_4 = (num: number) => num - 1 - (num - 1) % 64 + 64;
   // const round64_5 = (num: number) => ((num + 63) >> 6) << 6;
   
   // {
   //     for (const f of [round64_1, round64_2, round64_3, round64_4, round64_5]) {
   //         console.log(f)
   //         console.log(f(-1))
   //         console.log(f(0))
   //         console.log(f(1))
   //         console.log(f(2))
   //         console.log(f(63))
   //         console.log(f(64))
   //         console.log(f(65))
   //         console.log(f(2 ** 16))
   //         console.log(f(2 ** 32))
   //         console.log(f(2 ** 42))
   //         console.log(f(Number.MAX_SAFE_INTEGER / 100))
   //         console.log(f(Number.MAX_SAFE_INTEGER))
   //         console.log(f(Number.MAX_SAFE_INTEGER * 10))
   //     }
   // }
   
   // {
   //     const compare = (v, b) => roundLengthUpToNearest64Bytes_1(v, b) === roundLengthUpToNearest64Bytes_2(v, b);
   
   //     for (const b of [1, 2]) {
   //         console.log(`b: ${b}`);
   //         console.log(compare(-1, b));
   //         console.log(compare(0, b));
   //         console.log(compare(1, b));
   //         console.log(compare(2, b));
   //         console.log(compare(63, b));
   //         console.log(compare(64, b));
   //         console.log(compare(65, b));
   //         console.log(compare(128, b));
   //         console.log(compare(672396, b));
   //         console.log(compare(2 ** 20, b));
   //         // console.log(compare(Number.MAX_SAFE_INTEGER / 100, b));
   //     }
   // }
   
   // {
   //     for (const f of [roundLengthUpToNearest64Bytes_1, roundLengthUpToNearest64Bytes_2]) {
   //         console.log(f)
   
   //         console.log("1 byte");
   //         console.log(f(0, 1));
   //         console.log(f(1, 1));
   //         console.log(f(2, 1));
   //         console.log(f(63, 1));
   //         console.log(f(64, 1));
   //         console.log(f(65, 1));
   //         console.log(f(2 ** 20, 1));
   //         console.log(f(Number.MAX_SAFE_INTEGER / 100, 1));
   
   //         console.log("2 bytes");
   //         console.log(f(0, 2));
   //         console.log(f(1, 2));
   //         console.log(f(2, 2));
   //         console.log(f(63, 2));
   //         console.log(f(64, 2));
   //         console.log(f(65, 2));
   //         console.log(f(2 ** 20, 2));
   //         console.log(f(Number.MAX_SAFE_INTEGER / 100, 2));
   //     }
   // }
   
   // {
   //     const utf8Vector = vectorFromArray(["foo", "bar", "baz"], new Utf8);
   //     const largeUtf8Vector = vectorFromArray(["foo", "bar", "baz"], new LargeUtf8);
   
   //     console.log(largeUtf8Vector);
   
   //     console.log(largeUtf8Vector.toArray());
   //     console.log(utf8Vector.toArray());
   
   //     const table = new Table({ utf8Vector, largeUtf8Vector });
   //     const writer = RecordBatchJSONWriter.writeAll(table);
   //     const string = await writer.toString();
   
   //     // JSON serialization
   //     // console.log(string);
   
   //     const table2 = tableFromIPC(JSON.parse(string))
   //     console.log(table2.toString())
   
   //     const table3 = tableFromIPC(tableToIPC(table))
   //     console.log(table3.toString())
   // }
   
   // {
   //     // try putting a lot of strings in a map (works in node but not bun)
   //     const N = 1e4;
   
   //     const map = new Map();
   //     for (let i = 0; i < N; i++) {
   //         const longString = "a".repeat(constants.MAX_STRING_LENGTH);
   //         map.set(i, longString);
   //     }
   
   //     console.log(`Total characters in map: ${(map.size * constants.MAX_STRING_LENGTH).toLocaleString()}`);
   // }
   
   // {
   //     const builder = makeBuilder({ type: new LargeUtf8 });
   
   //     const string = "hello world";
   
   //     builder.append(string);
   
   //     console.log(builder.finish().toVector());
   // }
   
   
   {
       // Two Int32 columns whose chunks have different lengths.
       // Every inner array below is appended value by value and then flushed,
       // so each inner array corresponds to one flush() result per column.
   
       const firstBuilder = makeBuilder({ type: new Int32 });
       const firstChunks: Data[] = [];
       for (const values of [[1, 2], [3]]) {
           for (const value of values) {
               firstBuilder.append(value);
           }
           firstChunks.push(firstBuilder.flush());
       }
   
       const secondBuilder = makeBuilder({ type: new Int32 });
       const secondChunks: Data[] = [];
       for (const values of [[4, 5], [6, 7, 8]]) {
           for (const value of values) {
               secondBuilder.append(value);
           }
           secondChunks.push(secondBuilder.flush());
       }
   
       // Column names stay `v1` and `v2`, matching the original shorthand keys.
       const table = new Table({
           v1: new Vector(firstChunks),
           v2: new Vector(secondChunks),
       });
   
       console.log(table.batches.map((batch) => batch.toArray()));
   
       // console.log(table.toArray());
       // console.log(table.data);
   }
   
   // {
   //     const longString = "a".repeat(constants.MAX_STRING_LENGTH);
   
   //     const builder = makeBuilder({ type: new LargeUtf8 });
   
   //     const data = [] as Data[];
   //     // const vectors = [] as Vector<any>[];
   
   //     builder.append(longString);
   //     builder.append(longString);
   //     builder.append(longString);
   //     // vectors.push(builder.toVector());
   //     data.push(builder.flush());
   
   //     builder.append(longString);
   //     builder.append(longString);
   //     builder.append(longString);
   //     // vectors.push(builder.toVector());
   //     data.push(builder.flush());
   
   //     console.log(new Vector(data));
   // }
   
   {
       // make the longest possible string (longer will crash with
       // `RangeError: Invalid string length`)
       const longString = "a".repeat(constants.MAX_STRING_LENGTH);
   
       const builder = makeBuilder({ type: new LargeUtf8 });
   
       // length of vector
       const N = 8;
   
       // Combined length of the N long strings appended below; logged both as a
       // plain number and as the smallest power-of-two exponent that bounds it.
       const totalLength = constants.MAX_STRING_LENGTH * N;
   
       // Kept on one line so the message is printed without embedded newlines.
       console.log(`Building vector with total string length (potential need for offsets): ${totalLength.toLocaleString()} or <= 2^${Math.ceil(Math.log2(totalLength))}`);
   
       for (let i = 0; i < N; i++) {
           builder.append(longString);
       }
       // add string to force another offset
       builder.append("");
       const vector = builder.finish().toVector();
   
       console.log(vector.length);
       console.log(vector.byteLength.toLocaleString());
   
       // const table = new Table({ strings: vector });
       // console.log(table);
   
       // const ipc = tableToIPC(table);
       // console.log(ipc);
   
   
       // console.log(vector.toArray());
   }
   
   // {
   //     const builder = makeBuilder({ type: new LargeUtf8 });
   
   //     const string = "a".repeat(1e5);
   //     const N = 1e7;
   
   //     for (let i = 0; i < N; i++) {
   //         builder.append(string);
   //     }
   //     const vector = builder.toVector();
   
   //     console.log(vector);
   // }
   ```
   </details>


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to