domoritz commented on PR #35780:
URL: https://github.com/apache/arrow/pull/35780#issuecomment-1847413986
@kylebarron @bmschmidt @FrNecas This pull request is ready for review. I
will look into actually supporting large value buffers in a follow-up pull
request. For now, people can chunk vectors as we discussed, and at least we have
support for the new type. @trxcllnt is currently out, so it would be great if
you could review this pull request so I can merge it. I will work on
LargeBinary in a follow-up pull request, but it will be easier to have this
merged first.
<details>
<summary>
Test scripts I played with to debug this code (just to have it saved
somewhere).
</summary>
```ts
import { constants } from 'node:buffer';
import { vectorFromArray, LargeUtf8, Utf8, makeBuilder, Table, tableToIPC,
RecordBatchJSONWriter, tableFromIPC, Vector, Data, Int32, makeTable } from
'./src/Arrow.dom.ts';
// Print the runtime limits that bound how large buffers, strings and
// offsets can get in this environment. Each message is kept on a single
// line so the label and its value are printed together.
console.log(`buffer.constants.MAX_LENGTH: ${constants.MAX_LENGTH.toLocaleString()}`);
console.log(`buffer.constants.MAX_STRING_LENGTH: ${constants.MAX_STRING_LENGTH.toLocaleString()}`);
console.log(`Number.MAX_SAFE_INTEGER: ${Number.MAX_SAFE_INTEGER.toLocaleString()}`);
console.log(`2**32: ${(2 ** 32).toLocaleString()}`);
// const roundLengthUpToNearest64Bytes_1 = (len: number, BPE: number) => ((((len * BPE) + 63) & ~63) || 64) / BPE;
// function roundLengthUpToNearest64Bytes_2(len: number, BPE: number) {
// const bytesMinus1 = len * BPE - 1;
// return ((bytesMinus1 - bytesMinus1 % 64 + 64) || 64) / BPE;
// }
// const roundLengthUpToNearest64Bytes = (len: number, BPE: number) => (round64(len * BPE) || 64) / BPE;
// const roundLengthUpToNearest64Bytes = (len: number, BPE: number) => len + 64 - 1 - (len + 63) % 64;
// const round64_1 = (num: number) => Math.ceil(num / 64) * 64
// const round64_2 = (num: number) => ((num) + 63) & ~63
// const round64_3 = (num: number) => num + 63 - (num + 63) % 64;
// const round64_4 = (num: number) => num - 1 - (num - 1) % 64 + 64;
// const round64_5 = (num: number) => ((num + 63) >> 6) << 6;
// {
// for (const f of [round64_1, round64_2, round64_3, round64_4, round64_5]) {
// console.log(f)
// console.log(f(-1))
// console.log(f(0))
// console.log(f(1))
// console.log(f(2))
// console.log(f(63))
// console.log(f(64))
// console.log(f(65))
// console.log(f(2 ** 16))
// console.log(f(2 ** 32))
// console.log(f(2 ** 42))
// console.log(f(Number.MAX_SAFE_INTEGER / 100))
// console.log(f(Number.MAX_SAFE_INTEGER))
// console.log(f(Number.MAX_SAFE_INTEGER * 10))
// }
// }
// {
// const compare = (v, b) => roundLengthUpToNearest64Bytes_1(v, b) === roundLengthUpToNearest64Bytes_2(v, b);
// for (const b of [1, 2]) {
// console.log(`b: ${b}`);
// console.log(compare(-1, b));
// console.log(compare(0, b));
// console.log(compare(1, b));
// console.log(compare(2, b));
// console.log(compare(63, b));
// console.log(compare(64, b));
// console.log(compare(65, b));
// console.log(compare(128, b));
// console.log(compare(672396, b));
// console.log(compare(2 ** 20, b));
// // console.log(compare(Number.MAX_SAFE_INTEGER / 100, b));
// }
// }
// {
// for (const f of [roundLengthUpToNearest64Bytes_1, roundLengthUpToNearest64Bytes_2]) {
// console.log(f)
// console.log("1 byte");
// console.log(f(0, 1));
// console.log(f(1, 1));
// console.log(f(2, 1));
// console.log(f(63, 1));
// console.log(f(64, 1));
// console.log(f(65, 1));
// console.log(f(2 ** 20, 1));
// console.log(f(Number.MAX_SAFE_INTEGER / 100, 1));
// console.log("2 bytes");
// console.log(f(0, 2));
// console.log(f(1, 2));
// console.log(f(2, 2));
// console.log(f(63, 2));
// console.log(f(64, 2));
// console.log(f(65, 2));
// console.log(f(2 ** 20, 2));
// console.log(f(Number.MAX_SAFE_INTEGER / 100, 2));
// }
// }
// {
// const utf8Vector = vectorFromArray(["foo", "bar", "baz"], new Utf8);
// const largeUtf8Vector = vectorFromArray(["foo", "bar", "baz"], new LargeUtf8);
// console.log(largeUtf8Vector);
// console.log(largeUtf8Vector.toArray());
// console.log(utf8Vector.toArray());
// const table = new Table({ utf8Vector, largeUtf8Vector });
// const writer = RecordBatchJSONWriter.writeAll(table);
// const string = await writer.toString();
// // JSON serialization
// // console.log(string);
// const table2 = tableFromIPC(JSON.parse(string))
// console.log(table2.toString())
// const table3 = tableFromIPC(tableToIPC(table))
// console.log(table3.toString())
// }
// {
// // try putting a lot of strings in a map (works in node but not bun)
// const N = 1e4;
// const map = new Map();
// for (let i = 0; i < N; i++) {
// const longString = "a".repeat(constants.MAX_STRING_LENGTH);
// map.set(i, longString);
// }
// console.log(`Total characters in map: ${(map.size * constants.MAX_STRING_LENGTH).toLocaleString()}`);
// }
// {
// const builder = makeBuilder({ type: new LargeUtf8 });
// const string = "hello world";
// builder.append(string);
// console.log(builder.finish().toVector());
// }
{
    // Build a table from two Int32 columns whose chunks have different
    // lengths: v1 is flushed after [1, 2] and [3], v2 after [4, 5] and
    // [6, 7, 8]. Each builder is independent, so the columns are built one
    // after the other.
    const firstBuilder = makeBuilder({ type: new Int32 });
    const firstChunks: Data[] = [];
    for (const chunk of [[1, 2], [3]]) {
        for (const value of chunk) {
            firstBuilder.append(value);
        }
        // flush() hands back the values appended since the previous flush
        firstChunks.push(firstBuilder.flush());
    }

    const secondBuilder = makeBuilder({ type: new Int32 });
    const secondChunks: Data[] = [];
    for (const chunk of [[4, 5], [6, 7, 8]]) {
        for (const value of chunk) {
            secondBuilder.append(value);
        }
        secondChunks.push(secondBuilder.flush());
    }

    // Column names stay `v1` and `v2`, as in the shorthand object literal.
    const table = new Table({
        v1: new Vector(firstChunks),
        v2: new Vector(secondChunks),
    });
    console.log(table.batches.map((batch) => batch.toArray()));
    // console.log(table.toArray());
    // console.log(table.data);
}
// {
// const longString = "a".repeat(constants.MAX_STRING_LENGTH);
// const builder = makeBuilder({ type: new LargeUtf8 });
// const data = [] as Data[];
// // const vectors = [] as Vector<any>[];
// builder.append(longString);
// builder.append(longString);
// builder.append(longString);
// // vectors.push(builder.toVector());
// data.push(builder.flush());
// builder.append(longString);
// builder.append(longString);
// builder.append(longString);
// // vectors.push(builder.toVector());
// data.push(builder.flush());
// console.log(new Vector(data));
// }
{
    // Stress a LargeUtf8 builder with N copies of the longest string the
    // runtime allows, then report the resulting vector's length and byte size.

    // make the longest possible string (longer will crash with `RangeError: Invalid string length`)
    const longString = "a".repeat(constants.MAX_STRING_LENGTH);
    const builder = makeBuilder({ type: new LargeUtf8 });

    // length of vector
    const N = 8;

    // Total character count across all appended strings, and the smallest
    // power of two that bounds it.
    console.log(`Building vector with total string length (potential need for offsets): ${(constants.MAX_STRING_LENGTH * N).toLocaleString()} or <= 2^${Math.ceil(Math.log2(constants.MAX_STRING_LENGTH * N))}`);

    for (let i = 0; i < N; i++) {
        builder.append(longString);
    }

    // add string to force another offset
    builder.append("");

    const vector = builder.finish().toVector();
    console.log(vector.length);
    console.log(vector.byteLength.toLocaleString());

    // const table = new Table({ strings: vector });
    // console.log(table);
    // const ipc = tableToIPC(table);
    // console.log(ipc);
    // console.log(vector.toArray());
}
// {
// const builder = makeBuilder({ type: new LargeUtf8 });
// const string = "a".repeat(1e5);
// const N = 1e7;
// for (let i = 0; i < N; i++) {
// builder.append(string);
// }
// const vector = builder.toVector();
// console.log(vector);
// }
```
</details>
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]