This is an automated email from the ASF dual-hosted git repository. kszucs pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 42aa782fb334ee907b99962ac81c53488e35fa41 Author: Taylor Baldwin <tbald...@uber.com> AuthorDate: Thu Oct 3 10:34:12 2019 +0200 ARROW-6767: [JS] Lazily bind batches in scan/scanReverse I noticed some `TODO` comments in the JS client library that expressed interest in calling `bind(batch)` lazily. This PR implements that optimization and updates related tests. Happy to make updates per feedback! Created a JIRA issue [here](https://issues.apache.org/jira/browse/ARROW-6767). Closes #5565 from rolyatmax/tb/lazily-bind-batches and squashes the following commits: 8877a8f8f <Taylor Baldwin> ARROW-6767: lazily bind batches in scan/scanReverse Authored-by: Taylor Baldwin <tbald...@uber.com> Signed-off-by: Krisztián Szűcs <szucs.kriszt...@gmail.com> --- js/src/compute/dataframe.ts | 30 ++++++++++++++++++++---------- js/test/unit/table-tests.ts | 28 ++++++++++++++-------------- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/js/src/compute/dataframe.ts b/js/src/compute/dataframe.ts index e82e65c..db7f3d7 100644 --- a/js/src/compute/dataframe.ts +++ b/js/src/compute/dataframe.ts @@ -132,14 +132,19 @@ export class FilteredDataFrame<T extends { [key: string]: DataType } = any> exte for (let batchIndex = -1; ++batchIndex < numBatches;) { // load batches const batch = batches[batchIndex]; - // TODO: bind batches lazily - // If predicate doesn't match anything in the batch we don't need - // to bind the callback - if (bind) { bind(batch); } const predicate = this._predicate.bind(batch); + let isBound = false; // yield all indices for (let index = -1, numRows = batch.length; ++index < numRows;) { - if (predicate(index, batch)) { next(index, batch); } + if (predicate(index, batch)) { + // bind batches lazily - if predicate doesn't match anything + // in the batch we don't need to call bind on the batch + if (bind && !isBound) { + bind(batch); + isBound = true; + } + next(index, batch); + } } } } @@ -149,14 +154,19 @@ export class FilteredDataFrame<T extends { [key: string]: DataType } = any> exte for (let batchIndex = numBatches; --batchIndex >= 0;) { // load batches const batch = batches[batchIndex]; - // TODO: bind batches lazily - // If predicate doesn't match anything in the batch we don't need - // to bind the callback - if (bind) { bind(batch); } const predicate = this._predicate.bind(batch); + let isBound = false; // yield all indices for (let index = batch.length; --index >= 0;) { - if (predicate(index, batch)) { next(index, batch); } + if (predicate(index, batch)) { + // bind batches lazily - if predicate doesn't match anything + // in the batch we don't need to call bind on the batch + if (bind && !isBound) { + bind(batch); + isBound = true; + } + next(index, batch); + } } } } diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts index ae2f058..da74b32 100644 --- a/js/test/unit/table-tests.ts +++ b/js/test/unit/table-tests.ts @@ -424,6 +424,10 @@ describe(`Table`, () => { get_i32 = col('i32').bind(batch); })), expected: values.filter((row) => (row[F32] as number) * (row[I32] as number) > 0) + }, { + name: `filter out all records`, + filtered: table.filter(lit(1).eq(0)), + expected: [] } ]; for (let this_test of filter_tests) { @@ -440,15 +444,13 @@ describe(`Table`, () => { expect(columns.map((c) => c.get(idx))).toEqual(expected[expected_idx++]); }); }); - test(`calls bind function on every batch`, () => { - // Techincally, we only need to call bind on - // batches with data that match the predicate, so - // this test may fail in the future if we change - // that - and that's ok! + test(`calls bind function lazily`, () => { let bind = jest.fn(); filtered.scan(() => { }, bind); - for (let batch of table.chunks) { - expect(bind).toHaveBeenCalledWith(batch); + if (expected.length) { + expect(bind).toHaveBeenCalled(); + } else { + expect(bind).not.toHaveBeenCalled(); } }); }); @@ -460,15 +462,13 @@ describe(`Table`, () => { expect(columns.map((c) => c.get(idx))).toEqual(expected[--expected_idx]); }); }); - test(`calls bind function on every batch`, () => { - // Techincally, we only need to call bind on - // batches with data that match the predicate, so - // this test may fail in the future if we change - // that - and that's ok! + test(`calls bind function lazily`, () => { let bind = jest.fn(); filtered.scanReverse(() => { }, bind); - for (let batch of table.chunks) { - expect(bind).toHaveBeenCalledWith(batch); + if (expected.length) { + expect(bind).toHaveBeenCalled(); + } else { + expect(bind).not.toHaveBeenCalled(); } }); });