[GitHub] [flink] dianfu commented on a change in pull request #13369: [FLINK-19173][python] Add Pandas Batch Group Aggregation Function Operator

GitBox Tue, 15 Sep 2020 23:49:48 -0700


dianfu commented on a change in pull request #13369:
URL: https://github.com/apache/flink/pull/13369#discussion_r489173894




##########
File path: 
flink-python/src/main/java/org/apache/flink/table/runtime/operators/python/aggregate/arrow/batch/AbstractBatchArrowPythonAggregateFunctionOperator.java
##########
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.table.runtime.operators.python.aggregate.arrow.batch;
+
+import org.apache.flink.annotation.Internal;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.table.api.TableConfig;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.binary.BinaryRowData;
+import org.apache.flink.table.data.binary.BinaryRowDataUtil;
+import org.apache.flink.table.functions.AggregateFunction;
+import org.apache.flink.table.functions.python.PythonFunctionInfo;
+import org.apache.flink.table.planner.codegen.CodeGeneratorContext;
+import org.apache.flink.table.planner.codegen.ProjectionCodeGenerator;
+import org.apache.flink.table.runtime.generated.GeneratedProjection;
+import org.apache.flink.table.runtime.generated.Projection;
+import 
org.apache.flink.table.runtime.operators.python.aggregate.arrow.AbstractArrowPythonAggregateFunctionOperator;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.util.Preconditions;
+
+import java.util.Arrays;
+import java.util.stream.Collectors;
+
+/**
+ * The Abstract class of Batch Arrow Aggregate Operator for Pandas {@link 
AggregateFunction}.
+ */
+@Internal
+abstract class AbstractBatchArrowPythonAggregateFunctionOperator

Review comment:
       It seems that there is no need to add an extra abstract class. Could we 
merge this class and BatchArrowPythonGroupAggregateFunctionOperator into a 
single class?

##########
File path: 
flink-python/src/main/java/org/apache/flink/table/runtime/operators/python/aggregate/arrow/batch/BatchArrowPythonGroupAggregateFunctionOperator.java
##########
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.table.runtime.operators.python.aggregate.arrow.batch;
+
+import org.apache.flink.annotation.Internal;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.binary.BinaryRowData;
+import org.apache.flink.table.functions.AggregateFunction;
+import org.apache.flink.table.functions.python.PythonFunctionInfo;
+import org.apache.flink.table.types.logical.RowType;
+
+/**
+ * The Batch Arrow Python {@link AggregateFunction} Operator for Group 
Aggregation.
+ */
+@Internal
+public class BatchArrowPythonGroupAggregateFunctionOperator
+       extends AbstractBatchArrowPythonAggregateFunctionOperator {
+
+       private static final long serialVersionUID = 1L;
+
+       public BatchArrowPythonGroupAggregateFunctionOperator(
+               Configuration config,
+               PythonFunctionInfo[] pandasAggFunctions,
+               RowType inputType,
+               RowType outputType,
+               int[] groupKey,
+               int[] groupingSet,
+               int[] udafInputOffsets) {
+               super(config, pandasAggFunctions, inputType, outputType, 
groupKey, groupingSet, udafInputOffsets);
+       }
+
+       @Override
+       public void open() throws Exception {
+               userDefinedFunctionOutputType = new RowType(
+                       outputType.getFields().subList(groupingSet.length, 
outputType.getFieldCount()));
+               super.open();
+       }
+
+       @Override
+       protected void invokeCurrentBatch() throws Exception {
+               if (currentBatchCount > 0) {
+                       arrowSerializer.finishCurrentBatch();
+                       pythonFunctionRunner.process(baos.toByteArray());
+                       baos.reset();
+                       checkInvokeFinishBundleByCount();
+               }
+       }
+
+       @Override
+       public void bufferInput(RowData input) throws Exception {
+               BinaryRowData currentKey = 
groupKeyProjection.apply(input).copy();
+               if (lastGroupKey == null) {
+                       lastGroupKey = currentKey;
+                       lastGroupSet = groupSetProjection.apply(input).copy();
+                       forwardedInputQueue.add(lastGroupSet);
+               } else if (isNewKey(currentKey)) {
+                       invokeCurrentBatch();

Review comment:
       So will this trigger finishBundle for each key? Is it possible to 
trigger finishBundle once for multiple keys?

##########
File path: 
flink-python/src/test/java/org/apache/flink/table/runtime/operators/python/aggregate/arrow/batch/BatchArrowPythonGroupAggregateFunctionOperatorTest.java
##########
@@ -0,0 +1,255 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.table.runtime.operators.python.aggregate.arrow.batch;
+
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.python.PythonFunctionRunner;
+import org.apache.flink.python.PythonOptions;
+import org.apache.flink.streaming.api.watermark.Watermark;
+import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
+import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness;
+import org.apache.flink.table.api.DataTypes;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.functions.python.PythonFunctionInfo;
+import 
org.apache.flink.table.runtime.operators.python.aggregate.arrow.AbstractArrowPythonAggregateFunctionOperator;
+import 
org.apache.flink.table.runtime.operators.python.aggregate.arrow.ArrowPythonAggregateFunctionOperatorTestBase;
+import 
org.apache.flink.table.runtime.utils.PassThroughPythonAggregateFunctionRunner;
+import org.apache.flink.table.runtime.utils.PythonTestUtils;
+import org.apache.flink.table.types.logical.BigIntType;
+import org.apache.flink.table.types.logical.LogicalType;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.table.types.logical.VarCharType;
+
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.concurrent.ConcurrentLinkedQueue;
+
+/**
+ * Test for {@link BatchArrowPythonGroupAggregateFunctionOperator}. These test 
that:
+ *
+ * <ul>
+ * <li>FinishBundle is called when checkpoint is encountered</li>
+ * <li>Watermarks are buffered and only sent to downstream when finishedBundle 
is triggered</li>
+ * </ul>
+ */
+public class BatchArrowPythonGroupAggregateFunctionOperatorTest
+       extends ArrowPythonAggregateFunctionOperatorTestBase {
+
+       @Test
+       public void testGroupAggregateFunction() throws Exception {
+               OneInputStreamOperatorTestHarness<RowData, RowData> testHarness 
= getTestHarness(
+                       new Configuration());
+               long initialTime = 0L;
+               ConcurrentLinkedQueue<Object> expectedOutput = new 
ConcurrentLinkedQueue<>();
+
+               testHarness.open();
+
+               testHarness.processElement(new StreamRecord<>(newRow(true, 
"c1", "c2", 0L), initialTime + 1));
+               testHarness.processElement(new StreamRecord<>(newRow(true, 
"c1", "c4", 1L), initialTime + 2));
+               testHarness.processElement(new StreamRecord<>(newRow(true, 
"c2", "c6", 2L), initialTime + 3));
+               testHarness.close();
+
+               expectedOutput.add(new StreamRecord<>(newRow(true, "c1", 0L)));
+               expectedOutput.add(new StreamRecord<>(newRow(true, "c2", 2L)));
+
+               assertOutputEquals("Output was not correct.", expectedOutput, 
testHarness.getOutput());
+       }
+
+       @Test
+       public void testFinishBundleTriggeredOnCheckpoint() throws Exception {
+               Configuration conf = new Configuration();
+               conf.setInteger(PythonOptions.MAX_BUNDLE_SIZE, 10);
+               OneInputStreamOperatorTestHarness<RowData, RowData> testHarness 
= getTestHarness(conf);
+
+               long initialTime = 0L;
+               ConcurrentLinkedQueue<Object> expectedOutput = new 
ConcurrentLinkedQueue<>();
+
+               testHarness.open();
+
+               testHarness.processElement(new StreamRecord<>(newRow(true, 
"c1", "c2", 0L), initialTime + 1));
+               testHarness.processElement(new StreamRecord<>(newRow(true, 
"c1", "c4", 1L), initialTime + 2));
+               testHarness.processElement(new StreamRecord<>(newRow(true, 
"c2", "c6", 2L), initialTime + 3));
+               // checkpoint trigger finishBundle
+               testHarness.prepareSnapshotPreBarrier(0L);
+
+               expectedOutput.add(new StreamRecord<>(newRow(true, "c1", 0L)));
+
+               assertOutputEquals("Output was not correct.", expectedOutput, 
testHarness.getOutput());
+
+               testHarness.close();
+
+               expectedOutput.add(new StreamRecord<>(newRow(true, "c2", 2L)));
+
+               assertOutputEquals("Output was not correct.", expectedOutput, 
testHarness.getOutput());
+       }
+
+       @Test
+       public void testFinishBundleTriggeredByCount() throws Exception {
+               Configuration conf = new Configuration();
+               conf.setInteger(PythonOptions.MAX_BUNDLE_SIZE, 2);
+               OneInputStreamOperatorTestHarness<RowData, RowData> testHarness 
= getTestHarness(conf);
+
+               long initialTime = 0L;
+               ConcurrentLinkedQueue<Object> expectedOutput = new 
ConcurrentLinkedQueue<>();
+
+               testHarness.open();
+
+               testHarness.processElement(new StreamRecord<>(newRow(true, 
"c1", "c2", 0L), initialTime + 1));
+               testHarness.processElement(new StreamRecord<>(newRow(true, 
"c1", "c2", 1L), initialTime + 2));
+               assertOutputEquals("FinishBundle should not be triggered.", 
expectedOutput, testHarness.getOutput());
+
+               testHarness.processElement(new StreamRecord<>(newRow(true, 
"c2", "c6", 2L), initialTime + 2));
+               expectedOutput.add(new StreamRecord<>(newRow(true, "c1", 0L)));
+
+               assertOutputEquals("Output was not correct.", expectedOutput, 
testHarness.getOutput());
+
+               testHarness.close();
+
+               expectedOutput.add(new StreamRecord<>(newRow(true, "c2", 2L)));
+
+               assertOutputEquals("Output was not correct.", expectedOutput, 
testHarness.getOutput());
+       }
+
+       @Test
+       public void testFinishBundleTriggeredByTime() throws Exception {

Review comment:
       Should we disable "finish bundle triggered by time"?

##########
File path: 
flink-python/src/main/java/org/apache/flink/table/runtime/operators/python/scalar/arrow/ArrowPythonScalarFunctionOperator.java
##########
@@ -38,7 +38,7 @@
 
        private static final long serialVersionUID = 1L;
 
-       private static final String SCHEMA_ARROW_CODER_URN = 
"flink:coder:schema:scalar_function:arrow:v1";

Review comment:
       Why change this?

##########
File path: 
flink-python/src/main/java/org/apache/flink/table/runtime/operators/python/aggregate/arrow/AbstractArrowPythonAggregateFunctionOperator.java
##########
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.table.runtime.operators.python.aggregate.arrow;
+
+import org.apache.flink.annotation.Internal;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.fnexecution.v1.FlinkFnApi;
+import org.apache.flink.streaming.api.watermark.Watermark;
+import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
+import org.apache.flink.table.api.TableConfig;
+import org.apache.flink.table.data.JoinedRowData;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.binary.BinaryRowData;
+import org.apache.flink.table.functions.AggregateFunction;
+import org.apache.flink.table.functions.python.PythonEnv;
+import org.apache.flink.table.functions.python.PythonFunctionInfo;
+import org.apache.flink.table.planner.codegen.CodeGeneratorContext;
+import org.apache.flink.table.planner.codegen.ProjectionCodeGenerator;
+import org.apache.flink.table.runtime.arrow.serializers.ArrowSerializer;
+import org.apache.flink.table.runtime.arrow.serializers.RowDataArrowSerializer;
+import org.apache.flink.table.runtime.generated.GeneratedProjection;
+import org.apache.flink.table.runtime.generated.Projection;
+import 
org.apache.flink.table.runtime.operators.python.AbstractStatelessFunctionOperator;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.util.Preconditions;
+
+/**
+ * The Abstract class of Arrow Aggregate Operator for Pandas {@link 
AggregateFunction}.
+ */
+@Internal
+public abstract class AbstractArrowPythonAggregateFunctionOperator
+       extends AbstractStatelessFunctionOperator<RowData, RowData, RowData> {
+
+       private static final long serialVersionUID = 1L;
+
+       private static final String SCHEMA_ARROW_CODER_URN = 
"flink:coder:schema:arrow:v1";
+
+       private static final String PANDAS_AGGREGATE_FUNCTION_URN = 
"flink:transform:pandas_aggregate_function:v1";
+
+       /**
+        * The Pandas {@link AggregateFunction}s to be executed.
+        */
+       private final PythonFunctionInfo[] pandasAggFunctions;
+
+       protected final int[] groupingSet;
+
+       protected transient ArrowSerializer<RowData> arrowSerializer;
+
+       /**
+        * The collector used to collect records.
+        */
+       protected transient StreamRecordRowDataWrappingCollector rowDataWrapper;
+
+       /**
+        * The JoinedRowData reused holding the execution result.
+        */
+       protected transient JoinedRowData reuseJoinedRow;
+
+       /**
+        * The current number of elements to be included in an arrow batch.
+        */
+       protected transient int currentBatchCount;
+
+       /**
+        * The Projection which projects the udaf input fields from the input 
row.
+        */
+       private transient Projection<RowData, BinaryRowData> 
udafInputProjection;
+
+       public AbstractArrowPythonAggregateFunctionOperator(
+               Configuration config,
+               PythonFunctionInfo[] pandasAggFunctions,
+               RowType inputType,
+               RowType outputType,
+               int[] groupingSet,
+               int[] udafInputOffsets) {
+               super(config, inputType, outputType, udafInputOffsets);
+               this.pandasAggFunctions = 
Preconditions.checkNotNull(pandasAggFunctions);
+               this.groupingSet = Preconditions.checkNotNull(groupingSet);
+       }
+
+       @Override
+       public void open() throws Exception {
+               super.open();
+               rowDataWrapper = new 
StreamRecordRowDataWrappingCollector(output);
+               reuseJoinedRow = new JoinedRowData();
+
+               udafInputProjection = createUdafInputProjection();
+               arrowSerializer = new 
RowDataArrowSerializer(userDefinedFunctionInputType, 
userDefinedFunctionOutputType);
+               arrowSerializer.open(bais, baos);
+               currentBatchCount = 0;
+       }
+
+       @Override
+       public void dispose() throws Exception {
+               super.dispose();
+               arrowSerializer.close();
+       }
+
+       @Override
+       public void processElement(StreamRecord<RowData> element) throws 
Exception {
+               RowData value = element.getValue();
+               bufferInput(value);
+               processElementInternal(value);
+               emitResults();
+       }
+
+       @Override
+       protected void checkInvokeFinishBundleByCount() throws Exception {
+               elementCount += currentBatchCount;
+               currentBatchCount = 0;
+               if (elementCount >= maxBundleSize) {
+                       invokeFinishBundle();
+               }
+       }
+
+       @Override
+       public void processWatermark(Watermark mark) throws Exception {

Review comment:
       This implementation is very similar to the implementation in the super 
class. It seems that this method was added only because **elementCount** is 
computed in **checkInvokeFinishBundleByCount**. If so, we just need to improve 
that computation, and then we can avoid the duplicated code.

##########
File path: 
flink-python/src/main/java/org/apache/flink/table/runtime/operators/python/aggregate/arrow/AbstractArrowPythonAggregateFunctionOperator.java
##########
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.table.runtime.operators.python.aggregate.arrow;
+
+import org.apache.flink.annotation.Internal;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.fnexecution.v1.FlinkFnApi;
+import org.apache.flink.streaming.api.watermark.Watermark;
+import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
+import org.apache.flink.table.api.TableConfig;
+import org.apache.flink.table.data.JoinedRowData;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.binary.BinaryRowData;
+import org.apache.flink.table.functions.AggregateFunction;
+import org.apache.flink.table.functions.python.PythonEnv;
+import org.apache.flink.table.functions.python.PythonFunctionInfo;
+import org.apache.flink.table.planner.codegen.CodeGeneratorContext;
+import org.apache.flink.table.planner.codegen.ProjectionCodeGenerator;
+import org.apache.flink.table.runtime.arrow.serializers.ArrowSerializer;
+import org.apache.flink.table.runtime.arrow.serializers.RowDataArrowSerializer;
+import org.apache.flink.table.runtime.generated.GeneratedProjection;
+import org.apache.flink.table.runtime.generated.Projection;
+import 
org.apache.flink.table.runtime.operators.python.AbstractStatelessFunctionOperator;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.util.Preconditions;
+
+/**
+ * The Abstract class of Arrow Aggregate Operator for Pandas {@link 
AggregateFunction}.
+ */
+@Internal
+public abstract class AbstractArrowPythonAggregateFunctionOperator
+       extends AbstractStatelessFunctionOperator<RowData, RowData, RowData> {
+
+       private static final long serialVersionUID = 1L;
+
+       private static final String SCHEMA_ARROW_CODER_URN = 
"flink:coder:schema:arrow:v1";
+
+       private static final String PANDAS_AGGREGATE_FUNCTION_URN = 
"flink:transform:pandas_aggregate_function:v1";

Review comment:
       ```suggestion
        private static final String PANDAS_AGGREGATE_FUNCTION_URN = 
"flink:transform:aggregate_function:arrow:v1";
   ```

##########
File path: 
flink-python/src/main/java/org/apache/flink/table/runtime/operators/python/aggregate/arrow/AbstractArrowPythonAggregateFunctionOperator.java
##########
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.table.runtime.operators.python.aggregate.arrow;
+
+import org.apache.flink.annotation.Internal;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.fnexecution.v1.FlinkFnApi;
+import org.apache.flink.streaming.api.watermark.Watermark;
+import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
+import org.apache.flink.table.api.TableConfig;
+import org.apache.flink.table.data.JoinedRowData;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.binary.BinaryRowData;
+import org.apache.flink.table.functions.AggregateFunction;
+import org.apache.flink.table.functions.python.PythonEnv;
+import org.apache.flink.table.functions.python.PythonFunctionInfo;
+import org.apache.flink.table.planner.codegen.CodeGeneratorContext;
+import org.apache.flink.table.planner.codegen.ProjectionCodeGenerator;
+import org.apache.flink.table.runtime.arrow.serializers.ArrowSerializer;
+import org.apache.flink.table.runtime.arrow.serializers.RowDataArrowSerializer;
+import org.apache.flink.table.runtime.generated.GeneratedProjection;
+import org.apache.flink.table.runtime.generated.Projection;
+import 
org.apache.flink.table.runtime.operators.python.AbstractStatelessFunctionOperator;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.util.Preconditions;
+
+/**
+ * The Abstract class of Arrow Aggregate Operator for Pandas {@link 
AggregateFunction}.
+ */
+@Internal
+public abstract class AbstractArrowPythonAggregateFunctionOperator
+       extends AbstractStatelessFunctionOperator<RowData, RowData, RowData> {
+
+       private static final long serialVersionUID = 1L;
+
+       private static final String SCHEMA_ARROW_CODER_URN = 
"flink:coder:schema:arrow:v1";
+
+       private static final String PANDAS_AGGREGATE_FUNCTION_URN = 
"flink:transform:pandas_aggregate_function:v1";
+
+       /**
+        * The Pandas {@link AggregateFunction}s to be executed.
+        */
+       private final PythonFunctionInfo[] pandasAggFunctions;
+
+       protected final int[] groupingSet;
+
+       protected transient ArrowSerializer<RowData> arrowSerializer;
+
+       /**
+        * The collector used to collect records.
+        */
+       protected transient StreamRecordRowDataWrappingCollector rowDataWrapper;
+
+       /**
+        * The JoinedRowData reused holding the execution result.
+        */
+       protected transient JoinedRowData reuseJoinedRow;
+
+       /**
+        * The current number of elements to be included in an arrow batch.
+        */
+       protected transient int currentBatchCount;
+
+       /**
+        * The Projection which projects the udaf input fields from the input 
row.
+        */
+       private transient Projection<RowData, BinaryRowData> 
udafInputProjection;
+
+       public AbstractArrowPythonAggregateFunctionOperator(
+               Configuration config,
+               PythonFunctionInfo[] pandasAggFunctions,
+               RowType inputType,
+               RowType outputType,
+               int[] groupingSet,
+               int[] udafInputOffsets) {
+               super(config, inputType, outputType, udafInputOffsets);
+               this.pandasAggFunctions = 
Preconditions.checkNotNull(pandasAggFunctions);
+               this.groupingSet = Preconditions.checkNotNull(groupingSet);
+       }
+
+       @Override
+       public void open() throws Exception {
+               super.open();

Review comment:
       Should we disable the timer-based **finishBundle**?

##########
File path: 
flink-python/src/main/java/org/apache/flink/table/runtime/operators/python/aggregate/arrow/batch/BatchArrowPythonGroupAggregateFunctionOperator.java
##########
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.table.runtime.operators.python.aggregate.arrow.batch;
+
+import org.apache.flink.annotation.Internal;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.binary.BinaryRowData;
+import org.apache.flink.table.functions.AggregateFunction;
+import org.apache.flink.table.functions.python.PythonFunctionInfo;
+import org.apache.flink.table.types.logical.RowType;
+
+/**
+ * The Batch Arrow Python {@link AggregateFunction} Operator for Group 
Aggregation.
+ */
+@Internal
+public class BatchArrowPythonGroupAggregateFunctionOperator
+       extends AbstractBatchArrowPythonAggregateFunctionOperator {
+
+       private static final long serialVersionUID = 1L;
+
+       public BatchArrowPythonGroupAggregateFunctionOperator(
+               Configuration config,
+               PythonFunctionInfo[] pandasAggFunctions,
+               RowType inputType,
+               RowType outputType,
+               int[] groupKey,
+               int[] groupingSet,
+               int[] udafInputOffsets) {
+               super(config, pandasAggFunctions, inputType, outputType, 
groupKey, groupingSet, udafInputOffsets);
+       }
+
+       @Override
+       public void open() throws Exception {
+               userDefinedFunctionOutputType = new RowType(
+                       outputType.getFields().subList(groupingSet.length, 
outputType.getFieldCount()));
+               super.open();
+       }
+
+       @Override
+       protected void invokeCurrentBatch() throws Exception {
+               if (currentBatchCount > 0) {
+                       arrowSerializer.finishCurrentBatch();
+                       pythonFunctionRunner.process(baos.toByteArray());
+                       baos.reset();
+                       checkInvokeFinishBundleByCount();
+               }
+       }
+
+       @Override
+       public void bufferInput(RowData input) throws Exception {
+               BinaryRowData currentKey = 
groupKeyProjection.apply(input).copy();
+               if (lastGroupKey == null) {
+                       lastGroupKey = currentKey;
+                       lastGroupSet = groupSetProjection.apply(input).copy();
+                       forwardedInputQueue.add(lastGroupSet);
+               } else if (isNewKey(currentKey)) {
+                       invokeCurrentBatch();
+                       lastGroupKey = currentKey;
+                       lastGroupSet = groupSetProjection.apply(input).copy();
+                       forwardedInputQueue.add(lastGroupSet);
+               }
+       }
+
+       @Override
+       public void processElementInternal(RowData value) {
+               arrowSerializer.write(getFunctionInput(value));
+               currentBatchCount++;
+       }
+
+       @Override
+       @SuppressWarnings("ConstantConditions")
+       public void emitResult(Tuple2<byte[], Integer> resultTuple) throws 
Exception {
+               byte[] udafResult = resultTuple.f0;
+               int length = resultTuple.f1;
+               bais.setBuffer(udafResult, 0, length);
+               int rowCount = arrowSerializer.load();
+               for (int i = 0; i < rowCount; i++) {
+                       RowData input = forwardedInputQueue.poll();

Review comment:
       ```suggestion
                        RowData key = forwardedInputQueue.poll();
   ```

##########
File path: 
flink-python/src/main/java/org/apache/flink/table/runtime/operators/python/aggregate/arrow/batch/BatchArrowPythonGroupAggregateFunctionOperator.java
##########
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.table.runtime.operators.python.aggregate.arrow.batch;
+
+import org.apache.flink.annotation.Internal;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.binary.BinaryRowData;
+import org.apache.flink.table.functions.AggregateFunction;
+import org.apache.flink.table.functions.python.PythonFunctionInfo;
+import org.apache.flink.table.types.logical.RowType;
+
+/**
+ * The Batch Arrow Python {@link AggregateFunction} Operator for Group Aggregation.
+ */
+@Internal
+public class BatchArrowPythonGroupAggregateFunctionOperator
+       extends AbstractBatchArrowPythonAggregateFunctionOperator {
+
+       private static final long serialVersionUID = 1L;
+
+       public BatchArrowPythonGroupAggregateFunctionOperator(
+               Configuration config,
+               PythonFunctionInfo[] pandasAggFunctions,
+               RowType inputType,
+               RowType outputType,
+               int[] groupKey,
+               int[] groupingSet,
+               int[] udafInputOffsets) {
+               super(config, pandasAggFunctions, inputType, outputType, groupKey, groupingSet, udafInputOffsets);
+       }
+
+       @Override
+       public void open() throws Exception {
+               userDefinedFunctionOutputType = new RowType(
+                       outputType.getFields().subList(groupingSet.length, outputType.getFieldCount()));
+               super.open();
+       }
+
+       @Override
+       protected void invokeCurrentBatch() throws Exception {
+               if (currentBatchCount > 0) {
+                       arrowSerializer.finishCurrentBatch();
+                       pythonFunctionRunner.process(baos.toByteArray());
+                       baos.reset();
+                       checkInvokeFinishBundleByCount();
+               }
+       }
+
+       @Override
+       public void bufferInput(RowData input) throws Exception {
+               BinaryRowData currentKey = groupKeyProjection.apply(input).copy();
+               if (lastGroupKey == null) {
+                       lastGroupKey = currentKey;
+                       lastGroupSet = groupSetProjection.apply(input).copy();
+                       forwardedInputQueue.add(lastGroupSet);
+               } else if (isNewKey(currentKey)) {
+                       invokeCurrentBatch();
+                       lastGroupKey = currentKey;
+                       lastGroupSet = groupSetProjection.apply(input).copy();
+                       forwardedInputQueue.add(lastGroupSet);
+               }
+       }
+
+       @Override
+       public void processElementInternal(RowData value) {
+               arrowSerializer.write(getFunctionInput(value));
+               currentBatchCount++;
+       }
+
+       @Override
+       @SuppressWarnings("ConstantConditions")
+       public void emitResult(Tuple2<byte[], Integer> resultTuple) throws Exception {
+               byte[] udafResult = resultTuple.f0;
+               int length = resultTuple.f1;
+               bais.setBuffer(udafResult, 0, length);
+               int rowCount = arrowSerializer.load();
+               for (int i = 0; i < rowCount; i++) {
+                       RowData input = forwardedInputQueue.poll();
+                       reuseJoinedRow.setRowKind(input.getRowKind());
+                       RowData data = arrowSerializer.read(i);

Review comment:
       ```suggestion
                        RowData result = arrowSerializer.read(i);
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [flink] dianfu commented on a change in pull request #13369: [FLINK-19173][python] Add Pandas Batch Group Aggregation Function Operator

Reply via email to