This is an automated email from the ASF dual-hosted git repository.

jiafengzheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris-spark-connector.git


The following commit(s) were added to refs/heads/master by this push:
     new 739725c  [improvement] splits large collections to normal collection 
to avoid the "Requested array size exceeds VM limit" exception (#61)
739725c is described below

commit 739725c1be9188f3955e7cef042931d78df4598f
Author: lexluo09 <[email protected]>
AuthorDate: Tue Dec 20 13:29:49 2022 +0800

    [improvement] splits large collections to normal collection to avoid the 
"Requested array size exceeds VM limit" exception (#61)
---
 .../org/apache/doris/spark/DorisStreamLoad.java    |  7 ++-
 .../org/apache/doris/spark/util/ListUtils.java     | 69 ++++++++++++++++++++++
 .../org/apache/doris/spark/util/TestListUtils.java | 42 +++++++++++++
 3 files changed, 117 insertions(+), 1 deletion(-)

diff --git 
a/spark-doris-connector/src/main/java/org/apache/doris/spark/DorisStreamLoad.java
 
b/spark-doris-connector/src/main/java/org/apache/doris/spark/DorisStreamLoad.java
index a868eb9..8cebe68 100644
--- 
a/spark-doris-connector/src/main/java/org/apache/doris/spark/DorisStreamLoad.java
+++ 
b/spark-doris-connector/src/main/java/org/apache/doris/spark/DorisStreamLoad.java
@@ -29,6 +29,7 @@ import org.apache.doris.spark.exception.StreamLoadException;
 import org.apache.doris.spark.rest.RestService;
 import org.apache.doris.spark.rest.models.BackendV2;
 import org.apache.doris.spark.rest.models.RespContent;
+import org.apache.doris.spark.util.ListUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -222,7 +223,11 @@ public class DorisStreamLoad implements Serializable{
         } catch (Exception e) {
             throw new StreamLoadException("The number of configured columns 
does not match the number of data columns.");
         }
-        load((new ObjectMapper()).writeValueAsString(dataList));
+        // splits large collections to normal collection to avoid the 
"Requested array size exceeds VM limit" exception
+        List<String> serializedList = ListUtils.getSerializedList(dataList);
+        for (String serializedRows : serializedList) {
+            load(serializedRows);
+        }
     }
 
     public void load(String value) throws StreamLoadException {
diff --git 
a/spark-doris-connector/src/main/java/org/apache/doris/spark/util/ListUtils.java
 
b/spark-doris-connector/src/main/java/org/apache/doris/spark/util/ListUtils.java
new file mode 100644
index 0000000..43f5b77
--- /dev/null
+++ 
b/spark-doris-connector/src/main/java/org/apache/doris/spark/util/ListUtils.java
@@ -0,0 +1,69 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.spark.util;
+
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
+
+public class ListUtils {
+    private static final Logger LOG = LoggerFactory.getLogger(ListUtils.class);
+
+    public static List<String> getSerializedList(List<Map<Object, Object>> 
batch) throws JsonProcessingException {
+        List<String> result = new ArrayList<>();
+        divideAndSerialize(batch, result);
+        return result;
+    }
+
+    /***
+     * recursively splits large collections to normal collection and 
serializes the collection
+     * @param batch
+     * @param result
+     * @throws JsonProcessingException
+     */
+    public static void divideAndSerialize(List<Map<Object, Object>> batch, 
List<String> result) throws JsonProcessingException {
+        String serializedResult = (new 
ObjectMapper()).writeValueAsString(batch);
+        // if an error occurred in the batch call to getBytes ,average divide 
the batch
+        try {
+            //the "Requested array size exceeds VM limit" exception occurs 
when the collection is large
+            serializedResult.getBytes("UTF-8");
+            result.add(serializedResult);
+            return;
+        } catch (Throwable error) {
+            LOG.error("getBytes error:{} ,average divide the collection", 
error);
+        }
+        for (List<Map<Object, Object>> avgSubCollection : 
getAvgSubCollections(batch)) {
+            divideAndSerialize(avgSubCollection, result);
+        }
+    }
+
+    /***
+     * average divide the collection
+     * @param values
+     * @return
+     */
+    public static List<List<Map<Object, Object>>> 
getAvgSubCollections(List<Map<Object, Object>> values) {
+        return Lists.partition(values, (values.size() + 1) / 2);
+    }
+}
diff --git 
a/spark-doris-connector/src/test/java/org/apache/doris/spark/util/TestListUtils.java
 
b/spark-doris-connector/src/test/java/org/apache/doris/spark/util/TestListUtils.java
new file mode 100644
index 0000000..c0ec102
--- /dev/null
+++ 
b/spark-doris-connector/src/test/java/org/apache/doris/spark/util/TestListUtils.java
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.spark.util;
+
+import org.junit.Assert;
+import org.junit.Test;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class TestListUtils {
+
+    @Test
+    public void testGetSerializedList() throws Exception {
+        int size = 15000;
+        List<Map<Object, Object>> batch = new ArrayList<>();
+        for (int i = 0; i < size; i++) {
+            Map<Object, Object> entity = new HashMap<>();
+            batch.add(entity);
+        }
+        Assert.assertEquals(ListUtils.getSerializedList(batch).size(), 1);
+
+        Assert.assertEquals(ListUtils.getSerializedList(new 
ArrayList<>()).size(), 1);
+
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to