This is an automated email from the ASF dual-hosted git repository.
jiafengzheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris-spark-connector.git
The following commit(s) were added to refs/heads/master by this push:
new 739725c [improvement] splits large collections to normal collection
to avoid the "Requested array size exceeds VM limit" exception (#61)
739725c is described below
commit 739725c1be9188f3955e7cef042931d78df4598f
Author: lexluo09 <[email protected]>
AuthorDate: Tue Dec 20 13:29:49 2022 +0800
[improvement] splits large collections to normal collection to avoid the
"Requested array size exceeds VM limit" exception (#61)
---
.../org/apache/doris/spark/DorisStreamLoad.java | 7 ++-
.../org/apache/doris/spark/util/ListUtils.java | 69 ++++++++++++++++++++++
.../org/apache/doris/spark/util/TestListUtils.java | 42 +++++++++++++
3 files changed, 117 insertions(+), 1 deletion(-)
diff --git
a/spark-doris-connector/src/main/java/org/apache/doris/spark/DorisStreamLoad.java
b/spark-doris-connector/src/main/java/org/apache/doris/spark/DorisStreamLoad.java
index a868eb9..8cebe68 100644
---
a/spark-doris-connector/src/main/java/org/apache/doris/spark/DorisStreamLoad.java
+++
b/spark-doris-connector/src/main/java/org/apache/doris/spark/DorisStreamLoad.java
@@ -29,6 +29,7 @@ import org.apache.doris.spark.exception.StreamLoadException;
import org.apache.doris.spark.rest.RestService;
import org.apache.doris.spark.rest.models.BackendV2;
import org.apache.doris.spark.rest.models.RespContent;
+import org.apache.doris.spark.util.ListUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -222,7 +223,11 @@ public class DorisStreamLoad implements Serializable{
} catch (Exception e) {
throw new StreamLoadException("The number of configured columns
does not match the number of data columns.");
}
- load((new ObjectMapper()).writeValueAsString(dataList));
+ // splits large collections to normal collection to avoid the
"Requested array size exceeds VM limit" exception
+ List<String> serializedList = ListUtils.getSerializedList(dataList);
+ for (String serializedRows : serializedList) {
+ load(serializedRows);
+ }
}
public void load(String value) throws StreamLoadException {
diff --git
a/spark-doris-connector/src/main/java/org/apache/doris/spark/util/ListUtils.java
b/spark-doris-connector/src/main/java/org/apache/doris/spark/util/ListUtils.java
new file mode 100644
index 0000000..43f5b77
--- /dev/null
+++
b/spark-doris-connector/src/main/java/org/apache/doris/spark/util/ListUtils.java
@@ -0,0 +1,69 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.spark.util;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+public class ListUtils {
+ private static final Logger LOG = LoggerFactory.getLogger(ListUtils.class);
+
+ public static List<String> getSerializedList(List<Map<Object, Object>>
batch) throws JsonProcessingException {
+ List<String> result = new ArrayList<>();
+ divideAndSerialize(batch, result);
+ return result;
+ }
+
+ /***
+ * recursively splits large collections to normal collection and
serializes the collection
+ * @param batch
+ * @param result
+ * @throws JsonProcessingException
+ */
+ public static void divideAndSerialize(List<Map<Object, Object>> batch,
List<String> result) throws JsonProcessingException {
+ String serializedResult = (new
ObjectMapper()).writeValueAsString(batch);
+ // if an error occurred in the batch call to getBytes ,average divide
the batch
+ try {
+ //the "Requested array size exceeds VM limit" exception occurs
when the collection is large
+ serializedResult.getBytes("UTF-8");
+ result.add(serializedResult);
+ return;
+ } catch (Throwable error) {
+ LOG.error("getBytes error:{} ,average divide the collection",
error);
+ }
+ for (List<Map<Object, Object>> avgSubCollection :
getAvgSubCollections(batch)) {
+ divideAndSerialize(avgSubCollection, result);
+ }
+ }
+
+ /***
+ * average divide the collection
+ * @param values
+ * @return
+ */
+ public static List<List<Map<Object, Object>>>
getAvgSubCollections(List<Map<Object, Object>> values) {
+ return Lists.partition(values, (values.size() + 1) / 2);
+ }
+}
diff --git
a/spark-doris-connector/src/test/java/org/apache/doris/spark/util/TestListUtils.java
b/spark-doris-connector/src/test/java/org/apache/doris/spark/util/TestListUtils.java
new file mode 100644
index 0000000..c0ec102
--- /dev/null
+++
b/spark-doris-connector/src/test/java/org/apache/doris/spark/util/TestListUtils.java
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.spark.util;
+
+import org.junit.Assert;
+import org.junit.Test;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Unit tests for {@link ListUtils}.
+ */
+public class TestListUtils {
+
+    /**
+     * Batches whose JSON can be encoded in one go must come back as exactly one
+     * serialized string.
+     */
+    @Test
+    public void testGetSerializedList() throws Exception {
+        int size = 15000;
+        List<Map<Object, Object>> batch = new ArrayList<>();
+        for (int i = 0; i < size; i++) {
+            Map<Object, Object> entity = new HashMap<>();
+            batch.add(entity);
+        }
+        // JUnit's signature is assertEquals(expected, actual); keeping that order makes
+        // failure messages report the right "expected" value.
+        Assert.assertEquals(1, ListUtils.getSerializedList(batch).size());
+
+        // An empty batch still yields a single serialized string.
+        Assert.assertEquals(1, ListUtils.getSerializedList(new ArrayList<>()).size());
+    }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]