imbajin commented on code in PR #683:
URL:
https://github.com/apache/incubator-hugegraph-toolchain/pull/683#discussion_r2444647586
##########
hugegraph-loader/src/main/java/org/apache/hugegraph/loader/HugeGraphLoader.java:
##########
@@ -200,27 +594,133 @@ private void loadInputs(List<InputStruct> structs) {
}
}
- private void loadStructs(List<InputStruct> structs) {
- // Load input structs one by one
+ private List<InputTaskItem> prepareTaskItems(List<InputStruct> structs,
+ boolean scatter) {
+ ArrayList<InputTaskItem> tasks = new ArrayList<>();
+ int curFile = 0;
+ int curIndex = 0;
for (InputStruct struct : structs) {
- if (this.context.stopped()) {
- break;
- }
if (struct.skip()) {
continue;
}
- // Create and init InputReader, fetch next batch lines
- try (InputReader reader = InputReader.create(struct.input())) {
- // Init reader
- reader.init(this.context, struct);
- // Load data from current input mapping
- this.loadStruct(struct, reader);
+
+ // Create and init InputReader
+ try {
+ LOG.info("Start loading: '{}'", struct);
+
+ InputReader reader = InputReader.create(struct.input());
+ List<InputReader> readerList = reader.multiReaders() ?
+ reader.split() :
+ ImmutableList.of(reader);
+
+ LOG.info("total {} found in '{}'", readerList.size(), struct);
+ tasks.ensureCapacity(tasks.size() + readerList.size());
+ int seq = 0;
+ for (InputReader r : readerList) {
+ if (curFile >= this.context.options().startFile &&
+ (this.context.options().endFile == -1 ||
+ curFile < this.context.options().endFile)) {
+ // Load data from current input mapping
+ tasks.add(new InputTaskItem(struct, r, seq, curIndex));
+ } else {
+ r.close();
+ }
+ seq += 1;
+ curFile += 1;
+ }
+ if (this.context.options().endFile != -1 &&
+ curFile >= this.context.options().endFile) {
+ break;
+ }
} catch (InitException e) {
throw new LoadException("Failed to init input reader", e);
}
+ curIndex += 1;
+ }
+ // sort by seqNumber to allow scatter loading from different sources
+ if (scatter) {
+ tasks.sort(Comparator.comparingInt((InputTaskItem o) ->
o.structIndex)
+ .thenComparingInt(o -> o.seqNumber));
+ }
+
+ return tasks;
+ }
+
+ private void loadStructs(List<InputStruct> structs) {
+ int parallelCount = this.context.options().parallelCount;
+ if (structs.size() == 0) {
+ return;
+ }
+ if (parallelCount <= 0) {
+ parallelCount = structs.size();
+ }
+
+ boolean scatter = this.context.options().scatterSources;
+
+ LOG.info("{} threads for loading {} structs, from {} to {} in {} mode",
+ parallelCount, structs.size(),
this.context.options().startFile,
+ this.context.options().endFile,
+ scatter ? "scatter" : "sequential");
Review Comment:
**Thread Safety Concern**: The `loadService` ExecutorService is an instance
field that is created in `loadStructs()`. If `load()` is called multiple times,
each call could reassign the field without shutting down the previous executor,
leaking it. Consider: 1) creating it in the constructor, 2) adding a null check
and cleanup before reassigning it, or 3) making it a local variable.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]