Github user jianqiao commented on a diff in the pull request: https://github.com/apache/incubator-quickstep/pull/19#discussion_r66470811 --- Diff: relational_operators/TextScanOperator.cpp --- @@ -155,116 +63,50 @@ bool TextScanOperator::getAllWorkOrders( InsertDestination *output_destination = query_context->getInsertDestination(output_destination_index_); - if (parallelize_load_) { - // Parallel implementation: Split work orders are generated for each file - // being bulk-loaded. (More than one file can be loaded, because we support - // glob() semantics in file name.) These work orders read the input file, - // and split them in the blobs that can be parsed independently. - if (blocking_dependencies_met_) { - if (!work_generated_) { - // First, generate text-split work orders. - for (const auto &file : files) { - container->addNormalWorkOrder( - new TextSplitWorkOrder(query_id_, - file, - process_escape_sequences_, - storage_manager, - op_index_, - scheduler_client_id, - bus), - op_index_); - ++num_split_work_orders_; - } - work_generated_ = true; - return false; - } else { - // Check if there are blobs to parse. - while (!text_blob_queue_.empty()) { - const TextBlob blob_work = text_blob_queue_.popOne(); - container->addNormalWorkOrder( - new TextScanWorkOrder(query_id_, - blob_work.blob_id, - blob_work.size, - field_terminator_, - process_escape_sequences_, - output_destination, - storage_manager), - op_index_); - } - // Done if all split work orders are completed, and no blobs are left to - // process. - return num_done_split_work_orders_.load(std::memory_order_acquire) == num_split_work_orders_ && - text_blob_queue_.empty(); - } - } - return false; - } else { - // Serial implementation. - if (blocking_dependencies_met_ && !work_generated_) { - for (const auto &file : files) { + // Text segment size set to 256KB. 
+ constexpr std::size_t kTextSegmentSize = 0x40000u; + + if (blocking_dependencies_met_ && !work_generated_) { + for (const std::string &file : files) { + // Use standard C library to retrieve the file size. + FILE *fp = std::fopen(file.c_str(), "rb"); + std::fseek(fp, 0, SEEK_END); + const std::size_t file_size = std::ftell(fp); + std::fclose(fp); + + std::size_t text_offset = 0; + while (text_offset < file_size) { container->addNormalWorkOrder( new TextScanWorkOrder(query_id_, file, + text_offset, + std::min(kTextSegmentSize, file_size - text_offset), field_terminator_, process_escape_sequences_, output_destination, storage_manager), op_index_); + text_offset += kTextSegmentSize; --- End diff -- Yes alternatively we can do ``` std::size_t text_segment_size = std::min(kTextSegmentSize, file_size - text_offset); ... // addNormalWorkOrder text_offset += text_segment_size; ```
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. ---