Github user jianqiao commented on a diff in the pull request:

    https://github.com/apache/incubator-quickstep/pull/19#discussion_r66470811
  
    --- Diff: relational_operators/TextScanOperator.cpp ---
    @@ -155,116 +63,50 @@ bool TextScanOperator::getAllWorkOrders(
       InsertDestination *output_destination =
           query_context->getInsertDestination(output_destination_index_);
     
    -  if (parallelize_load_) {
    -    // Parallel implementation: Split work orders are generated for each file
    -    // being bulk-loaded. (More than one file can be loaded, because we support
    -    // glob() semantics in file name.) These work orders read the input file,
    -    // and split them in the blobs that can be parsed independently.
    -    if (blocking_dependencies_met_) {
    -      if (!work_generated_) {
    -        // First, generate text-split work orders.
    -        for (const auto &file : files) {
    -          container->addNormalWorkOrder(
    -              new TextSplitWorkOrder(query_id_,
    -                                     file,
    -                                     process_escape_sequences_,
    -                                     storage_manager,
    -                                     op_index_,
    -                                     scheduler_client_id,
    -                                     bus),
    -              op_index_);
    -          ++num_split_work_orders_;
    -        }
    -        work_generated_ = true;
    -        return false;
    -      } else {
    -        // Check if there are blobs to parse.
    -        while (!text_blob_queue_.empty()) {
    -          const TextBlob blob_work = text_blob_queue_.popOne();
    -          container->addNormalWorkOrder(
    -              new TextScanWorkOrder(query_id_,
    -                                    blob_work.blob_id,
    -                                    blob_work.size,
    -                                    field_terminator_,
    -                                    process_escape_sequences_,
    -                                    output_destination,
    -                                    storage_manager),
    -              op_index_);
    -        }
    -        // Done if all split work orders are completed, and no blobs are left to
    -        // process.
    -        return num_done_split_work_orders_.load(std::memory_order_acquire) == num_split_work_orders_ &&
    -               text_blob_queue_.empty();
    -      }
    -    }
    -    return false;
    -  } else {
    -    // Serial implementation.
    -    if (blocking_dependencies_met_ && !work_generated_) {
    -      for (const auto &file : files) {
    +  // Text segment size set to 256KB.
    +  constexpr std::size_t kTextSegmentSize = 0x40000u;
    +
    +  if (blocking_dependencies_met_ && !work_generated_) {
    +    for (const std::string &file : files) {
    +      // Use standard C libary to retrieve the file size.
    +      FILE *fp = std::fopen(file.c_str(), "rb");
    +      std::fseek(fp, 0, SEEK_END);
    +      const std::size_t file_size = std::ftell(fp);
    +      std::fclose(fp);
    +
    +      std::size_t text_offset = 0;
    +      while (text_offset < file_size) {
             container->addNormalWorkOrder(
                 new TextScanWorkOrder(query_id_,
                                       file,
    +                                  text_offset,
    +                                  std::min(kTextSegmentSize, file_size - text_offset),
                                       field_terminator_,
                                       process_escape_sequences_,
                                       output_destination,
                                       storage_manager),
                 op_index_);
    +        text_offset += kTextSegmentSize;
    --- End diff --
    
    Yes, alternatively we can do:
    ```
    std::size_t text_segment_size = std::min(kTextSegmentSize, file_size - text_offset);
    
    ... // addNormalWorkOrder
    
    text_offset += text_segment_size;
    ```


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

Reply via email to