VenuReddy2103 commented on a change in pull request #3819:
URL: https://github.com/apache/carbondata/pull/3819#discussion_r478375043



##########
File path: sdk/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonWriterBuilder.java
##########
@@ -594,6 +607,446 @@ public CarbonWriterBuilder withJsonInput(Schema carbonSchema) {
     return this;
   }
 
+  private void validateCsvFiles() throws IOException {
+    CarbonFile[] dataFiles = this.extractDataFiles(CarbonCommonConstants.CSV_FILE_EXTENSION);
+    if (CollectionUtils.isEmpty(Arrays.asList(dataFiles))) {
+      throw new RuntimeException("No CSV files found to build carbon writer.");
+    }
+    for (CarbonFile dataFile : dataFiles) {
+      try {
+        CsvParser csvParser = SDKUtil.buildCsvParser(this.hadoopConf);
+        csvParser.beginParsing(FileFactory.getDataInputStream(dataFile.getPath(),
+            -1, this.hadoopConf));
+      } catch (IllegalArgumentException ex) {
+        if (ex.getCause() instanceof FileNotFoundException) {
+          throw new FileNotFoundException("File " + dataFile +
+              " not found to build carbon writer.");
+        }
+        throw ex;
+      }
+    }
+    this.dataFiles = dataFiles;
+  }
+
+  /**
+   * To build a {@link CarbonWriter} that loads data from CSV files.
+   *
+   * @param filePath absolute path under which files should be loaded.
+   * @return CarbonWriterBuilder
+   */
+  public CarbonWriterBuilder withCsvPath(String filePath) throws IOException {
+    this.validateFilePath(filePath);
+    this.filePath = filePath;
+    this.setIsDirectory(filePath);
+    this.withCsvInput();
+    this.validateCsvFiles();
+    return this;
+  }
+
+  /**
+   * To build a {@link CarbonWriter} that accepts a CSV file directory and
+   * a list of files to load.
+   *
+   * @param filePath directory where the CSV files exist.
+   * @param fileList list of files to load.
+   * @return CarbonWriterBuilder
+   */
+  public CarbonWriterBuilder withCsvPath(String filePath, List<String> fileList)
+      throws IOException {
+    this.fileList = fileList;
+    this.withCsvPath(filePath);
+    return this;
+  }
+
+  private void validateJsonFiles() throws IOException {
+    CarbonFile[] dataFiles = this.extractDataFiles(CarbonCommonConstants.JSON_FILE_EXTENSION);
+    for (CarbonFile dataFile : dataFiles) {
+      try {
+        new JSONParser().parse(SDKUtil.buildJsonReader(dataFile, this.hadoopConf));
+      } catch (FileNotFoundException ex) {
+        throw new FileNotFoundException("File " + dataFile + " not found to build carbon writer.");
+      } catch (ParseException ex) {
+        throw new RuntimeException("File " + dataFile + " is not in JSON format.");
+      }
+    }
+    this.dataFiles = dataFiles;
+  }
+
+  /**
+   * To build a {@link CarbonWriter} that loads data from JSON files.
+   *
+   * @param filePath absolute path under which files should be loaded.
+   * @return CarbonWriterBuilder
+   */
+  public CarbonWriterBuilder withJsonPath(String filePath) throws IOException {
+    this.validateFilePath(filePath);
+    this.filePath = filePath;
+    this.setIsDirectory(filePath);
+    this.withJsonInput();
+    this.validateJsonFiles();
+    return this;
+  }
+
+  /**
+   * To build a {@link CarbonWriter} that accepts a JSON file directory and
+   * a list of files to load.
+   *
+   * @param filePath directory where the JSON files exist.
+   * @param fileList list of files to load.
+   * @return CarbonWriterBuilder
+   * @throws IOException
+   */
+  public CarbonWriterBuilder withJsonPath(String filePath, List<String> fileList)
+      throws IOException {
+    this.fileList = fileList;
+    this.withJsonPath(filePath);
+    return this;
+  }
+
+  private void validateFilePath(String filePath) {
+    if (StringUtils.isEmpty(filePath)) {
+      throw new IllegalArgumentException("filePath cannot be empty");
+    }
+  }
+
+  /**
+   * To build a {@link CarbonWriter} that loads data from Parquet files.
+   *
+   * @param filePath absolute path under which files should be loaded.
+   * @return CarbonWriterBuilder
+   */
+  public CarbonWriterBuilder withParquetPath(String filePath) throws IOException {
+    this.validateFilePath(filePath);
+    this.filePath = filePath;
+    this.setIsDirectory(filePath);
+    this.writerType = WRITER_TYPE.PARQUET;
+    this.validateParquetFiles();
+    return this;
+  }
+
+  private void setIsDirectory(String filePath) {
+    if (this.hadoopConf == null) {
+      this.hadoopConf = new Configuration(FileFactory.getConfiguration());
+    }
+    CarbonFile carbonFile = FileFactory.getCarbonFile(filePath, hadoopConf);
+    this.isDirectory = carbonFile.isDirectory();
+  }
+
+  /**
+   * To build a {@link CarbonWriter} that accepts a Parquet file directory and
+   * a list of files to load.
+   *
+   * @param filePath directory where the Parquet files exist.
+   * @param fileList list of files to load.
+   * @return CarbonWriterBuilder
+   * @throws IOException
+   */
+  public CarbonWriterBuilder withParquetPath(String filePath, List<String> fileList)
+      throws IOException {
+    this.fileList = fileList;
+    this.withParquetPath(filePath);
+    return this;
+  }
+
+  private void validateParquetFiles() throws IOException {
+    CarbonFile[] dataFiles = this.extractDataFiles(CarbonCommonConstants.PARQUET_FILE_EXT);
+    org.apache.avro.Schema parquetSchema = null;
+    for (CarbonFile dataFile : dataFiles) {
+      try {
+        ParquetReader<GenericRecord> parquetReader =
+            SDKUtil.buildParquetReader(dataFile.getPath(), this.hadoopConf);
+        if (parquetSchema == null) {
+          parquetSchema = parquetReader.read().getSchema();
+        } else {
+          if (!parquetSchema.equals(parquetReader.read().getSchema())) {
+            throw new RuntimeException("All the Parquet files must have the same schema.");
+          }
+        }
+      } catch (IllegalArgumentException ex) {
+        if (ex.getMessage() != null && ex.getMessage()
+            .contains("INT96 not implemented and is deprecated")) {
+          throw new IllegalArgumentException("Carbon does not support the Parquet INT96 data type.");
+        }
+        throw ex;
+      } catch (UnsupportedOperationException ex) {
+        if (ex.getMessage() != null && ex.getMessage()
+            .contains("REPEATED not supported outside LIST or MAP.")) {
+          throw new UnsupportedOperationException("Carbon does not support " +
+              "repeated Parquet schema outside of list or map.");
+        }
+        throw ex;
+      } catch (RuntimeException ex) {
+        if (ex.getMessage() != null && ex.getMessage().contains("not a Parquet file")) {
+          throw new RuntimeException("File " + dataFile + " is not in Parquet format.");
+        }
+        throw ex;
+      }
+    }
+    this.dataFiles = dataFiles;
+    this.avroSchema = parquetSchema;
+    this.schema = AvroCarbonWriter.getCarbonSchemaFromAvroSchema(this.avroSchema);
+  }
+
+  private CarbonFile[] extractDataFiles(String suf) {
+    List<CarbonFile> dataFiles;
+    if (this.hadoopConf == null) {
+      this.hadoopConf = new Configuration(FileFactory.getConfiguration());
+    }
+    if (this.isDirectory) {
+      if (CollectionUtils.isEmpty(this.fileList)) {
+        dataFiles = SDKUtil.extractFilesFromFolder(this.filePath, suf, this.hadoopConf);
+      } else {
+        dataFiles = this.appendFileListWithPath();
+      }
+    } else {
+      dataFiles = new ArrayList<>();
+      dataFiles.add(FileFactory.getCarbonFile(this.filePath, this.hadoopConf));
+    }
+    return dataFiles.toArray(new CarbonFile[0]);
+  }
+
+  /**
+   * To build a {@link CarbonWriter} that loads data from ORC files.
+   *
+   * @param filePath absolute path under which files should be loaded.
+   * @return CarbonWriterBuilder
+   */
+  public CarbonWriterBuilder withOrcPath(String filePath) throws IOException {
+    this.validateFilePath(filePath);
+    this.filePath = filePath;
+    this.setIsDirectory(filePath);
+    this.writerType = WRITER_TYPE.ORC;
+    Map<String, String> options = new HashMap<>();
+    options.put("complex_delimiter_level_1",
+        CarbonCommonConstants.COMPLEX_DELIMITERS_LEVEL_1_DEFAULT);
+    options.put("complex_delimiter_level_2",
+        CarbonCommonConstants.COMPLEX_DELIMITERS_LEVEL_2_DEFAULT);
+    options.put("complex_delimiter_level_3",
+        CarbonCommonConstants.COMPLEX_DELIMITERS_LEVEL_3_DEFAULT);
+    this.withLoadOptions(options);
+    this.buildOrcReader();

Review comment:
       This buildOrcReader seems to be for validation. Same as the above comment regarding the validate method.
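
       If it is only doing validation, it could mirror the validateCsvFiles/validateJsonFiles pattern above. A rough, untested sketch (CarbonCommonConstants.ORC_FILE_EXTENSION and SDKUtil.buildOrcReader are assumed names here, by analogy with PARQUET_FILE_EXT and SDKUtil.buildParquetReader):

       ```java
       // Sketch only: follows the same validate-then-assign pattern as the other formats.
       private void validateOrcFiles() throws IOException {
         CarbonFile[] dataFiles = this.extractDataFiles(CarbonCommonConstants.ORC_FILE_EXTENSION);
         for (CarbonFile dataFile : dataFiles) {
           try {
             // Opening the reader is enough to fail fast on missing or unreadable files.
             SDKUtil.buildOrcReader(dataFile.getPath(), this.hadoopConf);
           } catch (FileNotFoundException ex) {
             throw new FileNotFoundException("File " + dataFile + " not found to build carbon writer.");
           }
         }
         this.dataFiles = dataFiles;
       }
       ```

       Then withOrcPath would call validateOrcFiles() instead of buildOrcReader(), so the method name states the intent and the ORC flow stays consistent with the Parquet one.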




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

