This is an automated email from the ASF dual-hosted git repository.

jackylk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/carbondata.git


The following commit(s) were added to refs/heads/master by this push:
     new 10a2da0   [CARBONDATA-3255] Support python writer and reader SDK
10a2da0 is described below

commit 10a2da0d3686e43d58dfeecbdbf9433f2fb977ad
Author: xubo245 <601450...@qq.com>
AuthorDate: Mon Nov 25 00:58:41 2019 +0800

     [CARBONDATA-3255] Support python writer and reader SDK
    
    Apache CarbonData already provides Java/Scala/C++ interfaces for users, and
    more and more people use Python to manage and analyze big data, so it is
    better to provide a Python interface for writing and reading structured and
    unstructured data in CarbonData, such as String, int and binary data
    (image/voice/video).
    
    In this PR, the Python SDK supports:
        1. an API to read data: CarbonReader
        2. an API to write data: CarbonWriter
        3. primitive data types (nested types are not supported)
        4. projection and filter in the reader
    
    This closes #3478
---
 .gitignore                                         |   1 +
 python/__init__.py                                 |  16 +
 python/pycarbon/__init__.py                        |   0
 python/pycarbon/sdk/ArrowCarbonReader.py           |  99 +++++
 python/pycarbon/sdk/CarbonReader.py                | 162 +++++++
 python/pycarbon/sdk/CarbonSchemaReader.py          |  64 +++
 python/pycarbon/sdk/CarbonWriter.py                | 120 ++++++
 python/pycarbon/sdk/Configuration.py               |  24 ++
 python/pycarbon/sdk/Constants.py                   |  16 +
 python/pycarbon/sdk/SDKUtil.py                     |  23 +
 python/pycarbon/sdk/__init__.py                    |   0
 python/pycarbon/test/.coveragerc                   |   4 +
 python/pycarbon/test/__init__.py                   |  32 ++
 python/pycarbon/test/resources/carbondatalogo.jpg  | Bin 0 -> 59099 bytes
 python/pycarbon/test/resources/carbondatalogo2.jpg | Bin 0 -> 38009 bytes
 .../resources/flowers/10686568196_b1915544a8.jpg   | Bin 0 -> 97920 bytes
 .../resources/flowers/10686568196_b1915544a8.txt   |   1 +
 .../resources/flowers/10712722853_5632165b04.jpg   | Bin 0 -> 63389 bytes
 .../resources/flowers/10712722853_5632165b04.txt   |   1 +
 .../flowers/subfolder/10841136265_af473efc60.jpg   | Bin 0 -> 62144 bytes
 .../flowers/subfolder/10841136265_af473efc60.txt   |   1 +
 python/pycarbon/test/resources/voc/2007_000027.jpg | Bin 0 -> 145493 bytes
 python/pycarbon/test/resources/voc/2007_000027.xml |  63 +++
 python/pycarbon/test/resources/voc/2007_000032.jpg | Bin 0 -> 54757 bytes
 python/pycarbon/test/resources/voc/2007_000032.xml |  63 +++
 python/pycarbon/test/resources/voc/2007_000033.jpg | Bin 0 -> 71205 bytes
 python/pycarbon/test/resources/voc/2007_000033.xml |  51 +++
 python/pycarbon/test/resources/voc/2007_000039.jpg | Bin 0 -> 64668 bytes
 python/pycarbon/test/resources/voc/2007_000039.xml |  27 ++
 python/pycarbon/test/resources/voc/2009_001444.jpg | Bin 0 -> 677151 bytes
 python/pycarbon/test/resources/voc/2009_001444.xml |  28 ++
 .../vocForSegmentationClass/2007_000032.jpg        | Bin 0 -> 54757 bytes
 .../vocForSegmentationClass/2007_000032.png        | Bin 0 -> 2334 bytes
 .../vocForSegmentationClass/2007_000033.jpg        | Bin 0 -> 71205 bytes
 .../vocForSegmentationClass/2007_000033.png        | Bin 0 -> 2814 bytes
 .../vocForSegmentationClass/2007_000042.jpg        | Bin 0 -> 82847 bytes
 .../vocForSegmentationClass/2007_000042.png        | Bin 0 -> 3620 bytes
 python/pycarbon/test/test_read_write_carbon.py     | 468 +++++++++++++++++++++
 38 files changed, 1264 insertions(+)

diff --git a/.gitignore b/.gitignore
index 00e4934..854ebff 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,4 @@ store/CSDK/cmake-build-debug/*
 .classpath
 metastore_db/
 derby.log
+python/.idea/
diff --git a/python/__init__.py b/python/__init__.py
new file mode 100644
index 0000000..0b909a7
--- /dev/null
+++ b/python/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__version__ = '0.1.0'
diff --git a/python/pycarbon/__init__.py b/python/pycarbon/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/pycarbon/sdk/ArrowCarbonReader.py b/python/pycarbon/sdk/ArrowCarbonReader.py
new file mode 100644
index 0000000..4b9a88a
--- /dev/null
+++ b/python/pycarbon/sdk/ArrowCarbonReader.py
@@ -0,0 +1,99 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ctypes
+
+import pyarrow as pa
+from modelarts import manifest
+from modelarts.field_name import CARBON
+
+from pycarbon.sdk.Constants import LOCAL_FILE_PREFIX
+
+class ArrowCarbonReader(object):
+  def __init__(self):
+    from jnius import autoclass
+    self.readerClass = 
autoclass('org.apache.carbondata.sdk.file.ArrowCarbonReader')
+
+  def builder(self, input_split):
+    self.input_split = input_split
+    self.ArrowCarbonReaderBuilder = self.readerClass.builder(input_split)
+    return self
+
+  def projection(self, projection_list):
+    self.ArrowCarbonReaderBuilder.projection(projection_list)
+    return self
+
+  def withHadoopConf(self, key, value):
+    if "fs.s3a.access.key" == key:
+      self.ak = value
+    elif "fs.s3a.secret.key" == key:
+      self.sk = value
+    elif "fs.s3a.endpoint" == key:
+      self.end_point = value
+    elif "fs.s3a.proxy.host" == key:
+      self.host = value
+    elif "fs.s3a.proxy.port" == key:
+      self.port = value
+
+    self.ArrowCarbonReaderBuilder.withHadoopConf(key, value)
+    return self
+
+  def build(self):
+    self.reader = self.ArrowCarbonReaderBuilder.buildArrowReader()
+    return self
+
+  def withFileLists(self, file_list):
+    self.ArrowCarbonReaderBuilder.withFileLists(file_list)
+    return self
+
+  def getSplits(self, is_blocklet_split):
+    from jnius import autoclass
+
+    java_list_class = autoclass('java.util.ArrayList')
+
+    if str(self.input_split).endswith(".manifest"):
+      if str(self.input_split).startswith(LOCAL_FILE_PREFIX):
+        self.manifest_path = str(self.input_split)[len(LOCAL_FILE_PREFIX):]
+      else:
+        self.manifest_path = self.input_split
+
+      from obs import ObsClient
+      if str(self.input_split).startswith("s3"):
+        obsClient = ObsClient(access_key_id=self.ak, secret_access_key=self.sk,
+                              server=str(self.end_point).replace('http://', 
''),
+                              long_conn_mode=True)
+        sources = manifest.getSources(self.manifest_path, CARBON, obsClient)
+        self.file_path = sources[0]
+      else:
+        sources = manifest.getSources(self.manifest_path, CARBON)
+      java_list = java_list_class()
+      for source in sources:
+        java_list.add(source)
+      return 
self.ArrowCarbonReaderBuilder.withFileLists(java_list).getSplits(is_blocklet_split)
+    else:
+      return self.ArrowCarbonReaderBuilder.getSplits(is_blocklet_split)
+
+  def read(self, schema):
+    address = self.reader.readArrowBatchAddress(schema)
+    size = (ctypes.c_int32).from_address(address).value
+    arrowData = (ctypes.c_byte * size).from_address(address + 4)
+    rawData = bytes(arrowData)
+    self.reader.freeArrowBatchMemory(address)
+    reader = pa.RecordBatchFileReader(pa.BufferReader(rawData))
+    data = reader.read_all()
+    return data
+
+  def close(self):
+    return self.reader.close()
diff --git a/python/pycarbon/sdk/CarbonReader.py b/python/pycarbon/sdk/CarbonReader.py
new file mode 100644
index 0000000..e06ab2f
--- /dev/null
+++ b/python/pycarbon/sdk/CarbonReader.py
@@ -0,0 +1,162 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class CarbonReader(object):
+  """
+  How to create CarbonReader:
+  1. new CarbonReader() and call .builder() first
+  2. call different configuration, like  .withFolder(path)
+  3. call .build() to create CarbonReader
+  4. read data by hasNext(), readNextRow(), readNextBatchRow(), 
readArrowBatch()
+  """
+
+  def __init__(self):
+    from jnius import autoclass
+    self.readerClass = autoclass('org.apache.carbondata.sdk.file.CarbonReader')
+
+  def builder(self):
+    """
+    :return: updated CarbonReader
+    """
+    self.CarbonReaderBuilder = self.readerClass.builder()
+    return self
+
+  def withFile(self, file_name):
+    """
+    read carbonData file from the file
+
+    :param file_name: CarbonData file name
+    :return: updated CarbonReader
+    """
+    self.CarbonReaderBuilder.withFile(file_name)
+    return self
+
+  def withFileLists(self, file_lists):
+    """
+    read carbonData file from the file list
+
+    :param file_lists: CarbonData file list
+    :return: updated CarbonReader
+    """
+    self.CarbonReaderBuilder.withFileLists(file_lists)
+    return self
+
+  def withFolder(self, folder_name):
+    """
+    read carbonData file from this folder
+
+    :param file_name: folder name
+    :return: updated CarbonReader
+    """
+    self.CarbonReaderBuilder.withFolder(folder_name)
+    return self
+
+  def withBatch(self, batch_size):
+    """
+
+    :param batch_size:
+    :return: updated CarbonReader
+    """
+    self.CarbonReaderBuilder.withBatch(batch_size)
+    return self
+
+  def projection(self, projection_list):
+    """
+    Configure the projection column names of carbon reader
+
+    :param projection_list: the list of projection column names
+    :return: updated CarbonReader
+    """
+    self.CarbonReaderBuilder.projection(projection_list)
+    return self
+
+  def filterEqual(self, column_name, value):
+    """
+    filter column name equal value
+    :param column_name: column_name in CarbonData file
+    :param value: the value of column_name
+    :return: updated CarbonReader
+    """
+    from jnius import autoclass
+    equal_to_expression_class = 
autoclass('org.apache.carbondata.core.scan.expression.conditional.EqualToExpression')
+    data_types_class = 
autoclass('org.apache.carbondata.core.metadata.datatype.DataTypes')
+    column_expression_class = 
autoclass('org.apache.carbondata.core.scan.expression.ColumnExpression')
+    literal_expression_class = 
autoclass('org.apache.carbondata.core.scan.expression.LiteralExpression')
+
+    column_expression = column_expression_class(column_name, 
data_types_class.STRING)
+    literal_expression = literal_expression_class(value, 
data_types_class.STRING)
+    equal_to_expression = equal_to_expression_class(column_expression, 
literal_expression)
+
+    self.CarbonReaderBuilder.filter(equal_to_expression)
+    return self
+
+  def withHadoopConf(self, key, value):
+    """
+    To support hadoop configuration, can set s3a AK,SK,end point and other 
conf with this
+
+    :param key: key word  of configuration
+    :param value: value of configuration
+    :return: updated CarbonReader
+    """
+    self.CarbonReaderBuilder.withHadoopConf(key, value)
+    return self
+
+  def build(self):
+    """
+    Build CarbonReader
+
+    :return:
+    """
+    self.reader = self.CarbonReaderBuilder.build()
+    return self
+
+  def splitAsArray(self, maxSplits):
+    return self.reader.split(maxSplits)
+
+  def hasNext(self):
+    """
+    Return true if has next row
+
+    :return:
+    """
+    return self.reader.hasNext()
+
+  def readNextRow(self):
+    """
+    Read and return next row object
+
+    :return:
+    """
+    return self.reader.readNextRow()
+
+  def readNextBatchRow(self):
+    """
+    Read and return next batch row objects
+
+    :return:
+    """
+    return self.reader.readNextBatchRow()
+
+  def readArrowBatch(self, schema):
+    return self.reader.readArrowBatch(schema)
+
+  def close(self):
+    """
+    Close reader
+
+    :return:
+    """
+    return self.reader.close()
diff --git a/python/pycarbon/sdk/CarbonSchemaReader.py b/python/pycarbon/sdk/CarbonSchemaReader.py
new file mode 100644
index 0000000..b7bc7aa
--- /dev/null
+++ b/python/pycarbon/sdk/CarbonSchemaReader.py
@@ -0,0 +1,64 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class CarbonSchemaReader(object):
+  """
+  How to use it:
+  1. create CarbonSchemaReader: carbonSchemaReader = CarbonSchemaReader()
+  2. read schema from path: schema = carbonSchemaReader.readSchema(path)
+  path can be a folder, carbonindex file and carbondata file.
+  """
+  def __init__(self):
+    from jnius import autoclass
+    self.carbonSchemaReader = 
autoclass('org.apache.carbondata.sdk.file.CarbonSchemaReader')
+    self.Schema = autoclass('org.apache.carbondata.sdk.file.Schema')
+
+  def readSchema(self, path, getAsBuffer=False, validateSchema=False, 
conf=None):
+    """
+    Read CarbonData schema from path.
+    :param path: data path, path can be a folder, carbonindex file name and 
carbondata file name
+    :param getAsBuffer:  whether get as buffer
+    :param validateSchema: whether validate schema.
+    :param conf: configuration for ak, sk, endpoint and so on.
+    :return: CarbonData schema.
+    """
+    if getAsBuffer == True:
+      return self.carbonSchemaReader.getArrowSchemaAsBytes(path)
+    if conf is None:
+      schema = self.carbonSchemaReader.readSchema(path, validateSchema)
+    else:
+      schema = self.carbonSchemaReader.readSchema(path, validateSchema, conf)
+    newSchema = schema.asOriginOrder()
+    return newSchema
+
+  def reorderSchemaBasedOnProjection(self, columns, schema):
+    fields = schema.getFields()
+    updateFields = list()
+    for column in columns:
+      for field in fields:
+        if column.casefold() == field.getFieldName().casefold():
+          updateFields.append(field)
+          break
+
+    updatedSchema = self.Schema(updateFields)
+    return updatedSchema
+
+  def getProjectionBasedOnSchema(self, schema):
+    fields = schema.getFields()
+    projection = list()
+    for field in fields:
+      projection.append(field.getFieldName())
+    return projection
diff --git a/python/pycarbon/sdk/CarbonWriter.py b/python/pycarbon/sdk/CarbonWriter.py
new file mode 100644
index 0000000..aa7bef6
--- /dev/null
+++ b/python/pycarbon/sdk/CarbonWriter.py
@@ -0,0 +1,120 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class CarbonWriter(object):
+  """
+  How to create CarbonWriter:
+  1. new CarbonWriter() and call .builder() first
+  2. call different configuration, like  .outputPath(path)
+  3. call .build() to create CarbonWriter
+  4. write data by write()
+  5. call close() to write data to local/HDFS/S3
+  """
+  def __init__(self):
+    from jnius import autoclass
+    self.writerClass = autoclass('org.apache.carbondata.sdk.file.CarbonWriter')
+
+  def builder(self):
+    self.CarbonWriterBuilder = self.writerClass.builder()
+    return self
+
+  def outputPath(self, path):
+    """
+    Sets the output path of the writer builder
+
+    :param path: is the absolute path where output files are written
+                This method must be called when building CarbonWriterBuilder
+    :return: updated CarbonWriter
+    """
+    self.CarbonWriterBuilder.outputPath(path)
+    return self
+
+  def withCsvInput(self, jsonSchema):
+    """
+    accepts row in CSV format
+
+    :param jsonSchema:json format schema
+    :return: updated CarbonWriter object
+    """
+    self.CarbonWriterBuilder.withCsvInput(jsonSchema)
+    return self
+
+  def writtenBy(self, name):
+    """
+    Record the name who write this CarbonData file
+    :param name: The name is writing the CarbonData files
+    :return:  updated CarbonWriter object
+    """
+    self.CarbonWriterBuilder.writtenBy(name)
+    return self
+
+  def withLoadOption(self, key, value):
+    """
+    To support the load options for sdk writer
+
+    :param key: the key of load option
+    :param value:  the value of load option
+    :return:  updated CarbonWriter object
+    """
+    self.CarbonWriterBuilder.withLoadOption(key, value)
+    return self
+
+  def withPageSizeInMb(self, value):
+    """
+     To set the blocklet size of CarbonData file
+
+    :param value: is page size in MB
+    :return: updated CarbonWriter
+    """
+    self.CarbonWriterBuilder.withPageSizeInMb(value)
+    return self
+
+  def withHadoopConf(self, key, value):
+    """
+    To support hadoop configuration, can set s3a AK,SK,end point and other 
conf with this
+
+    :param key: key word  of configuration
+    :param value: value of configuration
+    :return: updated CarbonWriter
+    """
+    self.CarbonWriterBuilder.withHadoopConf(key, value)
+    return self
+
+  def build(self):
+    """
+    Build a  CarbonWriter
+    This writer is not thread safe,
+    use withThreadSafe() configuration in multi thread environment
+    :return:
+    """
+    self.writer = self.CarbonWriterBuilder.build()
+    return self
+
+  def write(self, data):
+    """
+    Write an object to the file, the format of the object depends on the 
implementation.
+    :param data:
+    :return:
+    """
+    return self.writer.write(data)
+
+  def close(self):
+    """
+     Flush and close the writer
+
+    :return:
+    """
+    return self.writer.close()
diff --git a/python/pycarbon/sdk/Configuration.py b/python/pycarbon/sdk/Configuration.py
new file mode 100644
index 0000000..78040ec
--- /dev/null
+++ b/python/pycarbon/sdk/Configuration.py
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+class Configuration(object):
+  def __init__(self):
+    from jnius import autoclass
+    ConfigurationClass = autoclass('org.apache.hadoop.conf.Configuration')
+    self.conf = ConfigurationClass()
+
+  def set(self, key, value):
+    self.conf.set(key, value)
+    return self.conf
diff --git a/python/pycarbon/sdk/Constants.py b/python/pycarbon/sdk/Constants.py
new file mode 100644
index 0000000..7fc304b
--- /dev/null
+++ b/python/pycarbon/sdk/Constants.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+LOCAL_FILE_PREFIX = "file://"
diff --git a/python/pycarbon/sdk/SDKUtil.py b/python/pycarbon/sdk/SDKUtil.py
new file mode 100644
index 0000000..05fda02
--- /dev/null
+++ b/python/pycarbon/sdk/SDKUtil.py
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class SDKUtil(object):
+  def __init__(self):
+    from jnius import autoclass
+    self.SDKUtilClass = 
autoclass('org.apache.carbondata.sdk.file.utils.SDKUtil')
+
+  def readBinary(self, path):
+    return self.SDKUtilClass.readBinary(path)
diff --git a/python/pycarbon/sdk/__init__.py b/python/pycarbon/sdk/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/pycarbon/test/.coveragerc b/python/pycarbon/test/.coveragerc
new file mode 100644
index 0000000..7fc46f5
--- /dev/null
+++ b/python/pycarbon/test/.coveragerc
@@ -0,0 +1,4 @@
+[run]
+omit =
+    */__init__.py
+
diff --git a/python/pycarbon/test/__init__.py b/python/pycarbon/test/__init__.py
new file mode 100644
index 0000000..ccedf21
--- /dev/null
+++ b/python/pycarbon/test/__init__.py
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+curdir = os.path.dirname(os.path.realpath(__file__))
+
+jardir = os.path.abspath(os.path.join(curdir, os.path.pardir, os.path.pardir))
+
+DEFAULT_CARBONSDK_PATH = os.path.join(jardir, 'jars/carbondata-sdk.jar')
+
+S3_DATA_PATH = 's3a://sdk/binary'
+S3_DATA_PATH1 = 's3a://sdk/binary/sub1'
+S3_DATA_PATH2 = 's3a://sdk/binary/sub2'
+
+EXAMPLES_MANIFEST_PATH = os.path.join(jardir, 'examples/data/')
+
+LOCAL_DATA_PATH = os.path.join(jardir, 'examples/data/binary')
+
+IMAGE_DATA_PATH = os.path.join(jardir, 'examples/data/image')
diff --git a/python/pycarbon/test/resources/carbondatalogo.jpg b/python/pycarbon/test/resources/carbondatalogo.jpg
new file mode 100644
index 0000000..3469469
Binary files /dev/null and b/python/pycarbon/test/resources/carbondatalogo.jpg differ
diff --git a/python/pycarbon/test/resources/carbondatalogo2.jpg b/python/pycarbon/test/resources/carbondatalogo2.jpg
new file mode 100644
index 0000000..acbdb15
Binary files /dev/null and b/python/pycarbon/test/resources/carbondatalogo2.jpg differ
diff --git a/python/pycarbon/test/resources/flowers/10686568196_b1915544a8.jpg b/python/pycarbon/test/resources/flowers/10686568196_b1915544a8.jpg
new file mode 100644
index 0000000..12937a0
Binary files /dev/null and b/python/pycarbon/test/resources/flowers/10686568196_b1915544a8.jpg differ
diff --git a/python/pycarbon/test/resources/flowers/10686568196_b1915544a8.txt b/python/pycarbon/test/resources/flowers/10686568196_b1915544a8.txt
new file mode 100644
index 0000000..12f7d78
--- /dev/null
+++ b/python/pycarbon/test/resources/flowers/10686568196_b1915544a8.txt
@@ -0,0 +1 @@
+tulips
\ No newline at end of file
diff --git a/python/pycarbon/test/resources/flowers/10712722853_5632165b04.jpg b/python/pycarbon/test/resources/flowers/10712722853_5632165b04.jpg
new file mode 100644
index 0000000..48591bf
Binary files /dev/null and b/python/pycarbon/test/resources/flowers/10712722853_5632165b04.jpg differ
diff --git a/python/pycarbon/test/resources/flowers/10712722853_5632165b04.txt b/python/pycarbon/test/resources/flowers/10712722853_5632165b04.txt
new file mode 100644
index 0000000..84bd766
--- /dev/null
+++ b/python/pycarbon/test/resources/flowers/10712722853_5632165b04.txt
@@ -0,0 +1 @@
+daisy
\ No newline at end of file
diff --git a/python/pycarbon/test/resources/flowers/subfolder/10841136265_af473efc60.jpg b/python/pycarbon/test/resources/flowers/subfolder/10841136265_af473efc60.jpg
new file mode 100644
index 0000000..0822034
Binary files /dev/null and b/python/pycarbon/test/resources/flowers/subfolder/10841136265_af473efc60.jpg differ
diff --git a/python/pycarbon/test/resources/flowers/subfolder/10841136265_af473efc60.txt b/python/pycarbon/test/resources/flowers/subfolder/10841136265_af473efc60.txt
new file mode 100644
index 0000000..84bd766
--- /dev/null
+++ b/python/pycarbon/test/resources/flowers/subfolder/10841136265_af473efc60.txt
@@ -0,0 +1 @@
+daisy
\ No newline at end of file
diff --git a/python/pycarbon/test/resources/voc/2007_000027.jpg b/python/pycarbon/test/resources/voc/2007_000027.jpg
new file mode 100755
index 0000000..fe9ba8c
Binary files /dev/null and b/python/pycarbon/test/resources/voc/2007_000027.jpg differ
diff --git a/python/pycarbon/test/resources/voc/2007_000027.xml b/python/pycarbon/test/resources/voc/2007_000027.xml
new file mode 100755
index 0000000..576da53
--- /dev/null
+++ b/python/pycarbon/test/resources/voc/2007_000027.xml
@@ -0,0 +1,63 @@
+<annotation>
+       <folder>VOC2012</folder>
+       <filename>2007_000027.jpg</filename>
+       <source>
+               <database>The VOC2007 Database</database>
+               <annotation>PASCAL VOC2007</annotation>
+               <image>flickr</image>
+       </source>
+       <size>
+               <width>486</width>
+               <height>500</height>
+               <depth>3</depth>
+       </size>
+       <segmented>0</segmented>
+       <object>
+               <name>person</name>
+               <pose>Unspecified</pose>
+               <truncated>0</truncated>
+               <difficult>0</difficult>
+               <bndbox>
+                       <xmin>174</xmin>
+                       <ymin>101</ymin>
+                       <xmax>349</xmax>
+                       <ymax>351</ymax>
+               </bndbox>
+               <part>
+                       <name>head</name>
+                       <bndbox>
+                               <xmin>169</xmin>
+                               <ymin>104</ymin>
+                               <xmax>209</xmax>
+                               <ymax>146</ymax>
+                       </bndbox>
+               </part>
+               <part>
+                       <name>hand</name>
+                       <bndbox>
+                               <xmin>278</xmin>
+                               <ymin>210</ymin>
+                               <xmax>297</xmax>
+                               <ymax>233</ymax>
+                       </bndbox>
+               </part>
+               <part>
+                       <name>foot</name>
+                       <bndbox>
+                               <xmin>273</xmin>
+                               <ymin>333</ymin>
+                               <xmax>297</xmax>
+                               <ymax>354</ymax>
+                       </bndbox>
+               </part>
+               <part>
+                       <name>foot</name>
+                       <bndbox>
+                               <xmin>319</xmin>
+                               <ymin>307</ymin>
+                               <xmax>340</xmax>
+                               <ymax>326</ymax>
+                       </bndbox>
+               </part>
+       </object>
+</annotation>
diff --git a/python/pycarbon/test/resources/voc/2007_000032.jpg b/python/pycarbon/test/resources/voc/2007_000032.jpg
new file mode 100755
index 0000000..b111b5a
Binary files /dev/null and b/python/pycarbon/test/resources/voc/2007_000032.jpg differ
diff --git a/python/pycarbon/test/resources/voc/2007_000032.xml b/python/pycarbon/test/resources/voc/2007_000032.xml
new file mode 100755
index 0000000..779abb6
--- /dev/null
+++ b/python/pycarbon/test/resources/voc/2007_000032.xml
@@ -0,0 +1,63 @@
+<annotation>
+       <folder>VOC2012</folder>
+       <filename>2007_000032.jpg</filename>
+       <source>
+               <database>The VOC2007 Database</database>
+               <annotation>PASCAL VOC2007</annotation>
+               <image>flickr</image>
+       </source>
+       <size>
+               <width>500</width>
+               <height>281</height>
+               <depth>3</depth>
+       </size>
+       <segmented>1</segmented>
+       <object>
+               <name>aeroplane</name>
+               <pose>Frontal</pose>
+               <truncated>0</truncated>
+               <difficult>0</difficult>
+               <bndbox>
+                       <xmin>104</xmin>
+                       <ymin>78</ymin>
+                       <xmax>375</xmax>
+                       <ymax>183</ymax>
+               </bndbox>
+       </object>
+       <object>
+               <name>aeroplane</name>
+               <pose>Left</pose>
+               <truncated>0</truncated>
+               <difficult>0</difficult>
+               <bndbox>
+                       <xmin>133</xmin>
+                       <ymin>88</ymin>
+                       <xmax>197</xmax>
+                       <ymax>123</ymax>
+               </bndbox>
+       </object>
+       <object>
+               <name>person</name>
+               <pose>Rear</pose>
+               <truncated>0</truncated>
+               <difficult>0</difficult>
+               <bndbox>
+                       <xmin>195</xmin>
+                       <ymin>180</ymin>
+                       <xmax>213</xmax>
+                       <ymax>229</ymax>
+               </bndbox>
+       </object>
+       <object>
+               <name>person</name>
+               <pose>Rear</pose>
+               <truncated>0</truncated>
+               <difficult>0</difficult>
+               <bndbox>
+                       <xmin>26</xmin>
+                       <ymin>189</ymin>
+                       <xmax>44</xmax>
+                       <ymax>238</ymax>
+               </bndbox>
+       </object>
+</annotation>
diff --git a/python/pycarbon/test/resources/voc/2007_000033.jpg b/python/pycarbon/test/resources/voc/2007_000033.jpg
new file mode 100755
index 0000000..01f478f
Binary files /dev/null and b/python/pycarbon/test/resources/voc/2007_000033.jpg differ
diff --git a/python/pycarbon/test/resources/voc/2007_000033.xml b/python/pycarbon/test/resources/voc/2007_000033.xml
new file mode 100755
index 0000000..61899d6
--- /dev/null
+++ b/python/pycarbon/test/resources/voc/2007_000033.xml
@@ -0,0 +1,51 @@
+<annotation>
+       <folder>VOC2012</folder>
+       <filename>2007_000033.jpg</filename>
+       <source>
+               <database>The VOC2007 Database</database>
+               <annotation>PASCAL VOC2007</annotation>
+               <image>flickr</image>
+       </source>
+       <size>
+               <width>500</width>
+               <height>366</height>
+               <depth>3</depth>
+       </size>
+       <segmented>1</segmented>
+       <object>
+               <name>aeroplane</name>
+               <pose>Unspecified</pose>
+               <truncated>0</truncated>
+               <difficult>0</difficult>
+               <bndbox>
+                       <xmin>9</xmin>
+                       <ymin>107</ymin>
+                       <xmax>499</xmax>
+                       <ymax>263</ymax>
+               </bndbox>
+       </object>
+       <object>
+               <name>aeroplane</name>
+               <pose>Left</pose>
+               <truncated>0</truncated>
+               <difficult>0</difficult>
+               <bndbox>
+                       <xmin>421</xmin>
+                       <ymin>200</ymin>
+                       <xmax>482</xmax>
+                       <ymax>226</ymax>
+               </bndbox>
+       </object>
+       <object>
+               <name>aeroplane</name>
+               <pose>Left</pose>
+               <truncated>1</truncated>
+               <difficult>0</difficult>
+               <bndbox>
+                       <xmin>325</xmin>
+                       <ymin>188</ymin>
+                       <xmax>411</xmax>
+                       <ymax>223</ymax>
+               </bndbox>
+       </object>
+</annotation>
diff --git a/python/pycarbon/test/resources/voc/2007_000039.jpg b/python/pycarbon/test/resources/voc/2007_000039.jpg
new file mode 100755
index 0000000..1a3b717
Binary files /dev/null and b/python/pycarbon/test/resources/voc/2007_000039.jpg differ
diff --git a/python/pycarbon/test/resources/voc/2007_000039.xml b/python/pycarbon/test/resources/voc/2007_000039.xml
new file mode 100755
index 0000000..bc73f4e
--- /dev/null
+++ b/python/pycarbon/test/resources/voc/2007_000039.xml
@@ -0,0 +1,27 @@
+<annotation>
+       <folder>VOC2012</folder>
+       <filename>2007_000039.jpg</filename>
+       <source>
+               <database>The VOC2007 Database</database>
+               <annotation>PASCAL VOC2007</annotation>
+               <image>flickr</image>
+       </source>
+       <size>
+               <width>500</width>
+               <height>375</height>
+               <depth>3</depth>
+       </size>
+       <segmented>1</segmented>
+       <object>
+               <name>tvmonitor</name>
+               <pose>Frontal</pose>
+               <truncated>0</truncated>
+               <difficult>0</difficult>
+               <bndbox>
+                       <xmin>156</xmin>
+                       <ymin>89</ymin>
+                       <xmax>344</xmax>
+                       <ymax>279</ymax>
+               </bndbox>
+       </object>
+</annotation>
diff --git a/python/pycarbon/test/resources/voc/2009_001444.jpg b/python/pycarbon/test/resources/voc/2009_001444.jpg
new file mode 100755
index 0000000..f01c62c
Binary files /dev/null and b/python/pycarbon/test/resources/voc/2009_001444.jpg differ
diff --git a/python/pycarbon/test/resources/voc/2009_001444.xml b/python/pycarbon/test/resources/voc/2009_001444.xml
new file mode 100755
index 0000000..9a68cbc
--- /dev/null
+++ b/python/pycarbon/test/resources/voc/2009_001444.xml
@@ -0,0 +1,28 @@
+<annotation>
+       <filename>2009_001444.jpg</filename>
+       <folder>VOC2012</folder>
+       <object>
+               <name>cat</name>
+               <bndbox>
+                       <xmax>344</xmax>
+                       <xmin>1</xmin>
+                       <ymax>388</ymax>
+                       <ymin>1</ymin>
+               </bndbox>
+               <difficult>0</difficult>
+               <occluded>0</occluded>
+               <pose>Unspecified</pose>
+               <truncated>1</truncated>
+       </object>
+       <segmented>1</segmented>
+       <size>
+               <depth>3</depth>
+               <height>388</height>
+               <width>500</width>
+       </size>
+       <source>
+               <annotation>PASCAL VOC2009</annotation>
+               <database>The VOC2009 Database</database>
+               <image>flickr</image>
+       </source>
+</annotation>
diff --git a/python/pycarbon/test/resources/vocForSegmentationClass/2007_000032.jpg b/python/pycarbon/test/resources/vocForSegmentationClass/2007_000032.jpg
new file mode 100755
index 0000000..b111b5a
Binary files /dev/null and b/python/pycarbon/test/resources/vocForSegmentationClass/2007_000032.jpg differ
diff --git a/python/pycarbon/test/resources/vocForSegmentationClass/2007_000032.png b/python/pycarbon/test/resources/vocForSegmentationClass/2007_000032.png
new file mode 100755
index 0000000..1f7181c
Binary files /dev/null and b/python/pycarbon/test/resources/vocForSegmentationClass/2007_000032.png differ
diff --git a/python/pycarbon/test/resources/vocForSegmentationClass/2007_000033.jpg b/python/pycarbon/test/resources/vocForSegmentationClass/2007_000033.jpg
new file mode 100755
index 0000000..01f478f
Binary files /dev/null and b/python/pycarbon/test/resources/vocForSegmentationClass/2007_000033.jpg differ
diff --git a/python/pycarbon/test/resources/vocForSegmentationClass/2007_000033.png b/python/pycarbon/test/resources/vocForSegmentationClass/2007_000033.png
new file mode 100755
index 0000000..bbeb3f4
Binary files /dev/null and b/python/pycarbon/test/resources/vocForSegmentationClass/2007_000033.png differ
diff --git a/python/pycarbon/test/resources/vocForSegmentationClass/2007_000042.jpg b/python/pycarbon/test/resources/vocForSegmentationClass/2007_000042.jpg
new file mode 100755
index 0000000..2188d51
Binary files /dev/null and b/python/pycarbon/test/resources/vocForSegmentationClass/2007_000042.jpg differ
diff --git a/python/pycarbon/test/resources/vocForSegmentationClass/2007_000042.png b/python/pycarbon/test/resources/vocForSegmentationClass/2007_000042.png
new file mode 100755
index 0000000..73b6059
Binary files /dev/null and b/python/pycarbon/test/resources/vocForSegmentationClass/2007_000042.png differ
diff --git a/python/pycarbon/test/test_read_write_carbon.py b/python/pycarbon/test/test_read_write_carbon.py
new file mode 100644
index 0000000..ab02eef
--- /dev/null
+++ b/python/pycarbon/test/test_read_write_carbon.py
@@ -0,0 +1,468 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from pycarbon.sdk.CarbonReader import CarbonReader
+from pycarbon.sdk.CarbonSchemaReader import CarbonSchemaReader
+from pycarbon.sdk.CarbonWriter import CarbonWriter
+
+import base64
+import time
+import shutil
+import os
+import jnius_config
+
+jnius_config.set_classpath("../../../store/sdk/target/carbondata-sdk.jar")
+IMAGE_DATA_PATH = "./resources"
+
+def test_run_write_carbon():
+  jsonSchema = "[{stringField:string},{shortField:short},{intField:int}]"
+  path = "/tmp/data/writeCarbon" + str(time.time())
+
+  if os.path.exists(path):
+    shutil.rmtree(path)
+
+  writer = CarbonWriter() \
+    .builder() \
+    .outputPath(path) \
+    .withCsvInput(jsonSchema) \
+    .writtenBy("pycarbon") \
+    .build()
+
+  for i in range(0, 10):
+    from jnius import autoclass
+    arrayListClass = autoclass("java.util.ArrayList")
+    data_list = arrayListClass()
+    data_list.add("pycarbon")
+    data_list.add(str(i))
+    data_list.add(str(i * 10))
+    writer.write(data_list.toArray())
+
+  writer.close()
+
+  reader = CarbonReader() \
+    .builder() \
+    .withFolder(path) \
+    .withBatch(1000) \
+    .build()
+
+  i = 0
+  while reader.hasNext():
+    rows = reader.readNextBatchRow()
+    i += len(rows)
+
+  assert 10 == i
+  reader.close()
+
+
+  carbonSchemaReader = CarbonSchemaReader()
+  schema = carbonSchemaReader.readSchema(path)
+
+  assert 3 == schema.getFieldsLength()
+
+
+  writer = CarbonWriter() \
+    .builder() \
+    .outputPath(path) \
+    .withCsvInput(jsonSchema) \
+    .writtenBy("pycarbon") \
+    .build()
+
+  for i in range(0, 10):
+    from jnius import autoclass
+    arrayListClass = autoclass("java.util.ArrayList")
+    data_list = arrayListClass()
+    data_list.add("pycarbon")
+    data_list.add(str(i))
+    data_list.add(str(i * 10))
+    writer.write(data_list.toArray())
+
+  writer.close()
+
+  carbonSchemaReader = CarbonSchemaReader()
+  schema = carbonSchemaReader.readSchema(getAsBuffer=False, path=path, validateSchema=True)
+
+  assert 3 == schema.getFieldsLength()
+
+  shutil.rmtree(path)
+
+
+def test_run_write_carbon_binary_base64_encode():
+  jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary}]"
+  path = "/tmp/data/writeCarbon" + str(time.time())
+
+  if os.path.exists(path):
+    shutil.rmtree(path)
+
+  jpg_path = IMAGE_DATA_PATH + "/carbondatalogo.jpg"
+
+  writer = CarbonWriter() \
+    .builder() \
+    .outputPath(path) \
+    .withCsvInput(jsonSchema) \
+    .writtenBy("pycarbon") \
+    .build()
+
+  with open(jpg_path, mode='rb+') as file_object:
+    content = file_object.read()
+
+  for i in range(0, 10):
+    from jnius import autoclass
+
+    arrayListClass = autoclass("java.util.ArrayList")
+    data_list = arrayListClass()
+    data_list.add("pycarbon")
+    data_list.add(str(i))
+    data_list.add(str(i * 10))
+    data_list.add(base64.b64encode(content))
+    writer.write(data_list.toArray())
+
+  writer.close()
+
+  reader = CarbonReader() \
+    .builder() \
+    .withFolder(path) \
+    .withBatch(1000) \
+    .build()
+
+  i = 0
+  while reader.hasNext():
+    rows = reader.readNextBatchRow()
+    for row in rows:
+      i += 1
+      for column in row:
+        from jnius.jnius import ByteArray
+        if 1 == i and isinstance(column, ByteArray) and len(column) > 1000:
+          with open(path + "/image.jpg", 'wb+') as file_object:
+            file_object.write(base64.b64decode(column.tostring()))
+
+  assert 10 == i
+  reader.close()
+
+  shutil.rmtree(path)
+
+
+# TODO: to be supported
+@pytest.mark.skip("write binary to be supported")
+def test_run_write_carbon_binary():
+  jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary}]"
+  path = "/tmp/data/writeCarbon" + str(time.time())
+
+  if os.path.exists(path):
+    shutil.rmtree(path)
+
+  jpg_path = IMAGE_DATA_PATH + "/carbondatalogo.jpg"
+
+  writer = CarbonWriter() \
+    .builder() \
+    .outputPath(path) \
+    .withCsvInput(jsonSchema) \
+    .writtenBy("pycarbon") \
+    .build()
+
+  with open(jpg_path, mode='rb+') as file_object:
+    content = file_object.read()
+
+  for i in range(0, 10):
+    from jnius import autoclass
+
+    arrayListClass = autoclass("java.util.ArrayList")
+    data_list = arrayListClass()
+    data_list.add("pycarbon")
+    data_list.add(str(i))
+    data_list.add(str(i * 10))
+    data_list.add(content)
+    writer.write(data_list.toArray())
+
+  writer.close()
+
+  reader = CarbonReader() \
+    .builder() \
+    .withFolder(path) \
+    .withBatch(1000) \
+    .build()
+
+  i = 0
+  while reader.hasNext():
+    rows = reader.readNextBatchRow()
+    for row in rows:
+      i += 1
+      for column in row:
+        from jnius.jnius import ByteArray
+        if 1 == i and isinstance(column, ByteArray) and len(column) > 1000:
+          with open(path + "/image.jpg", 'wb+') as file_object:
+            file_object.write(column.tostring())
+
+  assert 10 == i
+  reader.close()
+
+  shutil.rmtree(path)
+
+
+def test_run_write_carbon_binary_base64_encode_many_files():
+  jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary},{txtField:string}]"
+  path = "/tmp/data/writeCarbon" + str(time.time())
+
+  if os.path.exists(path):
+    shutil.rmtree(path)
+
+  jpg_path = IMAGE_DATA_PATH + "/flowers"
+
+  from jnius import autoclass
+
+  sdkUtilClass = autoclass("org.apache.carbondata.sdk.file.utils.SDKUtil")
+  jpg_files = sdkUtilClass.listFiles(jpg_path, '.jpg')
+
+  writer = CarbonWriter() \
+    .builder() \
+    .outputPath(path) \
+    .withCsvInput(jsonSchema) \
+    .writtenBy("pycarbon") \
+    .build()
+
+  for i in range(0, jpg_files.size()):
+    jpg_path = jpg_files.get(i)
+    with open(jpg_path, mode='rb+') as file_object:
+      content = file_object.read()
+
+    with open(str(jpg_path).replace('.jpg', '.txt'), mode='r+') as file_object:
+      txt = file_object.read()
+
+    arrayListClass = autoclass("java.util.ArrayList")
+    data_list = arrayListClass()
+    data_list.add("pycarbon")
+    data_list.add(str(i))
+    data_list.add(str(i * 10))
+    data_list.add(base64.b64encode(content))
+    data_list.add(txt)
+    writer.write(data_list.toArray())
+
+  writer.close()
+
+  reader = CarbonReader() \
+    .builder() \
+    .withFolder(path) \
+    .build()
+
+  i = 0
+  while reader.hasNext():
+    rows = reader.readNextBatchRow()
+
+    for row in rows:
+      i += 1
+      for column in row:
+        from jnius.jnius import ByteArray
+        if isinstance(column, ByteArray) and len(column) > 1000:
+          with open(path + "/image" + str(i) + ".jpg", 'wb+') as file_object:
+            file_object.write(base64.b64decode(column.tostring()))
+
+  assert 3 == i
+  reader.close()
+
+  shutil.rmtree(path)
+
+
+def test_run_write_carbon_binary_base64_encode_voc():
+  jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary},{txtField:string}]"
+  path = "/tmp/data/writeCarbon" + str(time.time())
+
+  if os.path.exists(path):
+    shutil.rmtree(path)
+
+  jpg_path = IMAGE_DATA_PATH + "/voc"
+
+  from jnius import autoclass
+
+  sdkUtilClass = autoclass("org.apache.carbondata.sdk.file.utils.SDKUtil")
+  jpg_files = sdkUtilClass.listFiles(jpg_path, '.jpg')
+
+  writer = CarbonWriter() \
+    .builder() \
+    .outputPath(path) \
+    .withCsvInput(jsonSchema) \
+    .writtenBy("pycarbon") \
+    .build()
+
+  for i in range(0, jpg_files.size()):
+    jpg_path = jpg_files.get(i)
+    with open(jpg_path, mode='rb+') as file_object:
+      content = file_object.read()
+
+    with open(str(jpg_path).replace('.jpg', '.xml'), mode='r+') as file_object:
+      txt = file_object.read()
+
+    arrayListClass = autoclass("java.util.ArrayList")
+    data_list = arrayListClass()
+    data_list.add("pycarbon")
+    data_list.add(str(i))
+    data_list.add(str(i * 10))
+    data_list.add(base64.b64encode(content))
+    data_list.add(txt)
+    writer.write(data_list.toArray())
+
+  writer.close()
+
+  reader = CarbonReader() \
+    .builder() \
+    .withFolder(path) \
+    .withBatch(1000) \
+    .build()
+
+  i = 0
+  while reader.hasNext():
+    rows = reader.readNextBatchRow()
+    for row in rows:
+      i += 1
+      for column in row:
+        from jnius.jnius import ByteArray
+        if isinstance(column, ByteArray) and len(column) > 1000:
+          with open(path + "/image" + str(i) + ".jpg", 'wb+') as file_object:
+            file_object.write(base64.b64decode(column.tostring()))
+
+  assert 5 == i
+  reader.close()
+
+  shutil.rmtree(path)
+
+
+def test_run_write_carbon_binary_base64_encode_vocForSegmentationClass():
+  jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary},{segField:binary}]"
+  path = "/tmp/data/writeCarbon" + str(time.time())
+
+  if os.path.exists(path):
+    shutil.rmtree(path)
+
+  jpg_path = IMAGE_DATA_PATH + "/vocForSegmentationClass"
+
+  from jnius import autoclass
+
+  sdkUtilClass = autoclass("org.apache.carbondata.sdk.file.utils.SDKUtil")
+  jpg_files = sdkUtilClass.listFiles(jpg_path, '.jpg')
+
+  writer = CarbonWriter() \
+    .builder() \
+    .outputPath(path) \
+    .withCsvInput(jsonSchema) \
+    .writtenBy("pycarbon") \
+    .build()
+
+  for i in range(0, jpg_files.size()):
+    jpg_path = jpg_files.get(i)
+    with open(jpg_path, mode='rb+') as file_object:
+      content = file_object.read()
+
+    with open(str(jpg_path).replace('.jpg', '.png'), mode='rb+') as file_object:
+      png_data = file_object.read()
+
+    arrayListClass = autoclass("java.util.ArrayList")
+    data_list = arrayListClass()
+    data_list.add("pycarbon")
+    data_list.add(str(i))
+    data_list.add(str(i * 10))
+    data_list.add(base64.b64encode(content))
+    data_list.add(base64.b64encode(png_data))
+    writer.write(data_list.toArray())
+
+  writer.close()
+
+  reader = CarbonReader() \
+    .builder() \
+    .withFolder(path) \
+    .withBatch(1000) \
+    .build()
+
+  i = 0
+  while reader.hasNext():
+    rows = reader.readNextBatchRow()
+    for row in rows:
+      i += 1
+      num = 0
+      for column in row:
+        num += 1
+        from jnius.jnius import ByteArray
+        if isinstance(column, ByteArray) and len(column) > 1000:
+          with open(path + "/image" + str(i) + "_" + str(num) + ".jpg", 'wb+') as file_object:
+            file_object.write(base64.b64decode(column.tostring()))
+
+  assert 3 == i
+  reader.close()
+
+  shutil.rmtree(path)
+
+
+def test_run_write_carbon_binary_base64_encode_decodeInJava_many_files():
+  jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary},{txtField:string}]"
+  path = "/tmp/data/writeCarbon" + str(time.time())
+
+  if os.path.exists(path):
+    shutil.rmtree(path)
+
+  jpg_path = IMAGE_DATA_PATH + "/flowers"
+
+  from jnius import autoclass
+
+  sdkUtilClass = autoclass("org.apache.carbondata.sdk.file.utils.SDKUtil")
+  jpg_files = sdkUtilClass.listFiles(jpg_path, '.jpg')
+
+  writer = CarbonWriter() \
+    .builder() \
+    .outputPath(path) \
+    .withCsvInput(jsonSchema) \
+    .writtenBy("pycarbon") \
+    .withLoadOption("binary_decoder", "base64") \
+    .withPageSizeInMb(1) \
+    .build()
+
+  for i in range(0, jpg_files.size()):
+    jpg_path = jpg_files.get(i)
+    with open(jpg_path, mode='rb+') as file_object:
+      content = file_object.read()
+
+    with open(str(jpg_path).replace('.jpg', '.txt'), mode='r+') as file_object:
+      txt = file_object.read()
+
+    arrayListClass = autoclass("java.util.ArrayList")
+    data_list = arrayListClass()
+    data_list.add("pycarbon")
+    data_list.add(str(i))
+    data_list.add(str(i * 10))
+    data_list.add(base64.b64encode(content))
+    data_list.add(txt)
+    writer.write(data_list.toArray())
+
+  writer.close()
+
+  reader = CarbonReader() \
+    .builder() \
+    .withFolder(path) \
+    .withBatch(1000) \
+    .build()
+
+  i = 0
+  while reader.hasNext():
+    rows = reader.readNextBatchRow()
+    for row in rows:
+      i += 1
+      for column in row:
+        from jnius.jnius import ByteArray
+        if isinstance(column, ByteArray) and len(column) > 1000 and i < 20:
+          with open(path + "/image" + str(i) + ".jpg", 'wb+') as file_object:
+            file_object.write((column.tostring()))
+
+  assert 3 == i
+  reader.close()
+
+  shutil.rmtree(path)

Reply via email to