[GitHub] incubator-hawq pull request #1326: HAWQ-1575. Implemented readable Parquet p...

2018-02-02 Thread sansanichfb
Github user sansanichfb commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1326#discussion_r165799526
  
--- Diff: pxf/pxf-hdfs/src/main/java/org/apache/hawq/pxf/plugins/hdfs/ParquetDataFragmenter.java ---
@@ -0,0 +1,103 @@
+package org.apache.hawq.pxf.plugins.hdfs;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hawq.pxf.api.Fragment;
+import org.apache.hawq.pxf.api.Fragmenter;
+import org.apache.hawq.pxf.api.utilities.InputData;
+import org.apache.hawq.pxf.plugins.hdfs.utilities.HdfsUtilities;
+import org.apache.parquet.format.converter.ParquetMetadataConverter;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.ParquetInputFormat;
+import org.apache.parquet.example.data.Group;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.schema.MessageType;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+public class ParquetDataFragmenter extends Fragmenter {
+    private Job job;
+
+    public ParquetDataFragmenter(InputData md) {
+        super(md);
+        JobConf jobConf = new JobConf(new Configuration(), ParquetDataFragmenter.class);
+        try {
+            job = Job.getInstance(jobConf);
+        } catch (IOException e) {
+            throw new RuntimeException("Unable to instantiate a job for reading fragments", e);
+        }
+    }
+
+
+    @Override
+    public List<Fragment> getFragments() throws Exception {
+        String absoluteDataPath = HdfsUtilities.absoluteDataPath(inputData.getDataSource());
+        ArrayList<InputSplit> splits = getSplits(new Path(absoluteDataPath));
+
+        for (InputSplit split : splits) {
+            FileSplit fsp = (FileSplit) split;
+
+            String filepath = fsp.getPath().toUri().getPath();
+            String[] hosts = fsp.getLocations();
+
+            Path file = new Path(filepath);
+
+            ParquetMetadata metadata = ParquetFileReader.readFooter(
+                    job.getConfiguration(), file, ParquetMetadataConverter.NO_FILTER);
+            MessageType schema = metadata.getFileMetaData().getSchema();
+
+            byte[] fragmentMetadata = HdfsUtilities.prepareFragmentMetadata(fsp.getStart(), fsp.getLength(), fsp.getLocations());
--- End diff --

This method is needed to support the `org.apache.hadoop.mapreduce.lib.input.FileSplit` type.
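
(For illustration: the existing `prepareFragmentMetadata(FileSplit)` is tied to one split class, so a field-level overload lets the new fragmenter pass an `org.apache.hadoop.mapreduce.lib.input.FileSplit` as well. Below is a minimal, hypothetical caller; the class name and sample values are not part of the PR.)

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    import org.apache.hawq.pxf.plugins.hdfs.utilities.HdfsUtilities;

    public class FragmentMetadataExample {

        // Serializes a mapreduce FileSplit by unpacking it into the
        // (start, length, locations) overload shown in the diff above.
        public static byte[] metadataFor(FileSplit split) throws Exception {
            return HdfsUtilities.prepareFragmentMetadata(
                    split.getStart(), split.getLength(), split.getLocations());
        }

        public static void main(String[] args) throws Exception {
            FileSplit split = new FileSplit(new Path("/data/example.parquet"),
                    0L, 128L * 1024 * 1024, new String[]{"host-1", "host-2"});
            System.out.println(metadataFor(split).length + " bytes of fragment metadata");
        }
    }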


---


[GitHub] incubator-hawq pull request #1326: HAWQ-1575. Implemented readable Parquet p...

2018-02-02 Thread sansanichfb
Github user sansanichfb commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1326#discussion_r165783402
  
--- Diff: pxf/pxf-hdfs/src/main/java/org/apache/hawq/pxf/plugins/hdfs/utilities/HdfsUtilities.java ---
@@ -151,18 +153,42 @@ public static boolean isThreadSafe(String dataDir, String compCodec) {
      * @param fsp file split to be serialized
      * @return byte serialization of fsp
      * @throws IOException if I/O errors occur while writing to the underlying
-     * stream
+     * stream
      */
     public static byte[] prepareFragmentMetadata(FileSplit fsp)
             throws IOException {
-        ByteArrayOutputStream byteArrayStream = new ByteArrayOutputStream();
-        ObjectOutputStream objectStream = new ObjectOutputStream(
-                byteArrayStream);
-        objectStream.writeLong(fsp.getStart());
-        objectStream.writeLong(fsp.getLength());
-        objectStream.writeObject(fsp.getLocations());
+
+        return prepareFragmentMetadata(fsp.getStart(), fsp.getLength(), fsp.getLocations());
+
+    }
+
+    public static byte[] prepareFragmentMetadata(long start, long length, String[] locations)
+            throws IOException {
+
+        ByteArrayOutputStream byteArrayStream = writeBaseFragmentInfo(start, length, locations);
 
         return byteArrayStream.toByteArray();
+
+    }
+
+    private static ByteArrayOutputStream writeBaseFragmentInfo(long start, long length, String[] locations) throws IOException {
+        ByteArrayOutputStream byteArrayStream = new ByteArrayOutputStream();
+        ObjectOutputStream objectStream = new ObjectOutputStream(byteArrayStream);
+        objectStream.writeLong(start);
+        objectStream.writeLong(length);
+        objectStream.writeObject(locations);
+        return byteArrayStream;
+    }
+
+    public static byte[] prepareFragmentMetadata(long start,
--- End diff --

Thanks, deleted as unnecessary.
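
(For context: the bytes produced by `writeBaseFragmentInfo` follow a fixed layout, two longs followed by a serialized `String[]`. A minimal sketch of a matching read side is below; the class name is hypothetical and PXF's actual deserialization code may differ.)

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.ObjectInputStream;

    public class FragmentMetadataDecoder {

        // Reads the fields back in the exact order writeBaseFragmentInfo
        // wrote them: start offset, length, then the array of host locations.
        public static void decode(byte[] fragmentMetadata)
                throws IOException, ClassNotFoundException {
            try (ObjectInputStream in =
                         new ObjectInputStream(new ByteArrayInputStream(fragmentMetadata))) {
                long start = in.readLong();
                long length = in.readLong();
                String[] locations = (String[]) in.readObject();
                System.out.printf("start=%d, length=%d, hosts=%s%n",
                        start, length, String.join(",", locations));
            }
        }
    }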


---


[GitHub] incubator-hawq pull request #1326: HAWQ-1575. Implemented readable Parquet p...

2018-02-02 Thread sansanichfb
Github user sansanichfb commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1326#discussion_r165782793
  
--- Diff: pxf/pxf-hdfs/src/main/java/org/apache/hawq/pxf/plugins/hdfs/ParquetDataFragmenter.java ---
@@ -0,0 +1,103 @@
+package org.apache.hawq.pxf.plugins.hdfs;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hawq.pxf.api.Fragment;
+import org.apache.hawq.pxf.api.Fragmenter;
+import org.apache.hawq.pxf.api.utilities.InputData;
+import org.apache.hawq.pxf.plugins.hdfs.utilities.HdfsUtilities;
+import org.apache.parquet.format.converter.ParquetMetadataConverter;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.ParquetInputFormat;
+import org.apache.parquet.example.data.Group;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.schema.MessageType;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+public class ParquetDataFragmenter extends Fragmenter {
+    private Job job;
+
+    public ParquetDataFragmenter(InputData md) {
+        super(md);
+        JobConf jobConf = new JobConf(new Configuration(), ParquetDataFragmenter.class);
+        try {
+            job = Job.getInstance(jobConf);
+        } catch (IOException e) {
+            throw new RuntimeException("Unable to instantiate a job for reading fragments", e);
+        }
+    }
+
+
+    @Override
+    public List<Fragment> getFragments() throws Exception {
+        String absoluteDataPath = HdfsUtilities.absoluteDataPath(inputData.getDataSource());
+        ArrayList<InputSplit> splits = getSplits(new Path(absoluteDataPath));
--- End diff --

Thanks, updated.
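
(As background: the `ParquetFileReader.readFooter(...)` call in `getFragments` above is the standard parquet-hadoop way to fetch a file's schema without reading row groups. A self-contained sketch, with an illustrative class name and file path:)

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.parquet.format.converter.ParquetMetadataConverter;
    import org.apache.parquet.hadoop.ParquetFileReader;
    import org.apache.parquet.hadoop.metadata.ParquetMetadata;
    import org.apache.parquet.schema.MessageType;

    public class FooterSchemaExample {

        // NO_FILTER requests the full footer; skipping row-group filtering
        // is fine here because only the file-level schema is needed.
        public static MessageType readSchema(Configuration conf, Path file) throws Exception {
            ParquetMetadata metadata =
                    ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
            return metadata.getFileMetaData().getSchema();
        }

        public static void main(String[] args) throws Exception {
            MessageType schema = readSchema(new Configuration(), new Path("/data/example.parquet"));
            System.out.println(schema);
        }
    }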


---


[GitHub] incubator-hawq pull request #1326: HAWQ-1575. Implemented readable Parquet p...

2018-02-02 Thread sansanichfb
Github user sansanichfb commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1326#discussion_r165782764
  
--- Diff: pxf/pxf-service/src/scripts/pxf-env.sh ---
@@ -54,3 +54,5 @@ export HADOOP_DISTRO=${HADOOP_DISTRO}
 # Parent directory of Hadoop client installation (optional)
 # used in case of tarball-based installation when all clients are under a common parent directory
 export HADOOP_ROOT=${HADOOP_ROOT}
+
+export CATALINA_OPTS="-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005"
--- End diff --

Sure, deleted.


---