[GitHub] incubator-hawq pull request #1326: HAWQ-1575. Implemented readable Parquet p...
Github user sansanichfb commented on a diff in the pull request: https://github.com/apache/incubator-hawq/pull/1326#discussion_r165799526 --- Diff: pxf/pxf-hdfs/src/main/java/org/apache/hawq/pxf/plugins/hdfs/ParquetDataFragmenter.java --- @@ -0,0 +1,103 @@ +package org.apache.hawq.pxf.plugins.hdfs; + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hawq.pxf.api.Fragment; +import org.apache.hawq.pxf.api.Fragmenter; +import org.apache.hawq.pxf.api.utilities.InputData; +import org.apache.hawq.pxf.plugins.hdfs.utilities.HdfsUtilities; +import org.apache.parquet.format.converter.ParquetMetadataConverter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.ParquetInputFormat; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.MessageType; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +public class ParquetDataFragmenter extends Fragmenter { +private Job job; + +public ParquetDataFragmenter(InputData md) { +super(md); +JobConf jobConf = new JobConf(new Configuration(), ParquetDataFragmenter.class); +try { +job = Job.getInstance(jobConf); +} catch (IOException e) { +throw new RuntimeException("Unable to instantiate a job for reading fragments", e); +} +} + + +@Override +public List getFragments() throws Exception { +String absoluteDataPath = HdfsUtilities.absoluteDataPath(inputData.getDataSource()); +ArrayList splits = getSplits(new Path(absoluteDataPath)); + +for (InputSplit split : splits) { +FileSplit fsp = (FileSplit) split; + +String filepath = fsp.getPath().toUri().getPath(); +String[] hosts = fsp.getLocations(); + +Path file = new Path(filepath); + +ParquetMetadata metadata = ParquetFileReader.readFooter( +job.getConfiguration(), file, ParquetMetadataConverter.NO_FILTER); +MessageType schema = metadata.getFileMetaData().getSchema(); + +byte[] fragmentMetadata = HdfsUtilities.prepareFragmentMetadata(fsp.getStart(), fsp.getLength(), fsp.getLocations()); --- 
End diff -- This method is needed to support the `org.apache.hadoop.mapreduce.lib.input.FileSplit` type. ---
[GitHub] incubator-hawq pull request #1326: HAWQ-1575. Implemented readable Parquet p...
Github user sansanichfb commented on a diff in the pull request: https://github.com/apache/incubator-hawq/pull/1326#discussion_r165783402 --- Diff: pxf/pxf-hdfs/src/main/java/org/apache/hawq/pxf/plugins/hdfs/utilities/HdfsUtilities.java --- @@ -151,18 +153,42 @@ public static boolean isThreadSafe(String dataDir, String compCodec) { * @param fsp file split to be serialized * @return byte serialization of fsp * @throws IOException if I/O errors occur while writing to the underlying - * stream + * stream */ public static byte[] prepareFragmentMetadata(FileSplit fsp) throws IOException { -ByteArrayOutputStream byteArrayStream = new ByteArrayOutputStream(); -ObjectOutputStream objectStream = new ObjectOutputStream( -byteArrayStream); -objectStream.writeLong(fsp.getStart()); -objectStream.writeLong(fsp.getLength()); -objectStream.writeObject(fsp.getLocations()); + +return prepareFragmentMetadata(fsp.getStart(), fsp.getLength(), fsp.getLocations()); + +} + +public static byte[] prepareFragmentMetadata(long start, long length, String[] locations) +throws IOException { + +ByteArrayOutputStream byteArrayStream = writeBaseFragmentInfo(start, length, locations); return byteArrayStream.toByteArray(); + +} + +private static ByteArrayOutputStream writeBaseFragmentInfo(long start, long length, String[] locations) throws IOException { +ByteArrayOutputStream byteArrayStream = new ByteArrayOutputStream(); +ObjectOutputStream objectStream = new ObjectOutputStream(byteArrayStream); +objectStream.writeLong(start); +objectStream.writeLong(length); +objectStream.writeObject(locations); +return byteArrayStream; +} + +public static byte[] prepareFragmentMetadata(long start, --- End diff -- Thanks, deleted as unnecessary ---
[GitHub] incubator-hawq pull request #1326: HAWQ-1575. Implemented readable Parquet p...
Github user sansanichfb commented on a diff in the pull request: https://github.com/apache/incubator-hawq/pull/1326#discussion_r165782793 --- Diff: pxf/pxf-hdfs/src/main/java/org/apache/hawq/pxf/plugins/hdfs/ParquetDataFragmenter.java --- @@ -0,0 +1,103 @@ +package org.apache.hawq.pxf.plugins.hdfs; + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hawq.pxf.api.Fragment; +import org.apache.hawq.pxf.api.Fragmenter; +import org.apache.hawq.pxf.api.utilities.InputData; +import org.apache.hawq.pxf.plugins.hdfs.utilities.HdfsUtilities; +import org.apache.parquet.format.converter.ParquetMetadataConverter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.ParquetInputFormat; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.MessageType; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +public class ParquetDataFragmenter extends Fragmenter { +private Job job; + +public ParquetDataFragmenter(InputData md) { +super(md); +JobConf jobConf = new JobConf(new Configuration(), ParquetDataFragmenter.class); +try { +job = Job.getInstance(jobConf); +} catch (IOException e) { +throw new RuntimeException("Unable to instantiate a job for reading fragments", e); +} +} + + +@Override +public List getFragments() throws Exception { +String absoluteDataPath = HdfsUtilities.absoluteDataPath(inputData.getDataSource()); +ArrayList splits = getSplits(new Path(absoluteDataPath)); --- End diff -- Thanks, updated ---
[GitHub] incubator-hawq pull request #1326: HAWQ-1575. Implemented readable Parquet p...
Github user sansanichfb commented on a diff in the pull request: https://github.com/apache/incubator-hawq/pull/1326#discussion_r165782764 --- Diff: pxf/pxf-service/src/scripts/pxf-env.sh --- @@ -54,3 +54,5 @@ export HADOOP_DISTRO=${HADOOP_DISTRO} # Parent directory of Hadoop client installation (optional) # used in case of tarball-based installation when all clients are under a common parent directory export HADOOP_ROOT=${HADOOP_ROOT} + +export CATALINA_OPTS="-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005" --- End diff -- Sure, deleted. ---