This is an automated email from the ASF dual-hosted git repository. mergebot-role pushed a commit to branch mergebot in repository https://gitbox.apache.org/repos/asf/beam-site.git
commit f3ce7d81544f8e96a50c48cf6e489c99f6b36ee7 Author: timrobertson100 <[email protected]> AuthorDate: Fri Jul 27 09:47:16 2018 +0200 [BEAM-4260] Document HCatalogIO use with Hive 1.1 --- src/documentation/io/built-in-hcatalog.md | 147 ++++++++++++++++++++++++++++++ src/documentation/io/built-in.md | 2 +- 2 files changed, 148 insertions(+), 1 deletion(-) diff --git a/src/documentation/io/built-in-hcatalog.md b/src/documentation/io/built-in-hcatalog.md new file mode 100644 index 0000000..88e1008 --- /dev/null +++ b/src/documentation/io/built-in-hcatalog.md @@ -0,0 +1,147 @@ +--- +layout: section +title: "Apache HCatalog InputFormat IO" +section_menu: section-menu/documentation.html +permalink: /documentation/io/built-in/hcatalog/ +--- + +[Pipeline I/O Table of Contents]({{site.baseurl}}/documentation/io/io-toc/) + +# HCatalog IO + +An `HCatalogIO` is a transform for reading data from and writing data to an HCatalog managed source. + +### Reading using HCatalogIO + +To configure an HCatalog source, you must specify a metastore URI and a table name. Other optional parameters are database and filter. + +For example: +```java +Map<String, String> configProperties = new HashMap<String, String>(); +configProperties.put("hive.metastore.uris","thrift://metastore-host:port"); +pipeline + .apply(HCatalogIO.read() + .withConfigProperties(configProperties) + .withDatabase("default") //optional, assumes default if none specified + .withTable("employee") + .withFilter(filterString)) //optional, may be specified if the table is partitioned +``` +```py + # The Beam SDK for Python does not support HCatalogIO. +``` + +### Writing using HCatalogIO + +To configure an `HCatalog` sink, you must specify a metastore URI and a table name. Other +optional parameters are database, partition and batch size. +The destination table should exist beforehand as the transform will not create a new table if missing. 
+ +For example: +```java +Map<String, String> configProperties = new HashMap<String, String>(); +configProperties.put("hive.metastore.uris","thrift://metastore-host:port"); + +pipeline + .apply(...) + .apply(HCatalogIO.write() + .withConfigProperties(configProperties) + .withDatabase("default") //optional, assumes default if none specified + .withTable("employee") + .withPartition(partitionValues) //optional, may be specified if the table is partitioned + .withBatchSize(1024L)) //optional, assumes a default batch size of 1024 if none specified +``` +```py + # The Beam SDK for Python does not support HCatalogIO. +``` + +### Using older versions of HCatalog (1.x) + +`HCatalogIO` is built for Apache HCatalog versions 2 and up and will not work out of the box for older versions of HCatalog. +The following illustrates a workaround to work with Hive 1.1. + +Include the following Hive 1.2 jars in the über jar you build. +The 1.2 jars provide the necessary methods for Beam while remaining compatible with Hive 1.1. 
+ +``` +<dependency> + <groupId>org.apache.beam</groupId> + <artifactId>beam-sdks-java-io-hcatalog</artifactId> + <version>${beam.version}</version> +</dependency> +<dependency> + <groupId>org.apache.hive.hcatalog</groupId> + <artifactId>hive-hcatalog-core</artifactId> + <version>1.2</version> +</dependency> +<dependency> + <groupId>org.apache.hive</groupId> + <artifactId>hive-metastore</artifactId> + <version>1.2</version> +</dependency> +<dependency> + <groupId>org.apache.hive</groupId> + <artifactId>hive-exec</artifactId> + <version>1.2</version> +</dependency> +<dependency> + <groupId>org.apache.hive</groupId> + <artifactId>hive-common</artifactId> + <version>1.2</version> +</dependency> +``` + +Relocate _only_ the following hive packages: + +``` +<plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-shade-plugin</artifactId> + <version>${maven-shade-plugin.version}</version> + <configuration> + <createDependencyReducedPom>false</createDependencyReducedPom> + <filters> + <filter> + <artifact>*:*</artifact> + <excludes> + <exclude>META-INF/*.SF</exclude> + <exclude>META-INF/*.DSA</exclude> + <exclude>META-INF/*.RSA</exclude> + </excludes> + </filter> + </filters> + </configuration> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + <configuration> + <shadedArtifactAttached>true</shadedArtifactAttached> + <shadedClassifierName>shaded</shadedClassifierName> + <transformers> + <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/> + </transformers> + <relocations> + <!-- Important: Do not relocate org.apache.hadoop.hive --> + <relocation> + <pattern>org.apache.hadoop.hive.conf</pattern> + <shadedPattern>h12.org.apache.hadoop.hive.conf</shadedPattern> + </relocation> + <relocation> + <pattern>org.apache.hadoop.hive.ql</pattern> + <shadedPattern>h12.org.apache.hadoop.hive.ql</shadedPattern> + </relocation> + <relocation> + 
<pattern>org.apache.hadoop.hive.metastore</pattern> + <shadedPattern>h12.org.apache.hadoop.hive.metastore</shadedPattern> + </relocation> + </relocations> + </configuration> + </execution> + </executions> +</plugin> +``` + +This has been tested to read SequenceFile and ORCFile file backed tables running with +Beam 2.4.0 on Spark 2.3 / YARN in a Cloudera CDH 5.12.2 managed environment. \ No newline at end of file diff --git a/src/documentation/io/built-in.md b/src/documentation/io/built-in.md index 1861284..0cda2b6 100644 --- a/src/documentation/io/built-in.md +++ b/src/documentation/io/built-in.md @@ -58,7 +58,7 @@ Consult the [Programming Guide I/O section]({{site.baseurl }}/documentation/prog <p><a href="https://github.com/apache/beam/tree/master/sdks/java/io/cassandra">Apache Cassandra</a></p> <p><a href="{{site.baseurl}}/documentation/io/built-in/hadoop/">Apache Hadoop InputFormat</a></p> <p><a href="https://github.com/apache/beam/tree/master/sdks/java/io/hbase">Apache HBase</a></p> - <p><a href="https://github.com/apache/beam/tree/master/sdks/java/io/hcatalog">Apache Hive (HCatalog)</a></p> + <p><a href="{{site.baseurl}}/documentation/io/built-in/hcatalog">Apache Hive (HCatalog)</a></p> <p><a href="https://github.com/apache/beam/tree/master/sdks/java/io/kudu">Apache Kudu</a></p> <p><a href="https://github.com/apache/beam/tree/master/sdks/java/io/solr">Apache Solr</a></p> <p><a href="https://github.com/apache/beam/tree/master/sdks/java/io/elasticsearch">Elasticsearch (v2.x and v5.x)</a></p>
