Repository: falcon Updated Branches: refs/heads/master 91f7c817c -> c1d37bfab
FALCON-2030 Enforce time partition in the storage path for feeds with import and export policy Enforce time partition in the storage path for feeds with import and export policy. For HCAT storage types, this restriction is already handled. Author: Venkatesan Ramachandran <[email protected]> Reviewers: "Ajay Yadava <[email protected]>,Ying Zheng <[email protected]>" Closes #214 from vramachan/FALCON-2030.EnforceTimePartition Project: http://git-wip-us.apache.org/repos/asf/falcon/repo Commit: http://git-wip-us.apache.org/repos/asf/falcon/commit/c1d37bfa Tree: http://git-wip-us.apache.org/repos/asf/falcon/tree/c1d37bfa Diff: http://git-wip-us.apache.org/repos/asf/falcon/diff/c1d37bfa Branch: refs/heads/master Commit: c1d37bfab3cae98a61cbe924786a0335e13135dc Parents: 91f7c81 Author: Venkatesan Ramachandran <[email protected]> Authored: Fri Jul 8 13:30:22 2016 -0700 Committer: bvellanki <[email protected]> Committed: Fri Jul 8 13:30:22 2016 -0700 ---------------------------------------------------------------------- .../falcon/entity/parser/FeedEntityParser.java | 14 ++++ .../entity/parser/FeedEntityParserTest.java | 18 +++++ .../resources/config/feed/feed-import-0.1.xml | 2 +- .../feed/feed-import-exclude-fields-0.1.xml | 4 +- .../config/feed/feed-import-invalid-0.1.xml | 4 +- .../feed-import-invalid-storage-path-0.1.xml | 73 ++++++++++++++++++++ .../feed/feed-import-no-timepartition-0.1.xml | 73 ++++++++++++++++++++ .../config/feed/feed-import-noargs-0.1.xml | 4 +- docs/src/site/twiki/EntitySpecification.twiki | 3 + docs/src/site/twiki/ImportExport.twiki | 3 + 10 files changed, 191 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/falcon/blob/c1d37bfa/common/src/main/java/org/apache/falcon/entity/parser/FeedEntityParser.java ---------------------------------------------------------------------- diff --git a/common/src/main/java/org/apache/falcon/entity/parser/FeedEntityParser.java 
b/common/src/main/java/org/apache/falcon/entity/parser/FeedEntityParser.java index 28fdaf8..6b72174 100644 --- a/common/src/main/java/org/apache/falcon/entity/parser/FeedEntityParser.java +++ b/common/src/main/java/org/apache/falcon/entity/parser/FeedEntityParser.java @@ -27,6 +27,7 @@ import org.apache.falcon.entity.EntityUtil; import org.apache.falcon.entity.FeedHelper; import org.apache.falcon.entity.FileSystemStorage; import org.apache.falcon.entity.Storage; +import org.apache.falcon.entity.common.FeedDataPath; import org.apache.falcon.entity.store.ConfigurationStore; import org.apache.falcon.entity.v0.Entity; import org.apache.falcon.entity.v0.EntityGraph; @@ -65,6 +66,7 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.TimeZone; +import java.util.regex.Matcher; /** * Parser that parses feed entity definition. @@ -637,9 +639,21 @@ public class FeedEntityParser extends EntityParser<Feed> { + "but it doesn't contain location type - data in cluster " + cluster.getName()); } + // storage location needs to have time partition if import or export is enabled. + if (FeedHelper.isImportEnabled(cluster) || FeedHelper.isExportEnabled(cluster)) { + if (!matchStoragePathPattern(dataLocation.getPath())) { + throw new ValidationException(String.format("Feed %s with Import/Export policy " + + "needs to have time partition in the storage location path", feed.getName())); + } + } } } + private boolean matchStoragePathPattern(String feedBasePath) { + Matcher matcher = FeedDataPath.PATTERN.matcher(feedBasePath); + return matcher.find(); + } + /** * Validate extraction and merge type combination. 
Currently supported combo: * http://git-wip-us.apache.org/repos/asf/falcon/blob/c1d37bfa/common/src/test/java/org/apache/falcon/entity/parser/FeedEntityParserTest.java ---------------------------------------------------------------------- diff --git a/common/src/test/java/org/apache/falcon/entity/parser/FeedEntityParserTest.java b/common/src/test/java/org/apache/falcon/entity/parser/FeedEntityParserTest.java index f9aad19..ced4fc5 100644 --- a/common/src/test/java/org/apache/falcon/entity/parser/FeedEntityParserTest.java +++ b/common/src/test/java/org/apache/falcon/entity/parser/FeedEntityParserTest.java @@ -1161,6 +1161,24 @@ public class FeedEntityParserTest extends AbstractTestBase { Assert.fail("ValidationException should have been thrown"); } + @Test (expectedExceptions = {ValidationException.class}) + public void testImportFeedWithNoTimePartition() throws Exception { + + InputStream feedStream = this.getClass() + .getResourceAsStream("/config/feed/feed-import-no-timepartition-0.1.xml"); + parser.parseAndValidate(feedStream); + Assert.fail("ValidationException should have been thrown"); + } + + @Test (expectedExceptions = {ValidationException.class}) + public void testImportFeedWithInvalidTimePartition() throws Exception { + + InputStream feedStream = this.getClass() + .getResourceAsStream("/config/feed/feed-import-invalid-storage-path-0.1.xml"); + parser.parseAndValidate(feedStream); + Assert.fail("ValidationException should have been thrown"); + } + public void testValidateEmailNotification() throws Exception { Feed feedNotification = (Feed) EntityType.FEED.getUnmarshaller().unmarshal( (FeedEntityParserTest.class.getResourceAsStream(FEED_XML))); http://git-wip-us.apache.org/repos/asf/falcon/blob/c1d37bfa/common/src/test/resources/config/feed/feed-import-0.1.xml ---------------------------------------------------------------------- diff --git a/common/src/test/resources/config/feed/feed-import-0.1.xml 
b/common/src/test/resources/config/feed/feed-import-0.1.xml index 69f7ede..20489c9 100644 --- a/common/src/test/resources/config/feed/feed-import-0.1.xml +++ b/common/src/test/resources/config/feed/feed-import-0.1.xml @@ -59,7 +59,7 @@ </clusters> <locations> - <location type="data" path="/projects/falcon/clicks"/> + <location type="data" path="/projects/falcon/clicks/${YEAR}-${MONTH}"/> <location type="stats" path="/projects/falcon/clicksStats"/> <location type="meta" path="/projects/falcon/clicksMetaData"/> </locations> http://git-wip-us.apache.org/repos/asf/falcon/blob/c1d37bfa/common/src/test/resources/config/feed/feed-import-exclude-fields-0.1.xml ---------------------------------------------------------------------- diff --git a/common/src/test/resources/config/feed/feed-import-exclude-fields-0.1.xml b/common/src/test/resources/config/feed/feed-import-exclude-fields-0.1.xml index 5a6fcd9..03518d9 100644 --- a/common/src/test/resources/config/feed/feed-import-exclude-fields-0.1.xml +++ b/common/src/test/resources/config/feed/feed-import-exclude-fields-0.1.xml @@ -56,7 +56,7 @@ </arguments> </import> <locations> - <location type="data" path="/projects/falcon/clicks"/> + <location type="data" path="/projects/falcon/clicks/${MONTH}/click1"/> <location type="stats" path="/projects/falcon/clicksStats"/> <location type="meta" path="/projects/falcon/clicksMetaData"/> </locations> @@ -64,7 +64,7 @@ </clusters> <locations> - <location type="data" path="/projects/falcon/clicks"/> + <location type="data" path="/projects/falcon/clicks/${MONTH}/click1"/> <location type="stats" path="/projects/falcon/clicksStats"/> <location type="meta" path="/projects/falcon/clicksMetaData"/> </locations> http://git-wip-us.apache.org/repos/asf/falcon/blob/c1d37bfa/common/src/test/resources/config/feed/feed-import-invalid-0.1.xml ---------------------------------------------------------------------- diff --git a/common/src/test/resources/config/feed/feed-import-invalid-0.1.xml 
b/common/src/test/resources/config/feed/feed-import-invalid-0.1.xml index 9428bce..900e0f0 100644 --- a/common/src/test/resources/config/feed/feed-import-invalid-0.1.xml +++ b/common/src/test/resources/config/feed/feed-import-invalid-0.1.xml @@ -55,7 +55,7 @@ </arguments> </import> <locations> - <location type="data" path="/projects/falcon/clicks"/> + <location type="data" path="/projects/falcon/clicks/${YEAR}"/> <location type="stats" path="/projects/falcon/clicksStats"/> <location type="meta" path="/projects/falcon/clicksMetaData"/> </locations> @@ -63,7 +63,7 @@ </clusters> <locations> - <location type="data" path="/projects/falcon/clicks"/> + <location type="data" path="/projects/falcon/clicks/${YEAR}"/> <location type="stats" path="/projects/falcon/clicksStats"/> <location type="meta" path="/projects/falcon/clicksMetaData"/> </locations> http://git-wip-us.apache.org/repos/asf/falcon/blob/c1d37bfa/common/src/test/resources/config/feed/feed-import-invalid-storage-path-0.1.xml ---------------------------------------------------------------------- diff --git a/common/src/test/resources/config/feed/feed-import-invalid-storage-path-0.1.xml b/common/src/test/resources/config/feed/feed-import-invalid-storage-path-0.1.xml new file mode 100644 index 0000000..d589bb9 --- /dev/null +++ b/common/src/test/resources/config/feed/feed-import-invalid-storage-path-0.1.xml @@ -0,0 +1,73 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + --> +<feed description="Customer data" name="CustomerFeed" xmlns="uri:falcon:feed:0.1"> + <tags>[email protected], [email protected], _department_type=forecasting</tags> + <partitions> + <partition name="fraud"/> + <partition name="good"/> + </partitions> + + <groups>online,bi</groups> + <availabilityFlag>_SUCCESS</availabilityFlag> + + <frequency>hours(1)</frequency> + <sla slaLow="hours(2)" slaHigh="hours(3)"/> + <timezone>UTC</timezone> + + <late-arrival cut-off="hours(6)"/> + + <clusters> + <cluster name="testCluster" type="source"> + <validity start="2011-11-01T00:00Z" end="2011-12-31T00:00Z"/> + <retention limit="hours(48)" action="delete"/> + <!-- Limit can be in Time or Instances 100, Action ENUM DELETE,ARCHIVE --> + <sla slaLow="hours(3)" slaHigh="hours(4)"/> + <import> + <source name="test-hsql-db" tableName="customer"> + <extract type="full"> + <mergepolicy>snapshot</mergepolicy> + </extract> + <fields> + <includes> + <field>id</field> + <field>name</field> + </includes> + </fields> + </source> + <arguments> + <argument name="--num-mappers" value="2"/> + </arguments> + </import> + <locations> + <location type="data" path="/projects/falcon/clicks/${INVALID}"/> + <location type="stats" path="/projects/falcon/clicksStats"/> + <location type="meta" path="/projects/falcon/clicksMetaData"/> + </locations> + </cluster> + </clusters> + + <locations> + <location type="data" path="/projects/falcon/clicks/${INVALID}"/> + <location type="stats" path="/projects/falcon/clicksStats"/> + <location type="meta" path="/projects/falcon/clicksMetaData"/> + </locations> + + 
<ACL owner="testuser" group="group" permission="0x755"/> + <schema location="/schema/clicks" provider="protobuf"/> +</feed> http://git-wip-us.apache.org/repos/asf/falcon/blob/c1d37bfa/common/src/test/resources/config/feed/feed-import-no-timepartition-0.1.xml ---------------------------------------------------------------------- diff --git a/common/src/test/resources/config/feed/feed-import-no-timepartition-0.1.xml b/common/src/test/resources/config/feed/feed-import-no-timepartition-0.1.xml new file mode 100644 index 0000000..9428bce --- /dev/null +++ b/common/src/test/resources/config/feed/feed-import-no-timepartition-0.1.xml @@ -0,0 +1,73 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ --> +<feed description="Customer data" name="CustomerFeed" xmlns="uri:falcon:feed:0.1"> + <tags>[email protected], [email protected], _department_type=forecasting</tags> + <partitions> + <partition name="fraud"/> + <partition name="good"/> + </partitions> + + <groups>online,bi</groups> + <availabilityFlag>_SUCCESS</availabilityFlag> + + <frequency>hours(1)</frequency> + <sla slaLow="hours(2)" slaHigh="hours(3)"/> + <timezone>UTC</timezone> + + <late-arrival cut-off="hours(6)"/> + + <clusters> + <cluster name="testCluster" type="source"> + <validity start="2011-11-01T00:00Z" end="2011-12-31T00:00Z"/> + <retention limit="hours(48)" action="delete"/> + <!-- Limit can be in Time or Instances 100, Action ENUM DELETE,ARCHIVE --> + <sla slaLow="hours(3)" slaHigh="hours(4)"/> + <import> + <source name="test-hsql-db" tableName="customer"> + <extract type="full"> + <mergepolicy>snapshot</mergepolicy> + </extract> + <fields> + <includes> + <field>id</field> + <field>name</field> + </includes> + </fields> + </source> + <arguments> + <argument name="--num-mappers" value="2"/> + </arguments> + </import> + <locations> + <location type="data" path="/projects/falcon/clicks"/> + <location type="stats" path="/projects/falcon/clicksStats"/> + <location type="meta" path="/projects/falcon/clicksMetaData"/> + </locations> + </cluster> + </clusters> + + <locations> + <location type="data" path="/projects/falcon/clicks"/> + <location type="stats" path="/projects/falcon/clicksStats"/> + <location type="meta" path="/projects/falcon/clicksMetaData"/> + </locations> + + <ACL owner="testuser" group="group" permission="0x755"/> + <schema location="/schema/clicks" provider="protobuf"/> +</feed> http://git-wip-us.apache.org/repos/asf/falcon/blob/c1d37bfa/common/src/test/resources/config/feed/feed-import-noargs-0.1.xml ---------------------------------------------------------------------- diff --git a/common/src/test/resources/config/feed/feed-import-noargs-0.1.xml 
b/common/src/test/resources/config/feed/feed-import-noargs-0.1.xml index c96249c..2a36283 100644 --- a/common/src/test/resources/config/feed/feed-import-noargs-0.1.xml +++ b/common/src/test/resources/config/feed/feed-import-noargs-0.1.xml @@ -46,7 +46,7 @@ </source> </import> <locations> - <location type="data" path="/projects/falcon/clicks"/> + <location type="data" path="/projects/falcon/clicks/${YEAR}"/> <location type="stats" path="/projects/falcon/clicksStats"/> <location type="meta" path="/projects/falcon/clicksMetaData"/> </locations> @@ -54,7 +54,7 @@ </clusters> <locations> - <location type="data" path="/projects/falcon/clicks"/> + <location type="data" path="/projects/falcon/clicks/${YEAR}"/> <location type="stats" path="/projects/falcon/clicksStats"/> <location type="meta" path="/projects/falcon/clicksMetaData"/> </locations> http://git-wip-us.apache.org/repos/asf/falcon/blob/c1d37bfa/docs/src/site/twiki/EntitySpecification.twiki ---------------------------------------------------------------------- diff --git a/docs/src/site/twiki/EntitySpecification.twiki b/docs/src/site/twiki/EntitySpecification.twiki index 9f9e210..faad305 100644 --- a/docs/src/site/twiki/EntitySpecification.twiki +++ b/docs/src/site/twiki/EntitySpecification.twiki @@ -334,6 +334,9 @@ The snapshot layout creates a snapshot of the data on HDFS using the feed's loca to specify the projection columns. Feed import from database underneath uses sqoop to achieve the task. Any advanced Sqoop options can be specified via the arguments. +The feed's data storage location should include some combination of timepartition if import policy is associated with it. +Please see ImportExport documentation for more details. 
+ ---+++ Late Arrival <verbatim> http://git-wip-us.apache.org/repos/asf/falcon/blob/c1d37bfa/docs/src/site/twiki/ImportExport.twiki ---------------------------------------------------------------------- diff --git a/docs/src/site/twiki/ImportExport.twiki b/docs/src/site/twiki/ImportExport.twiki index b0ce7ff..2fcb42b 100644 --- a/docs/src/site/twiki/ImportExport.twiki +++ b/docs/src/site/twiki/ImportExport.twiki @@ -108,6 +108,9 @@ where {lib-dir} value varies in oozie deployments. since the frequency of the Feed is hour(1) and the Feed instances are deleted after 90 days because of the retention policy. + The feed's data location should have some combination of time partitions (like ${YEAR}, ${MONTH}, ${DAY}, + ${HOUR}, ${MINUTE} etc) if import or export policy is associated. + <verbatim>
