This is an automated email from the ASF dual-hosted git repository.
stevel pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git
The following commit(s) were added to refs/heads/trunk by this push:
new 819159f HDFS-14788. Use dynamic regex filter to ignore copy of source
files in Distcp.
819159f is described below
commit 819159fa060897bcf7c9ae09bf4b2fc97292f92b
Author: Mukund Thakur <[email protected]>
AuthorDate: Mon Jan 6 19:09:07 2020 +0000
HDFS-14788. Use dynamic regex filter to ignore copy of source files in
Distcp.
Contributed by Mukund Thakur.
Change-Id: I781387ddce95ee300c12a160dc9a0f7d602403c3
---
.../java/org/apache/hadoop/tools/CopyFilter.java | 31 +++++++
.../org/apache/hadoop/tools/DistCpConstants.java | 14 ++++
.../hadoop/tools/RegexpInConfigurationFilter.java | 72 ++++++++++++++++
.../hadoop-distcp/src/site/markdown/DistCp.md.vm | 17 ++++
.../org/apache/hadoop/tools/TestCopyFilter.java | 97 ++++++++++++++++++++++
.../tools/TestRegexpInConfigurationFilter.java | 55 ++++++++++++
6 files changed, 286 insertions(+)
diff --git
a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/CopyFilter.java
b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/CopyFilter.java
index 4b348a5..f5f00f1 100644
---
a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/CopyFilter.java
+++
b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/CopyFilter.java
@@ -17,6 +17,11 @@
*/
package org.apache.hadoop.tools;
+import java.lang.reflect.Constructor;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
@@ -26,6 +31,8 @@ import org.apache.hadoop.fs.Path;
*/
public abstract class CopyFilter {
+ private static final Logger LOG = LoggerFactory.getLogger(CopyFilter.class);
+
/**
* Default initialize method does nothing.
*/
@@ -47,6 +54,30 @@ public abstract class CopyFilter {
* @return An instance of the appropriate CopyFilter
*/
public static CopyFilter getCopyFilter(Configuration conf) {
+ String filtersClassName = conf
+ .get(DistCpConstants.CONF_LABEL_FILTERS_CLASS);
+ if (filtersClassName != null) {
+ try {
+ Class<? extends CopyFilter> filtersClass = conf
+ .getClassByName(filtersClassName)
+ .asSubclass(CopyFilter.class);
+ filtersClassName = filtersClass.getName();
+ Constructor<? extends CopyFilter> constructor = filtersClass
+ .getDeclaredConstructor(Configuration.class);
+ return constructor.newInstance(conf);
+ } catch (Exception e) {
+ LOG.error(DistCpConstants.CLASS_INSTANTIATION_ERROR_MSG +
+ filtersClassName, e);
+ throw new RuntimeException(
+ DistCpConstants.CLASS_INSTANTIATION_ERROR_MSG +
+ filtersClassName, e);
+ }
+ } else {
+ return getDefaultCopyFilter(conf);
+ }
+ }
+
+ private static CopyFilter getDefaultCopyFilter(Configuration conf) {
String filtersFilename = conf.get(DistCpConstants.CONF_LABEL_FILTERS_FILE);
if (filtersFilename == null) {
diff --git
a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java
b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java
index f0adc78..2581568 100644
---
a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java
+++
b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java
@@ -120,6 +120,17 @@ public final class DistCpConstants {
/* DistCp CopyListing class override param */
public static final String CONF_LABEL_COPY_LISTING_CLASS =
"distcp.copy.listing.class";
+ /**
+ * DistCp Filter class override param.
+ */
+ public static final String CONF_LABEL_FILTERS_CLASS = "distcp.filters.class";
+
+  /**
+   * DistCp regex for source files to exclude from the copy.
+   */
+ public static final String DISTCP_EXCLUDE_FILE_REGEX =
+ "distcp.exclude-file-regex";
+
/* DistCp Copy Buffer Size */
public static final String CONF_LABEL_COPY_BUFFER_SIZE =
"distcp.copy.buffer.size";
@@ -177,4 +188,7 @@ public final class DistCpConstants {
public static final String CHECKSUM_MISMATCH_ERROR_MSG =
"Checksum mismatch between ";
+
+ public static final String CLASS_INSTANTIATION_ERROR_MSG =
+ "Unable to instantiate ";
}
diff --git
a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/RegexpInConfigurationFilter.java
b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/RegexpInConfigurationFilter.java
new file mode 100644
index 0000000..4bf62e2
--- /dev/null
+++
b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/RegexpInConfigurationFilter.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
+
+/**
+ * Implementation of regex based filter for DistCp, where the regex is
+ * read from the job {@link Configuration} rather than from a file.
+ * {@link DistCpConstants#CONF_LABEL_FILTERS_CLASS} needs to be set
+ * in {@link Configuration} when launching a distcp job.
+ */
+public class RegexpInConfigurationFilter extends CopyFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(RegexpInConfigurationFilter.class);
+
+  /**
+   * Regex which can be used to filter source files.
+   * {@link DistCpConstants#DISTCP_EXCLUDE_FILE_REGEX} can be set
+   * in {@link Configuration} when launching a DistCp job.
+   * If not set no files will be filtered.
+   */
+  private final String excludeFileRegex;
+
+  /** Compiled exclusion patterns; empty when no regex is configured. */
+  private final List<Pattern> filters = new ArrayList<>();
+
+  /**
+   * Build the filter from the exclusion regex held in the configuration.
+   *
+   * @param conf configuration to read
+   *             {@link DistCpConstants#DISTCP_EXCLUDE_FILE_REGEX} from.
+   * @throws java.util.regex.PatternSyntaxException if the configured
+   *             regex cannot be compiled.
+   */
+  protected RegexpInConfigurationFilter(Configuration conf) {
+    excludeFileRegex = conf
+        .getTrimmed(DistCpConstants.DISTCP_EXCLUDE_FILE_REGEX, "");
+    // An unset/empty option leaves the pattern list empty: nothing excluded.
+    if (!excludeFileRegex.isEmpty()) {
+      filters.add(Pattern.compile(excludeFileRegex));
+    }
+  }
+
+  /**
+   * Decide whether a path should be copied.
+   * A path is skipped only when the regex matches its entire string form
+   * ({@link java.util.regex.Matcher#matches()}), not merely a part of it.
+   *
+   * @param path path to check.
+   * @return false if the path matches an exclusion pattern, true otherwise.
+   */
+  @Override
+  public boolean shouldCopy(Path path) {
+    // Convert once; the same string serves every pattern and the log line.
+    final String pathString = path.toString();
+    for (Pattern filter : filters) {
+      if (filter.matcher(pathString).matches()) {
+        LOG.debug("Skipping {} as it matches the filter regex", pathString);
+        return false;
+      }
+    }
+    return true;
+  }
+}
diff --git a/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm
b/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm
index a5c4011..bf5b891 100644
--- a/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm
+++ b/hadoop-tools/hadoop-distcp/src/site/markdown/DistCp.md.vm
@@ -440,6 +440,23 @@ $H3 Copy-listing Generator
of DistCp differs here from the legacy DistCp, in how paths are considered
for copy.
+  One may also customize the filtering of files which shouldn't be copied
+  by selecting one of the currently supported implementations of the
+  CopyFilter class, or by writing a new implementation. The implementation
+  is specified by setting `distcp.filters.class` in the DistCpOptions:
+
+  1. `distcp.filters.class` to "RegexCopyFilter". If you are using this
+  implementation, you will also have to pass along `distcp.filters.file`,
+  which contains the regex used for filtering. Supports regular
+  expressions specified by java.util.regex.Pattern.
+  2. `distcp.filters.class` to "RegexpInConfigurationFilter"
+  (`org.apache.hadoop.tools.RegexpInConfigurationFilter`). If you are using
+  this implementation, you will also have to pass along the regex using the
+  `distcp.exclude-file-regex` parameter in "DistCpOptions". Supports regular
+  expressions specified by java.util.regex.Pattern. This is a more dynamic
+  approach as compared to "RegexCopyFilter".
+  3. `distcp.filters.class` to "TrueCopyFilter". This is used as a default
+  implementation if none of the above options are specified.
+
+
The legacy implementation only lists those paths that must definitely be
copied on to target. E.g. if a file already exists at the target (and
`-overwrite` isn't specified), the file isn't even considered in the
diff --git
a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyFilter.java
b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyFilter.java
new file mode 100644
index 0000000..4d36f38
--- /dev/null
+++
b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyFilter.java
@@ -0,0 +1,97 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.junit.Test;
+
+import org.apache.hadoop.conf.Configuration;
+
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test {@link CopyFilter}: which implementation
+ * {@link CopyFilter#getCopyFilter(Configuration)} returns, and how it
+ * fails, for different configurations.
+ */
+public class TestCopyFilter {
+
+  /** With no filter options set, a TrueCopyFilter is returned. */
+  @Test
+  public void testGetCopyFilterTrueCopyFilter() {
+    Configuration configuration = new Configuration(false);
+    CopyFilter copyFilter = CopyFilter.getCopyFilter(configuration);
+    assertTrue("copyFilter should be instance of TrueCopyFilter",
+        copyFilter instanceof TrueCopyFilter);
+  }
+
+  /** With only a filters file set, a RegexCopyFilter is returned. */
+  @Test
+  public void testGetCopyFilterRegexCopyFilter() {
+    Configuration configuration = new Configuration(false);
+    configuration.set(DistCpConstants.CONF_LABEL_FILTERS_FILE, "random");
+    CopyFilter copyFilter = CopyFilter.getCopyFilter(configuration);
+    assertTrue("copyFilter should be instance of RegexCopyFilter",
+        copyFilter instanceof RegexCopyFilter);
+  }
+
+  /** An explicitly configured filter class is instantiated. */
+  @Test
+  public void testGetCopyFilterRegexpInConfigurationFilter() {
+    final String filterName =
+        "org.apache.hadoop.tools.RegexpInConfigurationFilter";
+    Configuration configuration = new Configuration(false);
+    configuration.set(DistCpConstants.CONF_LABEL_FILTERS_CLASS, filterName);
+    CopyFilter copyFilter = CopyFilter.getCopyFilter(configuration);
+    assertTrue("copyFilter should be instance of RegexpInConfigurationFilter",
+        copyFilter instanceof RegexpInConfigurationFilter);
+  }
+
+  /** A class name which cannot be loaded is reported as a failure. */
+  @Test
+  public void testGetCopyFilterNonExistingClass() throws Exception {
+    final String filterName =
+        "org.apache.hadoop.tools.RegexpInConfigurationWrongFilter";
+    Configuration configuration = new Configuration(false);
+    configuration.set(DistCpConstants.CONF_LABEL_FILTERS_CLASS, filterName);
+    intercept(RuntimeException.class,
+        DistCpConstants.CLASS_INSTANTIATION_ERROR_MSG + filterName,
+        () -> CopyFilter.getCopyFilter(configuration));
+  }
+
+  /** A loadable class which is not a CopyFilter is reported as a failure. */
+  @Test
+  public void testGetCopyFilterWrongClassType() throws Exception {
+    // Class.getName() gives the binary name (Outer$Nested), the form needed
+    // to load a nested class by name. The dotted source-style name is not
+    // loadable, so it would only repeat the non-existing-class case and
+    // never reach the type check.
+    final String filterName = FilterNotExtendingCopyFilter.class.getName();
+    Configuration configuration = new Configuration(false);
+    configuration.set(DistCpConstants.CONF_LABEL_FILTERS_CLASS, filterName);
+    intercept(RuntimeException.class,
+        DistCpConstants.CLASS_INSTANTIATION_ERROR_MSG + filterName,
+        () -> CopyFilter.getCopyFilter(configuration));
+  }
+
+  /** An empty class name is reported as a failure. */
+  @Test
+  public void testGetCopyFilterEmptyString() throws Exception {
+    final String filterName = "";
+    Configuration configuration = new Configuration(false);
+    configuration.set(DistCpConstants.CONF_LABEL_FILTERS_CLASS, filterName);
+    intercept(RuntimeException.class,
+        DistCpConstants.CLASS_INSTANTIATION_ERROR_MSG + filterName,
+        () -> CopyFilter.getCopyFilter(configuration));
+  }
+
+  /**
+   * Deliberately unrelated to {@link CopyFilter}; only ever referenced by
+   * name. Static, as it needs no enclosing test instance.
+   */
+  private static final class FilterNotExtendingCopyFilter {
+
+  }
+}
diff --git
a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestRegexpInConfigurationFilter.java
b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestRegexpInConfigurationFilter.java
new file mode 100644
index 0000000..8cf0620
--- /dev/null
+++
b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestRegexpInConfigurationFilter.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools;
+
+import org.junit.Test;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test {@link RegexpInConfigurationFilter}.
+ */
+public class TestRegexpInConfigurationFilter {
+
+  /**
+   * Paths matching the configured exclusion regex are rejected,
+   * all other paths are accepted.
+   */
+  @Test
+  public void testShouldCopy() {
+    final String excludeRegex =
+        "\\/.*_COPYING_$|\\/.*_COPYING$|^.*\\/\\.[^\\/]*$|" +
+        "\\/_temporary$|\\/\\_temporary\\/|.*\\/\\.Trash\\/.*";
+    Configuration conf = new Configuration(false);
+    conf.set(DistCpConstants.DISTCP_EXCLUDE_FILE_REGEX, excludeRegex);
+    RegexpInConfigurationFilter filter =
+        new RegexpInConfigurationFilter(conf);
+
+    // An ordinary path is copied.
+    Path plain = new Path("/user/bar");
+    assertTrue(plain + " should be copied", filter.shouldCopy(plain));
+
+    // Paths ending in the in-progress copy markers are excluded.
+    Path copying = new Path("/user/bar/_COPYING");
+    assertFalse(copying + " shouldn't be copied",
+        filter.shouldCopy(copying));
+
+    Path copyingUnderscore = new Path("/user/bar/_COPYING_");
+    assertFalse(copyingUnderscore + " shouldn't be copied",
+        filter.shouldCopy(copyingUnderscore));
+
+    // A path matching none of the alternatives is copied.
+    Path temp = new Path("/temp/");
+    assertTrue(temp + " should be copied", filter.shouldCopy(temp));
+  }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]