rmetzger commented on a change in pull request #13796:
URL: https://github.com/apache/flink/pull/13796#discussion_r515952694
##########
 File path: flink-connectors/flink-connector-hive/src/main/resources/META-INF/NOTICE
##########
@@ -7,7 +7,6 @@ The Apache Software Foundation (http://www.apache.org/).
 
 This project bundles the following dependencies under the Apache Software License 2.0. (http://www.apache.org/licenses/LICENSE-2.0.txt)
 - org.apache.parquet:parquet-hadoop:1.11.1
-- org.apache.parquet:parquet-format:2.4.0

Review comment:
   Well, we are deploying all those flink-sql-connector-hive-{1.2.2,2.2.0,2.3.6,3.1.2} versions, but they have their own NOTICE files. The shade plugin output for `flink-connector-hive` doesn't include this dependency, and we are not shading `flink-connector-hive` multiple times for different versions.

##########
 File path: tools/azure-pipelines/jobs-template.yml
##########
@@ -229,7 +229,7 @@ jobs:
       IT_CASE_S3_BUCKET: $(SECRET_S3_BUCKET)
       IT_CASE_S3_ACCESS_KEY: $(SECRET_S3_ACCESS_KEY)
       IT_CASE_S3_SECRET_KEY: $(SECRET_S3_SECRET_KEY)
-    condition: not(eq(variables['SKIP'], '1'))
+    condition: and(succeeded(),not(eq(variables['SKIP'], '1')))

Review comment:
   I will put it into a separate hotfix.

##########
 File path: flink-connectors/flink-sql-connector-hbase-1.4/pom.xml
##########
@@ -87,6 +87,7 @@ under the License.
         <exclude>org.apache.hbase:hbase-metrics*</exclude>
         <exclude>org.apache.hbase:hbase-server*</exclude>
         <exclude>org.apache.hbase:hbase-hadoop*-compat</exclude>
+        <exclude>org.apache.hbase:hbase-common:jar:tests</exclude>

Review comment:
   ```
   [INFO] --- maven-shade-plugin:3.1.1:shade (shade-flink) @ flink-sql-connector-hbase-1.4_2.11 ---
   [INFO] Including org.apache.flink:flink-connector-hbase-1.4_2.11:jar:1.12-SNAPSHOT in the shaded jar.
   [INFO] Including org.apache.flink:flink-connector-hbase-base_2.11:jar:1.12-SNAPSHOT in the shaded jar.
   [INFO] Excluding org.apache.hbase:hbase-server:jar:1.4.3 from the shaded jar.
   [INFO] Including org.apache.hbase:hbase-common:jar:1.4.3 in the shaded jar.
   [INFO] Excluding org.apache.avro:avro:jar:1.10.0 from the shaded jar.
   [INFO] Excluding com.fasterxml.jackson.core:jackson-core:jar:2.10.1 from the shaded jar.
   [INFO] Excluding com.fasterxml.jackson.core:jackson-databind:jar:2.10.1 from the shaded jar.
   [INFO] Excluding com.fasterxml.jackson.core:jackson-annotations:jar:2.10.1 from the shaded jar.
   [INFO] Excluding com.github.stephenc.findbugs:findbugs-annotations:jar:1.3.9-1 from the shaded jar.
   [INFO] Including org.apache.hbase:hbase-protocol:jar:1.4.3 in the shaded jar.
   [INFO] Including org.apache.hbase:hbase-procedure:jar:1.4.3 in the shaded jar.
   [INFO] Including org.apache.hbase:hbase-common:jar:tests:1.4.3 in the shaded jar.
   ```
   I believe these are test classes only, which we don't need to ship to our users?
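   To double-check that assumption, here is a minimal sketch (not part of this PR) that lists the `.class` entries of the `tests`-classifier jar from the local Maven repository; the repository path and the `TestsJarInspector` class name are made up for illustration:

   ```java
   import java.io.IOException;
   import java.nio.file.Path;
   import java.nio.file.Paths;
   import java.util.jar.JarFile;

   public class TestsJarInspector {

       public static void main(String[] args) throws IOException {
           // Assumed location in the local Maven repository; adjust to your environment.
           Path testsJar = Paths.get(System.getProperty("user.home"),
                   ".m2", "repository", "org", "apache", "hbase", "hbase-common", "1.4.3",
                   "hbase-common-1.4.3-tests.jar");

           try (JarFile jar = new JarFile(testsJar.toFile())) {
               // Print a sample of the bundled classes; seeing only test utilities and
               // test-support packages here would support the "test classes only" assumption.
               jar.stream()
                       .map(entry -> entry.getName())
                       .filter(name -> name.endsWith(".class"))
                       .limit(20)
                       .forEach(System.out::println);
           }
       }
   }
   ```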
##########
 File path: tools/ci/java-ci-tools/src/main/java/org/apache/flink/tools/ci/licensecheck/LicenseChecker.java
##########
@@ -0,0 +1,320 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.tools.ci.licensecheck;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.Multimap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Scanner;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * Utility class checking for proper NOTICE files based on the maven build output.
+ */
+public class LicenseChecker {
+    // ---------------------------------------- Launcher ---------------------------------------- //
+
+    private static final Logger LOG = LoggerFactory.getLogger(LicenseChecker.class);
+
+    public static void main(String[] args) throws IOException {
+        if (args.length < 2) {
+            System.out.println("Usage: LicenseChecker <pathMavenBuildOutput> <pathFlinkRoot>");
+            System.exit(1);
+        }
+        LOG.warn("THIS UTILITY IS ONLY CHECKING FOR COMMON LICENSING MISTAKES. A MANUAL CHECK OF THE NOTICE FILES, DEPLOYED ARTIFACTS, ETC. IS STILL NEEDED!");
+
+        LicenseChecker checker = new LicenseChecker();
+        int severeIssueCount = checker.run(new File(args[0]), new File(args[1]));
+
+        if (severeIssueCount > 0) {
+            LOG.warn("Found a total of {} severe license issues", severeIssueCount);
+
+            System.exit(1);
+        }
+        LOG.info("License check completed without severe issues.");
+    }
+
+    // ---------------------------------------- License Checker ---------------------------------------- //
+
+    private static final List<String> MODULES_SKIPPING_DEPLOYMENT = loadFromResources("modules-skipping-deployment.modulelist");
+
+    private static final List<String> MODULES_DEFINING_EXCESS_DEPENDENCIES = loadFromResources("modules-defining-excess-dependencies.modulelist");
+
+    // pattern for maven shade plugin
+    private static final Pattern SHADE_NEXT_MODULE_PATTERN = Pattern.compile(".*:shade \\((shade-flink|default)\\) @ ([^ _]+)(_[0-9.]+)? --.*");
+    private static final Pattern SHADE_INCLUDE_MODULE_PATTERN = Pattern.compile(".*Including ([^:]+):([^:]+):jar:([^ ]+) in the shaded jar");
+
+    // pattern for maven-dependency-plugin copied dependencies
+    private static final Pattern DEPENDENCY_COPY_NEXT_MODULE_PATTERN = Pattern.compile(".*maven-dependency-plugin:3.1.1:copy \\([^)]+\\) @ ([^ _]+)(_[0-9.]+)? --.*");
--.*"); + private static final Pattern DEPENDENCY_COPY_INCLUDE_MODULE_PATTERN = Pattern.compile(".*Configured Artifact: ([^:]+):([^:]+):([^:]+):jar.*"); + + private static final Pattern NOTICE_DEPENDENCY_PATTERN = Pattern.compile("- ([^ :]+):([^:]+):([^ ]+)($| )|.*bundles \"([^:]+):([^:]+):([^\"]+)\".*"); + + private int run(File buildResult, File root) throws IOException { + int severeIssueCount = 0; + // parse included dependencies from build output + Multimap<String, IncludedDependency> modulesWithShadedDependencies = parseModulesFromBuildResult(buildResult); + LOG.info("Extracted " + modulesWithShadedDependencies.asMap().keySet().size() + " modules with a total of " + modulesWithShadedDependencies.values().size() + " dependencies"); + + // find modules producing a shaded-jar + List<File> noticeFiles = findNoticeFiles(root); + LOG.info("Found {} NOTICE files to check", noticeFiles.size()); + + // check that all required NOTICE files exists + severeIssueCount += ensureRequiredNoticeFiles(modulesWithShadedDependencies, noticeFiles); + + // check each NOTICE file + for (File noticeFile: noticeFiles) { + severeIssueCount += checkNoticeFile(modulesWithShadedDependencies, noticeFile); + } + + // find modules included in flink-dist + + return severeIssueCount; + } + + private int ensureRequiredNoticeFiles(Multimap<String, IncludedDependency> modulesWithShadedDependencies, List<File> noticeFiles) { + int severeIssueCount = 0; + Set<String> shadingModules = new HashSet<>(modulesWithShadedDependencies.keys()); + shadingModules.removeAll(noticeFiles.stream().map(LicenseChecker::getModuleFromNoticeFile).collect(Collectors.toList())); + for (String moduleWithoutNotice : shadingModules) { + if (!MODULES_SKIPPING_DEPLOYMENT.contains(moduleWithoutNotice)) { + LOG.warn("Module {} is missing a NOTICE file. It has shaded dependencies: {}", moduleWithoutNotice, modulesWithShadedDependencies.get(moduleWithoutNotice)); + severeIssueCount++; + } + } + return severeIssueCount; + } + + private static String getModuleFromNoticeFile(File noticeFile) { + File moduleFile = noticeFile.getParentFile() // META-INF + .getParentFile() // resources + .getParentFile() // main + .getParentFile() // src + .getParentFile(); // <-- module name + return moduleFile.getName(); + } + + private int checkNoticeFile(Multimap<String, IncludedDependency> modulesWithShadedDependencies, File noticeFile) throws IOException { + int severeIssueCount = 0; + String moduleName = getModuleFromNoticeFile(noticeFile); + + // 1st line contains module name + String noticeContents = readFile(noticeFile.toPath()); + if (!noticeContents.startsWith(moduleName)) { + String firstLine = noticeContents.substring(0, noticeContents.indexOf('\n')); + LOG.warn("Expected first file of notice file to start with module name. 
+        }
+
+        // collect all declared dependencies from NOTICE file
+        Set<IncludedDependency> declaredDependencies = new HashSet<>();
+        try (BufferedReader br = new BufferedReader(new StringReader(noticeContents))) {
+            String line;
+            while ((line = br.readLine()) != null) {
+                Matcher noticeDependencyMatcher = NOTICE_DEPENDENCY_PATTERN.matcher(line);
+                if (noticeDependencyMatcher.find()) {
+                    String groupId = noticeDependencyMatcher.group(1);
+                    String artifactId = noticeDependencyMatcher.group(2);
+                    String version = noticeDependencyMatcher.group(3);
+                    if (groupId == null && artifactId == null && version == null) { // "bundles" case
+                        groupId = noticeDependencyMatcher.group(5);
+                        artifactId = noticeDependencyMatcher.group(6);
+                        version = noticeDependencyMatcher.group(7);
+                    }
+                    IncludedDependency toAdd = IncludedDependency.create(groupId, artifactId, version);
+                    if (!declaredDependencies.add(toAdd)) {
+                        LOG.warn("Dependency {} has been declared twice in module {}", toAdd, moduleName);
+                    }
+                }
+            }
+        }
+        // print all dependencies missing from NOTICE file
+        Set<IncludedDependency> expectedDependencies = new HashSet<>(modulesWithShadedDependencies.get(moduleName));
+        expectedDependencies.removeAll(declaredDependencies);
+        for (IncludedDependency missingDependency : expectedDependencies) {
+            LOG.error("Could not find dependency {} in NOTICE file {}", missingDependency, noticeFile);
+            severeIssueCount++;
+        }
+
+        if (!MODULES_DEFINING_EXCESS_DEPENDENCIES.contains(moduleName)) {
+            // print all dependencies defined in NOTICE file, which were not expected
+            Set<IncludedDependency> excessDependencies = new HashSet<>(declaredDependencies);
+            excessDependencies.removeAll(modulesWithShadedDependencies.get(moduleName));
+            for (IncludedDependency excessDependency : excessDependencies) {
+                LOG.warn("Dependency {} is mentioned in NOTICE file {}, but is not expected there", excessDependency, noticeFile);
+            }
+        }
+
+        return severeIssueCount;
+    }
+
+    private static String readFile(Path path) throws IOException {
+        byte[] encoded = Files.readAllBytes(path);
+        return new String(encoded, Charset.defaultCharset());
+    }
+
+    private List<File> findNoticeFiles(File root) throws IOException {
+        return Files.walk(root.toPath())
+            .filter(file -> {
+                int nameCount = file.getNameCount();
+                return file.getName(nameCount - 1).toString().equals("NOTICE")
+                    && file.getName(nameCount - 2).toString().equals("META-INF")
+                    && file.getName(nameCount - 3).toString().equals("resources");
+            })
+            .map(Path::toFile)
+            .collect(Collectors.toList());
+    }
+
+    private Multimap<String, IncludedDependency> parseModulesFromBuildResult(File buildResult) throws IOException {
+        Multimap<String, IncludedDependency> result = ArrayListMultimap.create();
+        try (BufferedReader br = new BufferedReader(new FileReader(buildResult))) {

Review comment:
   I agree that this is an alternative approach, but it would imply materializing the entire build output as strings in the JVM, and I'm not sure the code becomes much more readable with multiple nested loops fiddling with line indexes. For the sake of my time, I would like to keep the code as-is for now. I agree it is not perfectly easy to understand, but the improvement wouldn't be substantial enough to justify the performance and reimplementation cost.
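   As a side note on the trade-off discussed above, here is a rough sketch of the two parsing styles (not code from this PR; the class and method names are invented for illustration): the in-memory variant reads the whole build log into a list and walks it with index arithmetic, while the streaming variant, closer to what `LicenseChecker` does, touches one line at a time but has to carry the "current module" state along.

   ```java
   import java.io.IOException;
   import java.nio.file.Files;
   import java.nio.file.Path;
   import java.util.List;
   import java.util.stream.Stream;

   public class BuildOutputParsingSketch {

       // In-memory variant: simple forward scans over indexes, but the whole log sits on the heap.
       static void parseInMemory(Path buildOutput) throws IOException {
           List<String> lines = Files.readAllLines(buildOutput);
           for (int i = 0; i < lines.size(); i++) {
               if (lines.get(i).contains(":shade (")) {
                   // scan forward from i until the next plugin invocation,
                   // collecting "Including ..." lines for this module
               }
           }
       }

       // Streaming variant: only one line is in memory at a time, at the cost of tracking
       // which module's shade output we are currently inside.
       static void parseStreaming(Path buildOutput) throws IOException {
           try (Stream<String> lines = Files.lines(buildOutput)) {
               lines.forEach(line -> {
                   // match a "next module" pattern here to update the current module,
                   // and an "Including ..." pattern to record its shaded dependencies
               });
           }
       }
   }
   ```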
##########
 File path: flink-connectors/flink-sql-connector-hbase-1.4/pom.xml
##########
@@ -87,6 +87,7 @@ under the License.
         <exclude>org.apache.hbase:hbase-metrics*</exclude>
         <exclude>org.apache.hbase:hbase-server*</exclude>
         <exclude>org.apache.hbase:hbase-hadoop*-compat</exclude>
+        <exclude>org.apache.hbase:hbase-common:jar:tests</exclude>

Review comment:
   Yes, true. I will look into this.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]
