IMPALA-4110: Apache RAT script on Impala tarballs. Apache RAT is a tool for license auditing. It will be used as part of the Apache release process. This patch includes a script for parsing RAT's output and a file containing a list of filename globs that should be ignored. The script takes two command-line parameters as input: the name of the file containing the globs to ignore, and the name of the file containing RAT's XML output.
Change-Id: Ic95bd38fbb90f9a901602dd91cee541b16bf4714 Reviewed-on: http://gerrit.cloudera.org:8080/4405 Reviewed-by: Alex Behm <[email protected]> Tested-by: Jim Apple <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/aa28e37e Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/aa28e37e Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/aa28e37e Branch: refs/heads/master Commit: aa28e37eb7df9a53dbcf7aabe910095681dd3a51 Parents: f4bbd41 Author: Jim Apple <[email protected]> Authored: Tue Sep 13 14:47:57 2016 -0700 Committer: Jim Apple <[email protected]> Committed: Wed Sep 14 14:42:23 2016 +0000 ---------------------------------------------------------------------- bin/check-rat-report.py | 80 +++++++++++++++++++++++++++++ bin/rat_exclude_files.txt | 111 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 191 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/aa28e37e/bin/check-rat-report.py ---------------------------------------------------------------------- diff --git a/bin/check-rat-report.py b/bin/check-rat-report.py new file mode 100755 index 0000000..2181015 --- /dev/null +++ b/bin/check-rat-report.py @@ -0,0 +1,80 @@ +#!/usr/bin/python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Apache RAT is a tool for checking license compliance. This is a script that uses Apache +# RAT to check licenses in Impala. +# +# It takes as command line parameters two file names - the first is the name of a file +# containing globs of files to ignore, and the second is the XML output of RAT. +# +# I tested this with +# +# pushd "${IMPALA_HOME}" +# export SANDBOX=$(mktemp -d) # Just a place to put files for testing +# echo "${SANDBOX}" +# git archive -o "${SANDBOX}/test-impala.tar.gz" HEAD # Make the tarball to check +# java -jar ~/Downloads/apache-rat-0.12/apache-rat-0.12.jar -x \ +# "${SANDBOX}/test-impala.tar.gz" >"${SANDBOX}/rat.xml" +# bin/check-rat-report.py bin/rat_exclude_files.txt "${SANDBOX}/rat.xml" +# +# This is copied from a similar file in Apache Kudu. Only RAT 0.12 is supported at this +# time. 
+ +import fnmatch +import re +import sys +import xml.etree.ElementTree as ET + +if len(sys.argv) != 3: + sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" % (sys.argv[0],)) + sys.exit(1) + +exclude_globs_filename = sys.argv[1] +xml_filename = sys.argv[2] + +globs = [line.strip() for line in open(exclude_globs_filename, "r") if "# " != line[0:2]] + +tree = ET.parse(xml_filename) +root = tree.getroot() +all_ok = True + +resources = root.findall('resource') +for r in resources: + approvals = r.findall('license-approval') + if approvals and approvals[0].attrib['name'] == 'true': + continue + clean_name = re.sub('^[^/]+/', '', r.attrib['name']) + excluded = False + for g in globs: + if fnmatch.fnmatch(clean_name, g): + excluded = True + break + if not excluded: + typename = r.findall('type')[0].attrib['name'] + if not (clean_name[0:9] == 'testdata/' and typename in ['archive', 'binary']): + sys.stderr.write( + "%s: %s\n" % + ('UNAPPROVED' if approvals else "NO APPROVALS; " + typename, clean_name)) + all_ok = False + +if not all_ok: + sys.exit(1) + +print 'OK' +sys.exit(0) http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/aa28e37e/bin/rat_exclude_files.txt ---------------------------------------------------------------------- diff --git a/bin/rat_exclude_files.txt b/bin/rat_exclude_files.txt new file mode 100644 index 0000000..d3ee414 --- /dev/null +++ b/bin/rat_exclude_files.txt @@ -0,0 +1,111 @@ +# These are the globs that RAT ignores when doing a copyright +# audit. Comments start with "# ". + +# http://www.apache.org/legal/src-headers.html: "A file without any +# degree of creativity in either its literal elements or its structure +# is not protected by copyright law; therefore, such a file does not +# require a license header." 
+.clang-format +.gitignore +*/.gitignore +*/rat_exclude_files.txt +be/src/testutil/htpasswd +be/src/testutil/*.key +tests/*/__init__.py +testdata/common/__init__.py +fe/src/test/resources/regionservers +shell/__init__.py +ssh_keys/id_rsa_impala +testdata/__init__.py +tests/__init__.py +www/index.html + +# See LICENSE.txt +be/src/gutil/* +www/highlight/* +www/DataTables*/* +www/datatables.* +www/bootstrap/* +tests/comparison/leopard/static/css/bootstrap* +tests/comparison/leopard/static/js/bootstrap* +shell/ext-py/prettytable-0.7.1/* +shell/ext-py/sqlparse-0.1.14/* +shell/ext-py/sasl-0.1.1/* +www/d3.v3.min.js +www/jquery/jquery-1.12.4.min.js + +# http://www.apache.org/legal/src-headers.html: "Short informational text files; for +# example README, INSTALL files. The expectation is that these files make it obvious which +# product they relate to." +be/src/testutil/certificates-info.txt +bin/README-RUNNING-BENCHMARKS +LOGS.md +README.md +*/README +*/README.dox +testdata/bin/README-BENCHMARK-TEST-GENERATION +tests/comparison/ORACLE.txt + +# http://www.apache.org/legal/src-headers.html: "Test data for which the addition of a +# source header would cause the tests to fail." 
+testdata/*.csv +testdata/*.test +be/src/testutil/*.pem +*.json +fe/src/test/resources/*.xml +fe/src/test/resources/hbase-jaas-client.conf.template +fe/src/test/resources/hbase-jaas-server.conf.template +llvm-ir/test-loop.bc +testdata/AllTypesError/*.txt +testdata/AllTypesErrorNoNulls/*.txt +*.avsc +*.parq +*.parquet +testdata/cluster/node_templates/cdh5/etc/hadoop/conf/*.xml.tmpl +testdata/cluster/node_templates/cdh5/etc/kudu/*.conf.tmpl +testdata/cluster/node_templates/common/etc/hadoop/conf/*.xml.tmpl +testdata/data/chars-formats.txt +testdata/data/chars-tiny.txt +testdata/data/decimal-tiny.txt +testdata/data/decimal_tbl.txt +testdata/data/overflow.txt +testdata/data/text-comma-backslash-newline.txt +testdata/data/text-dollar-hash-pipe.txt +testdata/data/widerow.txt +testdata/data/local_tbl/00000.txt +testdata/datasets/functional/functional_schema_template.sql +testdata/hive_benchmark/grepTiny/part-00000 +tests/pytest.ini +tests/shell/bad_impalarc +tests/shell/good_impalarc +tests/shell/shell.cmds +tests/shell/shell2.cmds +tests/shell/shell_error.cmds +tests/shell/test_close_queries.sql +tests/shell/test_file_comments.sql +tests/shell/test_file_no_comments.sql +tests/shell/test_var_substitution.sql + + +# Generated by Apache-licensed software: +be/src/transport/config.h + +# BSD 3-clause license that RAT can't seem to identify: +cmake_modules/FindJNI.cmake + +# http://www.apache.org/legal/resolved.html#category-a : Python Software Foundation +# License is allowed. +shell/pkg_resources.py + +# Notices in Impala as required by ASF rules: +DISCLAIMER +LICENSE.txt +NOTICE.txt + +# Notices in thirdparty sources included in the Impala repo and called out in /LICENSE.txt +be/src/thirdparty/squeasel/LICENSE + +# http://www.apache.org/legal/src-headers.html: 'Snippet' files that are combined as form +# a larger file where the larger file would have duplicate licensing headers. +www/all_child_groups.tmpl +www/common-footer.tmpl
