refactor prefix filtering and spike the same update-index trick to step 2 initial history clean, but it's not faster here
Project: http://git-wip-us.apache.org/repos/asf/brooklyn-client/repo Commit: http://git-wip-us.apache.org/repos/asf/brooklyn-client/commit/4e533c85 Tree: http://git-wip-us.apache.org/repos/asf/brooklyn-client/tree/4e533c85 Diff: http://git-wip-us.apache.org/repos/asf/brooklyn-client/diff/4e533c85 Branch: refs/heads/master Commit: 4e533c854e9748789484c5ad5bbd75839e19e74c Parents: 61ef068 Author: Alex Heneveld <[email protected]> Authored: Wed Dec 16 14:41:02 2015 +0000 Committer: Alex Heneveld <[email protected]> Committed: Wed Dec 16 16:59:48 2015 +0000 ---------------------------------------------------------------------- 2-clean-history.sh | 6 +++++- grep-lines-starting.sh | 20 ++++++++++++++++++++ make-whitelist.sh | 16 +--------------- 3 files changed, 26 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/4e533c85/2-clean-history.sh ---------------------------------------------------------------------- diff --git a/2-clean-history.sh b/2-clean-history.sh index dd23a62..779bc17 100755 --- a/2-clean-history.sh +++ b/2-clean-history.sh @@ -16,7 +16,11 @@ git checkout master # now make master reorganised, if reorg branch exists git reset --hard reorg -git filter-branch --index-filter "git rm -r --cached --ignore-unmatch $(echo $( cat ${basedir}/big-files-to-remove.txt ))" master ${branches} +git filter-branch -f --index-filter "git rm -r -q --cached --ignore-unmatch $(echo $( cat ${basedir}/big-files-to-remove.txt ))" master +## above is slightly faster than below, because (compared w step 3) we have fewer patterns and (compared w step 4) we are benefitting from rm's native pattern matching +# git filter-branch -f --index-filter \ +# "git ls-files > /tmp/TMP-clean-history-LS ; ${basedir}/grep-lines-starting.sh ${basedir}/big-files-to-remove.txt /tmp/TMP-clean-history-LS | git update-index --force-remove --stdin" \ +# --tag-name-filter cat --prune-empty 
master ${branches} # option 2: delete the entire example *if* it contains binaries but keep it if it doesn't - means that the project will suddenly appear in history but should work when it does appear # (we have gone for option 1, just cutting the big files) http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/4e533c85/grep-lines-starting.sh ---------------------------------------------------------------------- diff --git a/grep-lines-starting.sh b/grep-lines-starting.sh new file mode 100755 index 0000000..df1e29c --- /dev/null +++ b/grep-lines-starting.sh @@ -0,0 +1,20 @@ + +# efficient way to find lines beginning with any of the given prefixes, sorted +# much faster than grep -f (with or without -F) for big files because of the line start logic and sort -- O(N log N) rather than O(N^2) + +if [ -z "$2" ] ; then echo "Usage: grep-lines-starting.sh <prefix_file> <lines> # to find all lines starting with any prefix in <prefix_file>" ; exit 1 ; fi + +PREFIX_FILE=$1 +INPUT=$2 +TMP=/tmp/remove-prefixes-tmp + +cat $PREFIX_FILE | awk '{if ($1) print $0"\tMATCH_THIS" }' | cat - $INPUT | sort -u > ${TMP}_merged +cat ${TMP}_merged | awk -F $'\t' '{ + if ($2=="MATCH_THIS") { + if (!patt || substr($1,0,length(patt))!=patt) { patt=$1; } + if (last==patt) { print last; } + } else { + last=$0; + if (patt && substr(last,0,length(patt))==patt) { print last; } + } }' | sort -u + http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/4e533c85/make-whitelist.sh ---------------------------------------------------------------------- diff --git a/make-whitelist.sh b/make-whitelist.sh index 84e6ed8..eb31841 100755 --- a/make-whitelist.sh +++ b/make-whitelist.sh @@ -51,21 +51,7 @@ while [ -s $TODO_REMAINING ] ; do cat ${TODO_HERE}_ids | xargs -L -n100 git show -l99999 -M50 -C90 --name-status --format="ID: %H" | grep -v ^ID: | awk -F $'\t' '{ if ($3) print $3"\t"$2; else print $2; }' | sort -u >> ${TODO_HERE}_allpaths echo comparing `cat ${TODO_HERE}_allpaths | wc -l` candidate 
files against paths... - cat $TODO_REMAINING | awk '{print $0"\tMATCH_THIS" }' | cat - ${TODO_HERE}_allpaths | sort -u > ${TODO_HERE}_merged - cat ${TODO_HERE}_merged | awk -F $'\t' '{ - if ($2=="MATCH_THIS") { - if (!patt || substr($1,0,length(patt))!=patt) { patt=$1; } - if (last1==patt) { print last1; if (last2) print last2; } - last1=""; - } else { - last1=$1; last2=$2; - if (patt && substr(last1,0,length(patt))==patt) { print last1; if (last2) print last2; } - } }' | sort -u -o ${TODO_HERE} - # logging for the above, if needed -# echo MATCHING for $OUTPUT_FILENAME : >> ${ORIG_DIR}/log -# cat ${TODO_HERE}_merged | awk -F $'\t' '{ if ($2=="MATCH_THIS") { if (!patt || substr($1,0,length(patt))!=patt) { patt=$1; } -# if (last1==patt) { print "MATCH LAST on "patt" ADDS "last1" "last2; } last1=""; } -# else { last1=$1; last2=$2; if (patt && substr(last1,0,length(patt))==patt) { print "MATCH NEXT on "patt" ADDS "last1" "last2; } } }' >> ${ORIG_DIR}/log + ${ORIG_DIR}/grep-lines-starting.sh ${TODO_REMAINING} ${TODO_HERE}_allpaths | awk -F $'\t' '{print $1; if ($2) print $2;}' | sort -u -o ${TODO_HERE} comm -23 ${TODO_HERE} $OUTPUT > ${TODO_REMAINING} cat $OUTPUT ${TODO_HERE} | sort -u -o ${OUTPUT}
