[14/50] [abbrv] brooklyn-client git commit: refactor prefix filtering and spike the same update-index trick to step 2 initial history clean, but it's not faster here

heneveld Mon, 01 Feb 2016 10:01:39 -0800

refactor prefix filtering and spike the same update-index trick to step 2 
initial history clean, but it's not faster here



Project: http://git-wip-us.apache.org/repos/asf/brooklyn-client/repo
Commit: http://git-wip-us.apache.org/repos/asf/brooklyn-client/commit/4e533c85
Tree: http://git-wip-us.apache.org/repos/asf/brooklyn-client/tree/4e533c85
Diff: http://git-wip-us.apache.org/repos/asf/brooklyn-client/diff/4e533c85

Branch: refs/heads/master
Commit: 4e533c854e9748789484c5ad5bbd75839e19e74c
Parents: 61ef068
Author: Alex Heneveld <[email protected]>
Authored: Wed Dec 16 14:41:02 2015 +0000
Committer: Alex Heneveld <[email protected]>
Committed: Wed Dec 16 16:59:48 2015 +0000

----------------------------------------------------------------------
 2-clean-history.sh     |  6 +++++-
 grep-lines-starting.sh | 20 ++++++++++++++++++++
 make-whitelist.sh      | 16 +---------------
 3 files changed, 26 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/4e533c85/2-clean-history.sh
----------------------------------------------------------------------
diff --git a/2-clean-history.sh b/2-clean-history.sh
index dd23a62..779bc17 100755
--- a/2-clean-history.sh
+++ b/2-clean-history.sh
@@ -16,7 +16,11 @@ git checkout master
 # now make master reorganised, if reorg branch exists
 git reset --hard reorg
 
-git filter-branch --index-filter "git rm -r --cached --ignore-unmatch $(echo $( cat ${basedir}/big-files-to-remove.txt ))" master ${branches}
+git filter-branch -f --index-filter "git rm -r -q --cached --ignore-unmatch $(echo $( cat ${basedir}/big-files-to-remove.txt ))" master
+## above is slightly faster than below, because (compared w step 3) we have fewer patterns and (compared w step 4) we are benefitting from rm's native pattern matching
+# git filter-branch -f --index-filter \
+#  "git ls-files > /tmp/TMP-clean-history-LS ; ${basedir}/grep-lines-starting.sh ${basedir}/big-files-to-remove.txt /tmp/TMP-clean-history-LS | git update-index --force-remove --stdin" \
+#  --tag-name-filter cat --prune-empty master ${branches}
 
 # option 2: delete the entire example *if* it contains binaries but keep it if it doesn't - means that the project will suddenly appear in history but should work when it does appear
 # (we have gone for option 1, just cutting the big files)

http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/4e533c85/grep-lines-starting.sh
----------------------------------------------------------------------
diff --git a/grep-lines-starting.sh b/grep-lines-starting.sh
new file mode 100755
index 0000000..df1e29c
--- /dev/null
+++ b/grep-lines-starting.sh
@@ -0,0 +1,20 @@
+
+# efficient way to find lines beginning with any of the given prefixes, sorted
+# much faster than grep -f (with or without -F) for big files because of the line start logic and sort -- O(N log N) rather than O(N^2)
+
+if [ -z "$2" ] ; then echo "Usage:  grep-lines-starting.sh <prefix_file> <lines>     # to find all lines starting with any prefix in <prefix_file>" ; exit 1 ; fi
+
+PREFIX_FILE=$1
+INPUT=$2
+TMP=/tmp/remove-prefixes-tmp
+
+cat $PREFIX_FILE | awk '{if ($1) print $0"\tMATCH_THIS" }' | cat - $INPUT | sort -u > ${TMP}_merged
+cat ${TMP}_merged | awk -F $'\t' '{
+    if ($2=="MATCH_THIS") {
+      if (!patt || substr($1,0,length(patt))!=patt) { patt=$1; }
+      if (last==patt) { print last; }
+    } else {
+      last=$0;
+      if (patt && substr(last,0,length(patt))==patt) { print last; }
+    } }' | sort -u
+

http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/4e533c85/make-whitelist.sh
----------------------------------------------------------------------
diff --git a/make-whitelist.sh b/make-whitelist.sh
index 84e6ed8..eb31841 100755
--- a/make-whitelist.sh
+++ b/make-whitelist.sh
@@ -51,21 +51,7 @@ while [ -s $TODO_REMAINING ] ; do
   cat ${TODO_HERE}_ids | xargs -L -n100 git show -l99999 -M50 -C90 --name-status --format="ID: %H" | grep -v ^ID: | awk -F $'\t' '{ if ($3) print $3"\t"$2; else print $2; }' | sort -u >> ${TODO_HERE}_allpaths
 
   echo comparing `cat ${TODO_HERE}_allpaths | wc -l` candidate files against paths...
-  cat $TODO_REMAINING | awk '{print $0"\tMATCH_THIS" }' | cat - ${TODO_HERE}_allpaths | sort -u > ${TODO_HERE}_merged
-  cat ${TODO_HERE}_merged | awk -F $'\t' '{ 
-    if ($2=="MATCH_THIS") { 
-      if (!patt || substr($1,0,length(patt))!=patt) { patt=$1; } 
-      if (last1==patt) { print last1; if (last2) print last2; } 
-      last1=""; 
-    } else { 
-      last1=$1; last2=$2; 
-      if (patt && substr(last1,0,length(patt))==patt) { print last1; if (last2) print last2; } 
-    } }' | sort -u -o ${TODO_HERE}
-   # logging for the above, if needed
-#  echo MATCHING for $OUTPUT_FILENAME : >> ${ORIG_DIR}/log
-#  cat ${TODO_HERE}_merged | awk -F $'\t' '{ if ($2=="MATCH_THIS") { if (!patt || substr($1,0,length(patt))!=patt) { patt=$1; } 
-#      if (last1==patt) { print "MATCH LAST on "patt" ADDS "last1" "last2; } last1=""; }
-#    else { last1=$1; last2=$2; if (patt && substr(last1,0,length(patt))==patt) { print "MATCH NEXT on "patt" ADDS "last1" "last2; } } }' >> ${ORIG_DIR}/log
+  ${ORIG_DIR}/grep-lines-starting.sh ${TODO_REMAINING} ${TODO_HERE}_allpaths | awk -F $'\t' '{print $1; if ($2) print $2;}' | sort -u -o ${TODO_HERE}
 
   comm -23 ${TODO_HERE} $OUTPUT > ${TODO_REMAINING}
   cat $OUTPUT ${TODO_HERE} | sort -u -o ${OUTPUT}

[14/50] [abbrv] brooklyn-client git commit: refactor prefix filtering and spike the same update-index trick to step 2 initial history clean, but it's not faster here

Reply via email to