ArielGlenn has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/359907 )

Change subject: script to batch 7z recompress revision content history files 
manually
......................................................................

script to batch 7z recompress revision content history files manually

Hope to not need this again but, you know.

Bug: T168223
Change-Id: Iaf6a0db8dcecddd0ec903cbf222c1eb57f24db77
---
A xmldumps-backup/do_7z_jobs.sh
1 file changed, 206 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/dumps 
refs/changes/07/359907/1

diff --git a/xmldumps-backup/do_7z_jobs.sh b/xmldumps-backup/do_7z_jobs.sh
new file mode 100755
index 0000000..a08d372
--- /dev/null
+++ b/xmldumps-backup/do_7z_jobs.sh
@@ -0,0 +1,206 @@
+#!/bin/bash
+# no error checking, we don't care. if file fails we'll
+# rerun it by hand later
+
+# locks wiki for date, recompresses revision history content bz2
+# files to 7z files, doing recompression in batches.
+# does NOT: update md5s, status, dumpruninfo, symlinks, etc.
+# does NOT: clean up old dumps, remove old files from run
+
+# Print a summary of the command-line options to stdout and exit the
+# script with status 1. Never returns to the caller.
+usage() {
+    # --numjobs is mandatory (check_opts rejects an empty value), so it is
+    # listed in the synopsis without brackets; --jobinfo is a comma-separated
+    # list of part numbers, matching how the script splits it.
+    echo "Usage: $0 --config <pathtofile> --wiki <dbname>"
+    echo "  --date <YYYYMMDD> --jobinfo <partnum>[,<partnum>...]"
+    echo "  --numjobs <num> [--skiplock] [--dryrun] [--verbose]"
+    echo
+    echo "  --config   path to configuration file for dump generation"
+    echo "  --wiki     dbname of wiki"
+    echo "  --jobinfo  partnum,partnum2,..."
+    echo "  --date     date of run"
+    echo "  --numjobs  number of jobs to run simultaneously"
+    echo "  --skiplock don't lock the wiki (use with care!)"
+    echo "  --dryrun   don't run commands, show what would have been done"
+    echo "  --verbose  print commands as they are run, etc"
+    exit 1
+}
+
+set_defaults() {
+    CONFIGFILE=""
+    WIKI=""
+    JOBINFO=""
+    DATE=""
+    NUMJOBS=""
+    SKIPLOCK=""
+    DRYRUN=""
+    VERBOSE=""
+}
+
+process_opts () {
+    while [ $# -gt 0 ]; do
+       if [ $1 == "--config" ]; then
+               CONFIGFILE="$2"
+               shift; shift;
+       elif [ $1 == "--wiki" ]; then
+               WIKI="$2"
+               shift; shift
+       elif [ $1 == "--jobinfo" ]; then
+               JOBINFO="$2"
+               shift; shift
+       elif [ $1 == "--date" ]; then
+               DATE="$2"
+               shift; shift
+       elif [ $1 == "--numjobs" ]; then
+               NUMJOBS="$2"
+               shift; shift
+       elif [ $1 == "--skiplock" ]; then
+               SKIPLOCK="true"
+               shift
+       elif [ $1 == "--dryrun" ]; then
+               DRYRUN="true"
+               shift
+       elif [ $1 == "--verbose" ]; then
+               VERBOSE="true"
+               shift
+       else
+               echo "$0: Unknown option $1"
+               usage
+       fi
+    done
+}
+
+check_opts() {
+    if [ -z "$WIKI" -o -z "$JOBINFO" -o -z "$DATE" -o -z "$CONFIGFILE" -o -z 
"$NUMJOBS" ]; then
+        echo "$0: Mandatory options 'wiki', 'jobinfo', 'date', 'numjobs' and 
'config' must be specified"
+        usage
+    elif [ ! -f "$CONFIGFILE" ]; then
+            echo "Could not find config file: $CONFIGFILE"
+            echo "Exiting..."
+            exit 1
+    fi
+    # sanity check of date
+    result=`date -d "$DATE"`
+    if [ -z "$result" ]; then
+       echo "bad date given for 'date' arg"
+        usage
+    fi
+}
+
+get_dumps_output_dir() {
+    DUMPS_OUTPUT_ROOT=$( /usr/bin/python $WIKIDUMP_BASE/getconfigvals.py 
--configfile "$CONFIGFILE" --args 'output:public' --format values )
+    if [ -z "$DUMPS_OUTPUT_ROOT" ]; then
+       echo "Failed to get dumps output root dir from config file, giving up"
+       exit 1
+    fi
+    DUMPS_OUTPUT_DIR="${DUMPS_OUTPUT_ROOT}/${WIKI}/${DATE}"
+}
+
+# Collect the bz2 files of the current part that are safe to recompress.
+# Files for which checkforbz2footer exits nonzero are left out (they are
+# still being written, etc); we will not recompress those.
+# Globals: reads DUMPS_OUTPUT_DIR, WIKI, DATE, PARTNUM;
+#          sets bz2files_completed (array of file paths, possibly empty).
+get_bz2files_completed() {
+    local candidate
+    bz2files_completed=()
+    # glob directly instead of parsing ls output, so paths are never
+    # word-split and a missing match produces no ls error
+    for candidate in "${DUMPS_OUTPUT_DIR}/${WIKI}-${DATE}-pages-meta-history${PARTNUM}.xml"*.bz2; do
+        # an unmatched glob is left as the literal pattern; skip it
+        if [ ! -e "$candidate" ]; then
+            continue
+        fi
+        if /usr/local/bin/checkforbz2footer "$candidate"; then
+            bz2files_completed+=("$candidate")
+        fi
+    done
+}
+
+# Build the two halves of the recompression pipeline for one bz2 file.
+# Arguments: $1 - path of the bz2 input file
+# Globals:   sets inputfile, outputfile (the input path with its trailing
+#            ".bz2" replaced by ".7z") and the command arrays ZCAT_COMMAND
+#            and SEVENZ_COMMAND.
+setup_recompression_command() {
+    inputfile="$1"
+    # Strip only a literal ".bz2" at the very end of the path. A sed
+    # 's/.bz2/.7z/g' would treat the dot as "any character" and rewrite
+    # every such match anywhere in the path, directories included.
+    outputfile="${inputfile%.bz2}.7z"
+    ZCAT_COMMAND=("/bin/bzcat" "$inputfile")
+    SEVENZ_COMMAND=("/usr/bin/7za" "a" "-mx=4" "-si" "$outputfile")
+}
+
+do_recompression() {
+    # this many processes at once
+    LIMIT="$1"
+    while :
+    do
+        if [ ${#bz2files_completed[*]} -eq 0 ]; then
+            break
+       elif [ ${#bz2files_completed[*]} -lt $LIMIT ]; then
+            end=${#bz2files_completed[*]} 
+       else
+            end=$LIMIT
+       fi
+
+       files_in_batch=(${bz2files_completed[@]:0:$end})
+
+        wait_pids=()
+       files_doing=()
+        if [ -n "$DRYRUN" -o -n "$VERBOSE" ]; then
+            echo "new batch"
+        fi
+       for filename in ${files_in_batch[@]}; do
+            setup_recompression_command "$filename"
+            if [ -e $outputfile ]; then
+                continue;
+            fi
+           if [ -n "$DRYRUN" -o -n "$VERBOSE" ]; then
+               echo  "${ZCAT_COMMAND[@]} | ${SEVENZ_COMMAND[@]}"
+           fi
+           if [ -z "$DRYRUN" ]; then
+                ( ${ZCAT_COMMAND[@]} | ${SEVENZ_COMMAND[@]} ) &
+               wait_pids+=($!)
+               files_doing+=("$outputfile")
+            fi
+       done
+       i=0
+       for pid in ${wait_pids[*]}; do
+           wait $pid
+           if [ $? -ne 0 ]; then
+               echo "failed to generate" ${files_doing[$i]} "with nonzero exit 
code"
+            fi
+           ((i++))
+       done
+       bz2files_completed=(${bz2files_completed[@]:$end})
+    done
+}
+
+# Start dump_lock.py in the background to take the lock for WIKI and DATE,
+# and exit the script with status 1 if that process is no longer running
+# two seconds later. Does nothing in dry-run mode.
+# Globals: reads WIKIDUMP_BASE, WIKI, DATE, CONFIGFILE, DRYRUN, VERBOSE;
+#          sets lockerpid to the pid of the background lock process.
+lockerup() {
+    if [ -n "$DRYRUN" ]; then
+        return 0
+    fi
+    # every argument is quoted so that e.g. a config path containing
+    # spaces is passed on as a single word
+    /usr/bin/python "$WIKIDUMP_BASE/dump_lock.py" \
+        --wiki "$WIKI" --date "$DATE" --configfile "$CONFIGFILE" &
+    lockerpid=$!
+    sleep 2  #  wait a bit, give the process time to finish up if it failed
+    # see if it's still running (which means it got the lock)
+    if ! kill -0 "$lockerpid" >/dev/null 2>&1; then
+        echo "failed to get lock, exiting"
+        exit 1
+    elif [ -n "$VERBOSE" ]; then
+        echo "got lock"
+    fi
+}
+
+# Send SIGHUP to the background lock process started by lockerup.
+# Does nothing in dry-run mode or when no lock process was started.
+# Globals: reads DRYRUN, VERBOSE, lockerpid.
+cleanup_lock() {
+    if [ -n "$DRYRUN" ] || [ -z "$lockerpid" ]; then
+        return 0
+    fi
+    # only report the lock as removed when the signal was really delivered;
+    # if the process is already gone, kill prints its own error to stderr
+    if kill -HUP "$lockerpid"; then
+        if [ -n "$VERBOSE" ]; then
+            echo "removed lock"
+        fi
+    fi
+}
+
+# Main flow: parse and check the options, take the wiki lock unless
+# --skiplock was given, then recompress the completed bz2 files of each
+# requested part in batches of NUMJOBS.
+WIKIDUMP_BASE=$(dirname "$0")
+set_defaults
+process_opts "$@"
+check_opts
+# split the comma-separated --jobinfo value into one array entry per part
+IFS=',' read -r -a JOBARRAY <<< "$JOBINFO"
+if [ -z "$SKIPLOCK" ]; then
+    lockerup
+    # release the lock on every exit path from here on, including the
+    # exit 1 in get_dumps_output_dir, not only at the normal end of the run
+    trap cleanup_lock EXIT
+fi
+get_dumps_output_dir
+for PARTNUM in "${JOBARRAY[@]}"; do
+    get_bz2files_completed
+    do_recompression "$NUMJOBS"
+done

-- 
To view, visit https://gerrit.wikimedia.org/r/359907
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iaf6a0db8dcecddd0ec903cbf222c1eb57f24db77
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <ar...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to