Various further improvements to the scripts (JENA-977)

- Validate the sort temporary directory when indexing and WARN if the disk it is on is low on space (10% or less free)
- Support --debug and --trace flags in all scripts, and add various debug output throughout the scripts
- Fix a bug where a sort failure was not detected when pv is used to monitor progress
- Fix a bug in the size calculations used for progress monitoring and sort failure detection
This commit includes some temporary DEV changes that will be reverted later Project: http://git-wip-us.apache.org/repos/asf/jena/repo Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/7770596b Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/7770596b Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/7770596b Branch: refs/heads/JENA-977 Commit: 7770596bc94613409fe2753240b603ae22a38b57 Parents: a96b016 Author: Rob Vesse <[email protected]> Authored: Fri Jun 26 16:15:18 2015 +0100 Committer: Rob Vesse <[email protected]> Committed: Fri Jun 26 16:31:05 2015 +0100 ---------------------------------------------------------------------- apache-jena/bin/tdbloader2 | 59 +++++++++++----- apache-jena/bin/tdbloader2data | 43 ++++++++++-- apache-jena/bin/tdbloader2index | 126 ++++++++++++++++++++++++++++++----- 3 files changed, 192 insertions(+), 36 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/jena/blob/7770596b/apache-jena/bin/tdbloader2 ---------------------------------------------------------------------- diff --git a/apache-jena/bin/tdbloader2 b/apache-jena/bin/tdbloader2 index 9ff2727..9508031 100755 --- a/apache-jena/bin/tdbloader2 +++ b/apache-jena/bin/tdbloader2 @@ -49,6 +49,10 @@ Common additional options are as follows: Advanced additional options are as follows: + -d + --debug + Enable debug mode, adds extra debug output + -k --keep-work Keeps the temporary work files around after they are no longer @@ -65,6 +69,10 @@ Advanced additional options are as follows: When no phase is specified it defaults to all + -t + --trace + Enable trace mode, essentially sets -x within the scripts + EOF } @@ -101,13 +109,12 @@ case "$(uname)" in esac export JENA_CP -#echo $JENA_CP -if [ -z "$SORT_ARGS" ] -then +# echo JENA_CP +if [ -z "$SORT_ARGS" ]; then SORT_ARGS="--buffer-size=50%" - if $(sort --parallel=3 < /dev/null 2>/dev/null) - then - SORT_ARGS="$SORT_ARGS --parallel=3" + sort 
--parallel=3 < /dev/null 2>/dev/null + if [ $? = 0 ]; then + SORT_ARGS="$SORT_ARGS --parallel=3" fi fi export SORT_ARGS @@ -116,11 +123,23 @@ export SORT_ARGS LOC= PHASE= KEEP_WORK=0 +DEBUG=0 +TRACE=0 while [ $# -gt 0 ] do ARG=$1 case "$ARG" in + -d|--debug) + # Debug Mode + shift + DEBUG=1 + ;; + -h|--help) + # Help + printUsage + exit 0 + ;; -k|--keep-work) # Keep work files shift @@ -143,10 +162,11 @@ do PHASE="$1" shift ;; - -h|--help) - # Help - printUsage - exit 0 + -t|--trace) + # Trace mode + shift + TRACE=1 + set -x ;; *) # Once we see an unrecognized argument treat as start of files to process @@ -159,9 +179,15 @@ if [ -z "$PHASE" ]; then PHASE="all" fi COMMON_ARGS= -if [ $KEEP_WORK = 0 ]; then +if [ $KEEP_WORK = 1 ]; then COMMON_ARGS="--keep-work" fi +if [ $DEBUG = 1 ]; then + COMMON_ARGS="$COMMON_ARGS --debug" +fi +if [ $TRACE = 1 ]; then + COMMON_ARGS="$COMMON_ARGS --trace" +fi log() { echo " $(date $DATE)" "$@" ; } @@ -172,16 +198,19 @@ DATE="+%H:%M:%S" log "-- TDB Bulk Loader Start" TIME1="$(date +%s)" +TOOL_DIR=$JENA_HOME/bin/ +# DEV - Following is just for debugging +TOOL_DIR= case "$PHASE" in all) - exec "$JENA_HOME/bin/tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@" - exec "$JENA_HOME/bin/tdbloader2index" $COMMON_ARGS --loc "$LOC" + exec "${TOOL_DIR}tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@" + exec "${TOOL_DIR}tdbloader2index" $COMMON_ARGS --loc "$LOC" ;; data) - exec "$JENA_HOME/bin/tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@" + exec "${TOOL_DIR}tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@" ;; index) - exec "$JENA_HOME/bin/tdbloader2index" $COMMON_ARGS --loc "$LOC" + exec "${TOOL_DIR}tdbloader2index" $COMMON_ARGS --loc "$LOC" ;; *) echo "Unrecognized phase $PHASE" 1>&2 http://git-wip-us.apache.org/repos/asf/jena/blob/7770596b/apache-jena/bin/tdbloader2data ---------------------------------------------------------------------- diff --git a/apache-jena/bin/tdbloader2data b/apache-jena/bin/tdbloader2data index 5aceb27..efb590a 100755 --- 
a/apache-jena/bin/tdbloader2data +++ b/apache-jena/bin/tdbloader2data @@ -52,35 +52,58 @@ Common additional options are as follows: Advanced additional options are as follows: + -d + --debug + Enable debug mode, adds extra debug output + -k --keep-work Keeps the temporary work files around after they are no longer needed. May be useful for debugging. + -t + --trace + Enable trace mode, essentially sets -x within the scripts + EOF } # Exit on error. set -e -# Sort order is ASCII -export LC_ALL="C" - log() { echo " $(date $DATE)" "$@" ; } +function debug() { + if [ $DEBUG = 1 ]; then + log "DEBUG" "$@" + fi +} + #DATE="+%Y-%m-%dT%H:%M:%S%:z" DATE="+%H:%M:%S" PKG=org.apache.jena.tdb.store.bulkloader2 +#DEV - Allows use against Jena 2 API +PKG=com.hp.hpl.jena.tdb.store.bulkloader2 # Process Arguments LOC= KEEP_WORK=0 +DEBUG=0 while [ $# -gt 0 ] do ARG=$1 case "$ARG" in + -d|--debug) + # Debug Mode + shift + DEBUG=1 + ;; + -h|--help) + printUsage + exit 0 + ;; -k|--keep-work) # Keep work files # This option is actually not used by this script but may be passed in @@ -99,9 +122,10 @@ do LOC=${ARG/-*loc=/} shift ;; - -h|--help) - printUsage - exit 0 + -t|--trace) + # Trace mode + shift + set -x ;; *) # Any further arguments are treated as data files @@ -124,18 +148,23 @@ fi if [ ! -e "$LOC" ] ; then # If non-existent try to create + debug "Trying to create new database directory: $LOC" mkdir "$LOC" if [ $? != 0 ]; then echo "Failed to create new directory: $LOC" exit 1 fi + debug "New database directory created: $LOC" fi if [ ! 
-d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; fi +# TODO Make LOC absolute + FILES="$@" ## JVM Arguments JVM_ARGS=${JVM_ARGS:--Xmx1200M} +debug "JVM Arguments are $JVM_ARGS" # Classpath set in "tdbloader2" if [ -z "$JENA_CP" ] @@ -151,6 +180,8 @@ log "Data Load Phase" DATA_TRIPLES="$LOC/data-triples.tmp" DATA_QUADS="$LOC/data-quads.tmp" +debug "Data files are $DATA_TRIPLES and $DATA_QUADS" + java $JVM_ARGS -cp "$JENA_CP" "$PKG".CmdNodeTableBuilder \ "--loc=$LOC" "--triples=$DATA_TRIPLES" "--quads=$DATA_QUADS" $FILES http://git-wip-us.apache.org/repos/asf/jena/blob/7770596b/apache-jena/bin/tdbloader2index ---------------------------------------------------------------------- diff --git a/apache-jena/bin/tdbloader2index b/apache-jena/bin/tdbloader2index index 2730af1..971b824 100755 --- a/apache-jena/bin/tdbloader2index +++ b/apache-jena/bin/tdbloader2index @@ -49,11 +49,18 @@ Common additional options are as follows: Advanced additional options are as follows: + -d + --debug + Enable debug mode, adds extra debug output + -k --keep-work Keeps the temporary work files around after they are no longer needed. May be useful for debugging. 
+ -t + --trace + Enable trace mode, essentially sets -x within the scripts EOF } @@ -65,20 +72,45 @@ export LC_ALL="C" log() { echo " $(date $DATE)" "$@" ; } -TMP=$$ +function debug() { + if [ $DEBUG = 1 ]; then + log "DEBUG" "$@" + fi +} + +function warn() { + log "WARN" "$@" +} + +function getSize() { + ls -l $1 | awk '{print $5}' +} + #DATE="+%Y-%m-%dT%H:%M:%S%:z" DATE="+%H:%M:%S" PKG=org.apache.jena.tdb.store.bulkloader2 +#DEV - Allows use against Jena 2 API +PKG=com.hp.hpl.jena.tdb.store.bulkloader2 # Process Arguments LOC= KEEP_WORK=0 +DEBUG=0 while [ $# -gt 0 ] do ARG=$1 case "$ARG" in + -d|--debug) + # Debug Mode + shift + DEBUG=1 + ;; + -h|--help) + printUsage + exit 0 + ;; -k|--keep-work) # Keep work files shift @@ -95,9 +127,10 @@ do LOC=${ARG/-*loc=/} shift ;; - -h|--help) - printUsage - exit 0 + -t|--trace) + # Trace mode + shift + set -x ;; *) # Any further arguments are ignored @@ -111,6 +144,8 @@ if [ -z "$LOC" ] ; then echo "No location specified" ; exit 1 ; fi if [ ! -e "$LOC" ] ; then echo "Location specified does not exist: $LOC" ; exit 1; fi if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; fi +# TODO Make LOC absolute + DATA_TRIPLES="$LOC/data-triples.tmp" DATA_QUADS="$LOC/data-quads.tmp" @@ -123,9 +158,12 @@ if [ ! -e "$DATA_QUADS" ]; then exit 1 fi +debug "Data files are $DATA_TRIPLES and $DATA_QUADS" + ##--parallel is not always available. SORT_ARGS="${SORT_ARGS:---buffer-size=50%}" JVM_ARGS=${JVM_ARGS:--Xmx1200M} +debug "JVM Arguments are $JVM_ARGS" # Classpath set in "tdbloader2" if [ -z "$JENA_CP" ] @@ -133,17 +171,57 @@ then echo "Classpath not provided : set JENA_CP" 1>&2 exit 1 fi +debug "Jena Classpath is $JENA_CP" # ---- Index intermediates ## All files are writtern S P O / G S P O columns per row but in different sort orders. 
log "Index Building Phase" # Check whether Pipe Viewer is available -# Needs to temporarily disable exit on error +# Needs to temporarily disable exit on error as which produces an error +# if the given command is not found set +e which pv >/dev/null 2>&1 HAS_PV=$? set -e +if [ $HAS_PV = 0 ]; then + debug "pv (Pipe Viewer) available on your system so sorts will show progres" +else + debug "No pv (Pipe Viewer) on your system so sorts will show no progress" +fi + +# Check where we are storing temporary sort files +debug "Sort Arguments: $SORT_ARGS" +SORT_TEMP_DIR= +if [[ "$SORT_ARGS" == *"-T "* ]]; then + # Specified via -T argument + SORT_TEMP_DIR=(${SORT_ARGS/-T /}) + SORT_TEMP_DIR=${SORT_TEMP_DIR[0]} +elif [[ "$SORT_ARGS" == *"--temporary-directory="* ]]; then + # Specified via --temporary-directory argument + SORT_TEMP_DIR=(${SORT_ARGS/--temporary-directory=/}) + SORT_TEMP_DIR=${SORT_TEMP_DIR[0]} +else + # Using the system temp directory + SORT_TEMP_DIR="$TMPDIR" +fi +debug "Sort Temp Directory: $SORT_TEMP_DIR" + +# Find out how much space is on the sort directory +SORT_DRIVE_INFO=$(df "$SORT_TEMP_DIR" | tail -n +2) +SORT_DRIVE_DISK=$(echo $SORT_DRIVE_INFO | awk '{print $1}') +SORT_DRIVE_FREE_SPACE=$(echo $SORT_DRIVE_INFO | awk '{print $4}') +SORT_DRIVE_USED=$(echo $SORT_DRIVE_INFO | awk '{print $5}') +SORT_DRIVE_FREE=${SORT_DRIVE_USED/"%"/} +SORT_DRIVE_FREE=$((100 - $SORT_DRIVE_FREE)) +debug "Sort Temp Directory ${SORT_TEMP_DIR} is on disk ${SORT_DRIVE_DISK} which has ${SORT_DRIVE_FREE}% free space (${SORT_DRIVE_FREE_SPACE} bytes)" + +if [ $SORT_DRIVE_FREE -le 10 ]; then + echo + warn "Sort Temp Directory ${SORT_TEMP_DIR} is on disk ${SORT_DRIVE_DISK} which only has ${SORT_DRIVE_FREE}% free space (${SORT_DRIVE_FREE_SPACE} bytes) available" + warn "This may result in sort failures if the data to be indexed is large" + echo +fi generate_index() { @@ -152,35 +230,52 @@ generate_index() local IDX=$3 local WORK="$LOC/$IDX-txt" - if [ ! -s "$DATA" ] - then + if [ ! 
-s "$DATA" ]; then + debug "Skipping Index $IDX as no relevant data to index" return fi log "Creating Index $IDX" # Sort the input data - log " Sort $IDX" + log "Sort $IDX" + debug "Sorting $DATA into work file $WORK" if [ $HAS_PV = 0 ]; then # Use pv (pipe viewer) to monitor sort progress # Note that progress data will only be seen if running in the foreground - SIZE=$(du -k "$DATA" | cut -f 1) + # To report progress need to know size of input data + SIZE=$(getSize "$DATA") + debug "Size of data to be sorted is $SIZE bytes" + pv -c -N data < "$DATA" | sort $SORT_ARGS -u $KEYS | pv -c -N sort -s $SIZE > $WORK + + # CAUTION + # If sort errors here then the piping through pv will stop us from seeing the error + # and we'll continue onwards + # Therefore we need to check that the output size is same as input size as this is + # the only way to tell if sort suceeded + OUTPUT_SIZE=$(getSize "$WORK") + debug "Size of sorted data is $OUTPUT_SIZE bytes" + if [ $SIZE != $OUTPUT_SIZE ]; then + log "Aborting due to sort error" + exit 1 + fi else # Use sort without any progress monitoring sort $SORT_ARGS -u $KEYS < "$DATA" > $WORK fi - log " Sort $IDX Completed" + log "Sort $IDX Completed" # Build into an index - log " Build $IDX" + log "Build $IDX" rm -f "$LOC/$IDX.dat" rm -f "$LOC/$IDX.idn" java -cp "$JENA_CP" "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK" - log " Build $IDX Completed" + log "Build $IDX Completed" # Remove work file unless keeping - if [ $KEEP_WORK = 1 ]; then + if [ $KEEP_WORK = 0 ]; then + debug "Cleaning up work file $WORK" rm "$WORK" fi } @@ -211,6 +306,7 @@ generate_index "$K4 $K2 $K3 $K1" "$DATA_QUADS" OSPG log "Index Building Phase Completed" # ---- Clean up. -if [ $KEEP_WORK = 1 ]; then - rm -f "$DATA_TRIPLES" "$DATA_QUADS" +if [ $KEEP_WORK = 0 ]; then + debug "Cleaning up data files $DATA_TRIPLES and $DATA_QUADS" + rm -f "$DATA_TRIPLES" "$DATA_QUADS" fi
