Repository: jena
Updated Branches:
  refs/heads/JENA-977 [created] 7770596bc


Initial work on refactoring tdbloader2 scripts (JENA-977)

- Better option processing
- Split tdbloader2worker into a data and index phase script
- Support only running a specific phase


Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/d92e3362
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/d92e3362
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/d92e3362

Branch: refs/heads/JENA-977
Commit: d92e336263da3f0f2a58dfc24cb9b5f23449cc5c
Parents: 13855a6
Author: Rob Vesse <[email protected]>
Authored: Thu Jun 25 16:56:29 2015 +0100
Committer: Rob Vesse <[email protected]>
Committed: Fri Jun 26 16:30:15 2015 +0100

----------------------------------------------------------------------
 apache-jena/bin/tdbloader2      |  72 +++++++++++++++-
 apache-jena/bin/tdbloader2data  | 107 ++++++++++++++++++++++++
 apache-jena/bin/tdbloader2index | 155 +++++++++++++++++++++++++++++++++++
 3 files changed, 333 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/d92e3362/apache-jena/bin/tdbloader2
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2 b/apache-jena/bin/tdbloader2
index c081074..37cc874 100755
--- a/apache-jena/bin/tdbloader2
+++ b/apache-jena/bin/tdbloader2
@@ -48,6 +48,7 @@ case "$(uname)" in
 esac
 
 export JENA_CP
+# Show the classpath handed to the phase scripts. Quoted so the value is
+# printed verbatim: unquoted, any wildcard entry would be glob-expanded and
+# the result word-split before echo sees it.
+echo "$JENA_CP"
 if [ -z "$SORT_ARGS" ]
 then
     SORT_ARGS="--buffer-size=50%"
@@ -58,4 +59,73 @@ then
 fi
 export SORT_ARGS
 
-exec "$JENA_HOME/bin/tdbloader2worker" "$@"
+# Process arguments
+LOC=
+PHASE=
+
+while [ $# -gt 0 ]
+do
+  ARG=$1
+  case "$ARG" in
+    --loc|-loc)
+      # Location space separated
+      shift
+      LOC="$1"
+      shift
+      ;;
+    -*loc=*)
+      # Location = separated
+      LOC=${ARG/-*loc=/}
+      shift
+      ;;
+    --phase)
+      # Phase space separated
+      shift
+      PHASE="$1"
+      shift
+      ;;
+    *)
+      # Once we see an unrecognized argument treat as start of files to process
+      break
+      ;;
+  esac
+done
+
+if [ -z "$PHASE" ]; then
+  PHASE="all"
+fi
+
+echo "Location is '$LOC'"
+echo "Phase is '$PHASE'"
+
+log() { echo " $(date $DATE)" "$@" ; }
+
+#DATE="+%Y-%m-%dT%H:%M:%S%:z"
+DATE="+%H:%M:%S"
+
+# ---- Start
+log "-- TDB Bulk Loader Start"
+TIME1="$(date +%s)"
+
+case "$PHASE" in
+  all)
+    exec "$JENA_HOME/bin/tdbloader2data" --loc "$LOC" "$@"
+    exec "$JENA_HOME/bin/tdbloader2index" --loc "$LOC"
+    ;;
+  data)
+    exec "$JENA_HOME/bin/tdbloader2data" --loc "$LOC" "$@"
+    ;;
+  index)
+    exec "$JENA_HOME/bin/tdbloader2index" --loc "$LOC"
+    ;;
+  *)
+    echo "Unrecognized phase $PHASE" 1>&2
+    exit 1
+    ;;
+esac
+
+# ---- End
+TIME2="$(date +%s)"
+log "-- TDB Bulk Loader Finish"
+ELAPSED=$(($TIME2-$TIME1))
+log "-- $ELAPSED seconds"
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/jena/blob/d92e3362/apache-jena/bin/tdbloader2data
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2data b/apache-jena/bin/tdbloader2data
new file mode 100755
index 0000000..90200e4
--- /dev/null
+++ b/apache-jena/bin/tdbloader2data
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+
+## Licensed to the Apache Software Foundation (ASF) under one
+## or more contributor license agreements.  See the NOTICE file
+## distributed with this work for additional information
+## regarding copyright ownership.  The ASF licenses this file
+## to you under the Apache License, Version 2.0 (the
+## "License"); you may not use this file except in compliance
+## with the License.  You may obtain a copy of the License at
+##
+##     http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+## See the License for the specific language governing permissions and
+## limitations under the License.
+
+# The environment for this sub-script is setup by "tdbloader2"
+
+# Exit on error.
+set -e
+
+# Sort order is ASCII
+export LC_ALL="C"
+
+log() { echo " $(date $DATE)" "$@" ; }
+
+#DATE="+%Y-%m-%dT%H:%M:%S%:z"
+DATE="+%H:%M:%S"
+
+## JVM Arguments
+JVM_ARGS=${JVM_ARGS:--Xmx1200M}
+
+# Classpath set in "tdbloader2"
+if [ -z "$JENA_CP" ]
+then
+    echo "Classpath not provided : set JENA_CP" 1>&2
+    exit 1
+fi
+
+USAGE="Usage: tdbloader2data --loc location datafile ..."
+PKG=org.apache.jena.tdb.store.bulkloader2
+
+# Parse leading options. The first unrecognised argument ends option parsing;
+# it and everything after it stay in "$@" as the data files.
+while [ $# -gt 0 ]
+do
+  ARG=$1
+  case "$ARG" in
+    --loc|-loc)
+      # Location space separated. The value is checked for explicitly: under
+      # "set -e" a failing shift would end the script without any message.
+      if [ $# -lt 2 ]; then
+        echo "Option $ARG requires a value" 1>&2
+        echo "$USAGE" 1>&2
+        exit 1
+      fi
+      LOC="$2"
+      shift 2
+      ;;
+    -*loc=*)
+      # Location = separated. Only the shortest leading "...loc=" is removed,
+      # so a location that itself contains "loc=" is kept intact.
+      LOC=${ARG#*loc=}
+      shift
+      ;;
+    --help)
+      echo "$USAGE"
+      exit 0
+      ;;
+    *)
+      # Any further arguments are treated as data files
+      break
+      ;;
+  esac
+done
+
+# Verify arguments
+if [ -z "$LOC" ] ; then echo "No location specified" ; exit 1 ; fi
+if [ $# = 0 ]; then echo "No data files specified" ; exit 1 ; fi
+
+# Look for any index and data files in the directory.
+# Skip a possible configuration file
+if test -n "$(find "$LOC" -maxdepth 1 -type f ! -name 'this.*' -print -quit)"
+then 
+    echo "Location is not empty: $LOC"
+    exit 1
+fi
+
+if [ ! -e "$LOC" ] ; then
+  # If non-existent try to create
+  mkdir "$LOC"
+  if [ $? != 0 ]; then
+    echo "Failed to create new directory: $LOC"
+    exit 1
+  fi
+fi
+if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; 
fi
+
+## Stdin?
+KEEPWORKFILES="${KEEPWORKFILES:-}"
+
+# ---- Data loading phase
+log "Data Load Phase"
+# Produce nodes file and triples/quads text file.
+
+DATA_TRIPLES="$LOC/data-triples.tmp"
+DATA_QUADS="$LOC/data-quads.tmp"
+
+# The data files are passed on as "$@" so that each file name reaches java as
+# exactly one argument, even if it contains whitespace or glob characters.
+# JVM_ARGS stays unquoted so it can carry several JVM options.
+java $JVM_ARGS -cp "$JENA_CP" "$PKG".CmdNodeTableBuilder \
+    "--loc=$LOC" "--triples=$DATA_TRIPLES" "--quads=$DATA_QUADS" "$@"
+
+log "Data Load Phase Completed"

http://git-wip-us.apache.org/repos/asf/jena/blob/d92e3362/apache-jena/bin/tdbloader2index
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2index b/apache-jena/bin/tdbloader2index
new file mode 100755
index 0000000..372aa5c
--- /dev/null
+++ b/apache-jena/bin/tdbloader2index
@@ -0,0 +1,155 @@
+#!/usr/bin/env bash
+
+## Licensed to the Apache Software Foundation (ASF) under one
+## or more contributor license agreements.  See the NOTICE file
+## distributed with this work for additional information
+## regarding copyright ownership.  The ASF licenses this file
+## to you under the Apache License, Version 2.0 (the
+## "License"); you may not use this file except in compliance
+## with the License.  You may obtain a copy of the License at
+##
+##     http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+## See the License for the specific language governing permissions and
+## limitations under the License.
+
+# The environment for this sub-script is setup by "tdbloader2"
+
+# Exit on error.
+set -e
+
+# Sort order is ASCII
+export LC_ALL="C"
+
+log() { echo " $(date $DATE)" "$@" ; }
+
+TMP=$$
+#DATE="+%Y-%m-%dT%H:%M:%S%:z"
+DATE="+%H:%M:%S"
+
+##--parallel is not always available.
+SORT_ARGS="${SORT_ARGS:---buffer-size=50%}"
+JVM_ARGS=${JVM_ARGS:--Xmx1200M}
+
+# Classpath set in "tdbloader2"
+if [ -z "$JENA_CP" ]
+then
+    echo "Classpath not provided : set JENA_CP" 1>&2
+    exit 1
+fi
+
+USAGE="Usage: tdbloader2index --loc location"
+PKG=org.apache.jena.tdb.store.bulkloader2
+
+# Parse leading options. The first unrecognised argument ends option parsing.
+while [ $# -gt 0 ]
+do
+  ARG=$1
+  case "$ARG" in
+    --loc|-loc)
+      # Location space separated. The value is checked for explicitly: under
+      # "set -e" a failing shift would end the script without any message.
+      if [ $# -lt 2 ]; then
+        echo "Option $ARG requires a value" 1>&2
+        echo "$USAGE" 1>&2
+        exit 1
+      fi
+      LOC="$2"
+      shift 2
+      ;;
+    -*loc=*)
+      # Location = separated. Only the shortest leading "...loc=" is removed,
+      # so a location that itself contains "loc=" is kept intact.
+      LOC=${ARG#*loc=}
+      shift
+      ;;
+    --help)
+      echo "$USAGE"
+      exit 0
+      ;;
+    *)
+      # Any further arguments are ignored
+      break
+      ;;
+  esac
+done
+
+# Verify arguments
+if [ -z "$LOC" ] ; then echo "No location specified" ; exit 1 ; fi
+if [ ! -e "$LOC" ] ; then echo "Location specified does not exist: $LOC" ; 
exit 1; fi
+if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; 
fi
+
+KEEPWORKFILES="${KEEPWORKFILES:-}"
+
+DATA_TRIPLES="$LOC/data-triples.tmp"
+DATA_QUADS="$LOC/data-quads.tmp"
+
+# ---- Index intermediates
+## All files are writtern S P O / G S P O columns per row but in different 
sort orders.
+log "Index Building Phase"
+
+which pv >/dev/null 2>&1
+HAS_PV=$?
+
+process_rows()
+{
+    local KEYS="$1"
+    local DATA="$2"
+    local IDX=$3
+    local WORK="$LOC/$IDX-txt"
+
+    if [ ! -s "$DATA" ]
+    then
+           return
+         fi
+
+    log "Creating Index $IDX"
+    log "  Sort $IDX"
+    if [ $HAS_PV = 0 ]; then
+      # Use pv (pipe viewer) to monitor sort progress
+      # Note that progress data will only be seen if running in the foreground
+      SIZE=$(du -k "$DATA" | cut -f 1)
+      pv -c -N data < "$DATA" | sort $SORT_ARGS -u $KEYS | pv -c -N sort -s 
$SIZE > $WORK
+    else
+      # Use sort without any progress monitoring
+      sort $SORT_ARGS -u $KEYS < "$DATA" > $WORK
+    fi
+    log "  Sort $IDX Completed"
+    log "  Build $IDX"
+    rm -f "$LOC/$IDX.dat"
+    rm -f "$LOC/$IDX.idn"
+    java -cp "$JENA_CP" "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK"
+    log "  Build $IDX Completed"
+    # Remove intermediary file.
+    if [ "$KEEPWORKFILES" != "yes" ] 
+    then
+           rm "$WORK"
+    fi
+}
+
+K1="-k 1,1"
+K2="-k 2,2"
+K3="-k 3,3"
+K4="-k 4,4"
+
+process_rows "$K1 $K2 $K3" "$DATA_TRIPLES" SPO
+
+process_rows "$K2 $K3 $K1" "$DATA_TRIPLES" POS
+
+process_rows "$K3 $K1 $K2" "$DATA_TRIPLES" OSP
+
+process_rows "$K1 $K2 $K3 $K4" "$DATA_QUADS" GSPO
+
+process_rows "$K1 $K3 $K4 $K2" "$DATA_QUADS" GPOS
+
+process_rows "$K1 $K4 $K2 $K3" "$DATA_QUADS" GOSP
+
+process_rows "$K2 $K3 $K4 $K1" "$DATA_QUADS" SPOG
+
+process_rows "$K3 $K4 $K2 $K1" "$DATA_QUADS" POSG
+
+process_rows "$K4 $K2 $K3 $K1" "$DATA_QUADS" OSPG
+
+log "Index Building Phase Completed"
+
+# ---- Clean up.
+if [ "$KEEPWORKFILES" != "yes" ] 
+then
+    rm -f "$DATA_TRIPLES" "$DATA_QUADS" 
+fi

Reply via email to