Author: ogrisel
Date: Tue Sep 13 17:54:08 2011
New Revision: 1170271

URL: http://svn.apache.org/viewvc?rev=1170271&view=rev
Log:
STANBOL-323: sample script to fetch the resources to index

Added:
    incubator/stanbol/trunk/entityhub/indexing/dbpedia/fetch_prepare.sh   (with props)

Added: incubator/stanbol/trunk/entityhub/indexing/dbpedia/fetch_prepare.sh
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/dbpedia/fetch_prepare.sh?rev=1170271&view=auto
==============================================================================
--- incubator/stanbol/trunk/entityhub/indexing/dbpedia/fetch_prepare.sh (added)
+++ incubator/stanbol/trunk/entityhub/indexing/dbpedia/fetch_prepare.sh Tue Sep 13 17:54:08 2011
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+
+INDEXING_JAR=`pwd`/target/org.apache.stanbol.entityhub.indexing.dbpedia-*-jar-with-dependencies.jar
+WORKSPACE=/tmp/dbpedia-index
+DBPEDIA=http://downloads.dbpedia.org/3.7
+MAX_SORT_MEM=2G
+
+# Turn on echoing
+set -x
+
+# Ensure that the workspace exists
+mkdir -p $WORKSPACE
+
+# Create the folder structure under the workspace folder
+cd $WORKSPACE
+java -jar $INDEXING_JAR init
+
+# Rank entities by popularity by counting the number of incoming links in the
+# wikipedia graph: computing this takes around 2 hours
+if [ ! -f $WORKSPACE/indexing/resources/incoming_links.txt ]
+then
+    curl $DBPEDIA/en/page_links_en.nt.bz2 \
+        | bzcat \
+        | sed -e 's/.*<http\:\/\/dbpedia\.org\/resource\/\([^>]*\)> ./\1/' \
+        | sort -S $MAX_SORT_MEM \
+        | uniq -c  \
+        | sort -nr -S $MAX_SORT_MEM > 
$WORKSPACE/indexing/resources/incoming_links.txt
+fi
+
+# Download the RDF dumps:
+cd $WORKSPACE/indexing/resources/rdfdata
+
+# General attributes for all entities
+wget -c $DBPEDIA/dbpedia_3.7.owl.bz2
+wget -c $DBPEDIA/en/instance_types_en.nt.bz2
+wget -c $DBPEDIA/ar/labels_ar.nt.bz2
+wget -c $DBPEDIA/de/labels_de.nt.bz2
+wget -c $DBPEDIA/en/labels_en.nt.bz2
+wget -c $DBPEDIA/fr/labels_fr.nt.bz2
+wget -c $DBPEDIA/it/labels_it.nt.bz2
+wget -c $DBPEDIA/ja/labels_ja.nt.bz2
+wget -c $DBPEDIA/zh/labels_zh.nt.bz2
+#wget -c $DBPEDIA/en/short_abstracts_en.nt.bz2
+wget -c $DBPEDIA/en/long_abstracts_en.nt.bz2
+
+# Type specific attributes
+wget -c $DBPEDIA/en/geo_coordinates_en.nt.bz2
+wget -c $DBPEDIA/en/persondata_en.nt.bz2
+
+# Category information
+#wget -c $DBPEDIA/en/category_labels_en.nt.bz2
+#wget -c $DBPEDIA/en/skos_categories_en.nt.bz2
+#wget -c $DBPEDIA/en/article_categories_en.nt.bz2
+
+# Redirects
+wget -c $DBPEDIA/en/redirects_en.nt.bz2
+
+set +x
+
+# Instruction to launch the indexing
+echo "Preparation & data fetch done: edit config in 
$WORKSPACE/indexing/config/"
+echo "Then launch indexing command:"
+echo "(cd $WORKSPACE && java -jar $INDEXING_JAR index)"
+

Propchange: incubator/stanbol/trunk/entityhub/indexing/dbpedia/fetch_prepare.sh
------------------------------------------------------------------------------
    svn:executable = *


Reply via email to