Author: koji
Date: Thu Jul 24 07:54:52 2014
New Revision: 1613030

URL: http://svn.apache.org/r1613030
Log:
add another kmeans program by using Apache Spark

Added:
    labs/alike/trunk/build.sbt
    labs/alike/trunk/demo/conf.xml
      - copied unchanged from r1612742, labs/alike/trunk/demo/demo-conf.xml
    labs/alike/trunk/project/   (with props)
    labs/alike/trunk/src/main/
    labs/alike/trunk/src/main/scala/
    labs/alike/trunk/src/main/scala/org/
    labs/alike/trunk/src/main/scala/org/apache/
    labs/alike/trunk/src/main/scala/org/apache/alike/
    
labs/alike/trunk/src/main/scala/org/apache/alike/KMeansClusteringExecutor.scala
Removed:
    labs/alike/trunk/demo/demo-conf.xml
Modified:
    labs/alike/trunk/README.txt
    labs/alike/trunk/build.properties
    labs/alike/trunk/build.xml
    labs/alike/trunk/demo/   (props changed)
    labs/alike/trunk/demo/README.txt
    labs/alike/trunk/demo/build.xml
    labs/alike/trunk/src/java/org/apache/alike/AlikeConfig.java
    labs/alike/trunk/src/java/org/apache/alike/KMeansLauncher.java

Modified: labs/alike/trunk/README.txt
URL: 
http://svn.apache.org/viewvc/labs/alike/trunk/README.txt?rev=1613030&r1=1613029&r2=1613030&view=diff
==============================================================================
--- labs/alike/trunk/README.txt (original)
+++ labs/alike/trunk/README.txt Thu Jul 24 07:54:52 2014
@@ -10,8 +10,8 @@ Alike is a framework for searching simil
 
 Prerequisites
 -------------
-As alike is written by Python and Java, these runtime environments are needed.
-Ant and Ivy are needed to build alike, too.
+As alike is written in Python, Scala and Java, these runtime environments are 
needed.
+Ant, Ivy and sbt are needed to build alike, too.
 
 OpenCV should be preinstalled with Python Client API. Just for reference, I 
did the
 following procedure to install OpenCV and Python in my Mac OS X 10.6.8 (Snow 
Leopard).
@@ -28,8 +28,8 @@ The procedure may vary depending on your
 is the first wall to use alike, sharing your successful procedures will be 
great contribution
 to community!
 
-The rest of alike dependencies, such as Mahout and Lucene, are available for 
download via
-ivy:retrieve ant task.
+The rest of alike dependencies, such as Mahout, Spark and Lucene, are 
available for download via
+ivy:retrieve ant task or sbt compile/package.
 
 Solrpy is needed if you want to try the demo. Just for your reference, I did 
the following
 to install solrpy on my Mac OS X 10.6.8.

Modified: labs/alike/trunk/build.properties
URL: 
http://svn.apache.org/viewvc/labs/alike/trunk/build.properties?rev=1613030&r1=1613029&r2=1613030&view=diff
==============================================================================
--- labs/alike/trunk/build.properties (original)
+++ labs/alike/trunk/build.properties Thu Jul 24 07:54:52 2014
@@ -10,7 +10,7 @@ test.cls.dir           = test-classes
 test.result.dir        = test-result
 tools.dir              = tools
 prettify.dir           = ${tools.dir}/prettify
-samples.dir            = samples
+demo.dir               = demo
 d3.js                  = d3.v2.min.js
 product.jar            = ${PRODUCT_NAME}-${PRODUCT_VERSION}.jar
 product.job            = ${PRODUCT_NAME}-${PRODUCT_VERSION}.job

Added: labs/alike/trunk/build.sbt
URL: 
http://svn.apache.org/viewvc/labs/alike/trunk/build.sbt?rev=1613030&view=auto
==============================================================================
--- labs/alike/trunk/build.sbt (added)
+++ labs/alike/trunk/build.sbt Thu Jul 24 07:54:52 2014
@@ -0,0 +1,18 @@
+organization := "org.apache.alike"
+
+name := "alike spark"
+
+version := "0.2"
+
+//scalaVersion := "2.11.1"
+crossScalaVersions := Seq("2.10.4", "2.11.1")  // full versions needed: a bare "2.10" cannot be resolved as scala-library
+
+libraryDependencies += "org.apache.spark" %% "spark-core" % "1.0.1"
+
+libraryDependencies += "org.apache.spark" %% "spark-mllib" % "1.0.1"
+
+managedClasspath in Compile += file("classes")  // NOTE(review): presumably the Ant build's class output dir -- confirm
+
+mainClass := Some("org.apache.alike.KMeansClusteringExecutor")
+
+resolvers += "Akka Repository" at "http://repo.akka.io/releases/"

Modified: labs/alike/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/labs/alike/trunk/build.xml?rev=1613030&r1=1613029&r2=1613030&view=diff
==============================================================================
--- labs/alike/trunk/build.xml (original)
+++ labs/alike/trunk/build.xml Thu Jul 24 07:54:52 2014
@@ -24,7 +24,7 @@
     <property file="${mybase.dir}/build.properties"/>
 
     <path id="common.path.lib">
-        <fileset dir="${lib.dir}" includes="*.jar" 
excludes="lucene-*-3.6.0.jar"/>
+        <fileset dir="${lib.dir}" includes="*.jar" 
excludes="lucene-*-4.9.0.jar"/> <!-- excludes the Lucene 4.9.0 jars -->
     </path>
 
     <!-- ================================================================== -->
@@ -49,7 +49,7 @@
     <target name="alike-javadoc" description="build alike javadoc">
         <property name="javadoc.link.java" 
value="http://docs.oracle.com/javase/jp/6/api/"/>
         <property name="javadoc.link.mahout" 
value="https://builds.apache.org/job/Mahout-Quality/javadoc/"/>
-        <property name="javadoc.link.lucene.core" 
value="http://lucene.apache.org/core/4_0_0/core/"/>
+        <property name="javadoc.link.lucene.core" 
value="http://lucene.apache.org/core/4_9_0/core/"/>
         <delete dir="${javadoc.dir}"/>
         <mkdir dir="${javadoc.dir}"/>
         <copy todir="${javadoc.dir}/prettify">
@@ -74,13 +74,13 @@
           windowtitle="${PRODUCT_NAME} ${PRODUCT_VERSION} API"
           doctitle="${PRODUCT_NAME} ${PRODUCT_VERSION} API"
           stylesheetfile="${javadoc.dir}/prettify/stylesheet+prettify.css"
-          bottom="Copyright &amp;copy; 2012 &lt;a 
href=&quot;http://apache.org/&quot;&gt;Apache Software Foundation&lt;/a&gt; All 
Rights Reserved.">
+          bottom="Copyright &amp;copy; 2012-2014 &lt;a 
href=&quot;http://apache.org/&quot;&gt;Apache Software Foundation&lt;/a&gt; All 
Rights Reserved.">
             <classpath refid="common.path.lib"/>
             <link href="${javadoc.link.java}"/>
             <link href="${javadoc.link.mahout}"/>
             <link href="${javadoc.link.lucene.core}"/>
             <header><![CDATA[
-<a href="http://labs.apache.org/labs.html">Apache alike Copyright (c) 2012 
Apache Software Foundation</a>
+<a href="http://labs.apache.org/labs.html">Apache alike Copyright (c) 
2012-2014 Apache Software Foundation</a>
 <script src="{@docRoot}/prettify/prettify.js" type="text/javascript"></script>
 <script 
language="JavaScript">window.onload=function(){windowTitle();prettyPrint();}</script>
             ]]></header>
@@ -92,12 +92,15 @@
         <mkdir dir="${job.dir}"/>
         <copy todir="${job.dir}">
             <fileset dir="${cls.dir}"/>
+            <fileset dir="${demo.dir}">
+              <include name="conf.xml"/>
+            </fileset>
        </copy>
         <unjar dest="${job.dir}">
             <fileset dir="${lib.dir}">
               <include name="**/*.jar"/>
               <exclude name="hadoop-*.jar"/>
-              <exclude name="lucene-*-3.6.0.jar"/>
+              <exclude name="lucene-*-4.9.0.jar"/>
               <exclude name="*-javadoc.jar"/>
               <exclude name="*-sources.jar"/>
             </fileset>

Propchange: labs/alike/trunk/demo/
------------------------------------------------------------------------------
--- svn:ignore (original)
+++ svn:ignore Thu Jul 24 07:54:52 2014
@@ -11,3 +11,4 @@ solr-demo-data.xml
 .input-vectors.crc
 hadoop-conf
 desc
+all-vectors.txt

Modified: labs/alike/trunk/demo/README.txt
URL: 
http://svn.apache.org/viewvc/labs/alike/trunk/demo/README.txt?rev=1613030&r1=1613029&r2=1613030&view=diff
==============================================================================
--- labs/alike/trunk/demo/README.txt (original)
+++ labs/alike/trunk/demo/README.txt Thu Jul 24 07:54:52 2014
@@ -18,6 +18,7 @@
 1. build alike (in the parent directory of this directory)
 
    $ ant
+   $ sbt clean package
 
 2. change to this directory
 
@@ -67,7 +68,7 @@
    $ python run_desc_extractor.py
 
 7. if you want to run mahout on hadoop distributed environment, do the 
following,
-   otherwise, go to 8.
+   otherwise, go to 9.
 
    (1) install Hadoop, set HADOOP_HOME environment variable
    (2) copy $HADOOP_HOME/conf to demo/hadoop-conf
@@ -80,17 +81,32 @@
    (5) set HADOOP_CONF_DIR environment variable to demo/hadoop-conf
    (6) execute hadoop namenode -format and start-all.sh
 
-8. run clustering and vector quantization programs
+8. provide conf.xml file. As there is a sample conf file "conf.xml.sample", 
copy it to conf.xml
+   and modify conf.xml appropriately.
 
-   $ ant piv
+   $ cp conf.xml.sample conf.xml
+   $ vi conf.xml
+
+9. run clustering and vector quantization programs. You can choose either (1) 
Mahout or (2) Spark
+   for clustering
 
+   #
+   # (1) If you prefer Mahout
+   #
+   $ ant piv
    # kmeans may take tens of minutes
    $ ant kmeans
-
    $ ant clusterdump
+
+   #
+   # (2) If you prefer Spark (um, it takes 4 and a half hours on my MacBook)
+   #
+   $ spark-submit --jars ../apache-alike-0.2.job --class 
org.apache.alike.KMeansClusteringExecutor 
../target/scala-2.10/alike-spark_2.10-0.2.jar
+
+   # After (1) or (2), do the following for vector quantization
    $ ant qv
 
-9. goto Solr site, download Solr 4.9.0 or superior and unzip
+10. goto Solr site, download Solr 4.9.0 or superior and unzip
 
    # Apache Solr web site and download solr-4.9.0.tgz or superior
    http://lucene.apache.org/solr/
@@ -98,17 +114,17 @@
    # unzip
    $ tar xvzf solr-4.9.0.tgz
 
-10. startup Solr server
+11. startup Solr server
 
    $ cd solr-4.9.0/example
    $ java -Dsolr.solr.home=../../solrhome -jar start.jar
 
-11. index demo vector quantization data
+12. index demo vector quantization data
 
    $ ./post.sh solr-demo-data.xml
 
-12. startup demo web server
+13. startup demo web server
 
    $ python demoserver.py
 
-13. access to http://localhost:8080/ in your web browser and enjoy the demo!
+14. access to http://localhost:8080/ in your web browser and enjoy the demo!

Modified: labs/alike/trunk/demo/build.xml
URL: 
http://svn.apache.org/viewvc/labs/alike/trunk/demo/build.xml?rev=1613030&r1=1613029&r2=1613030&view=diff
==============================================================================
--- labs/alike/trunk/demo/build.xml (original)
+++ labs/alike/trunk/demo/build.xml Thu Jul 24 07:54:52 2014
@@ -21,7 +21,7 @@
          default="piv" basedir="..">
 
     <import file="../build.xml"/>
-    <property name="conf" value="demo-conf.xml"/>
+    <property name="conf" value="conf.xml"/>
 
     <target name="check-hadoop">
         <available file="demo/hadoop-conf" type="dir" 
property="hadoop.conf.dir" value="demo/hadoop-conf"/>

Propchange: labs/alike/trunk/project/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Thu Jul 24 07:54:52 2014
@@ -0,0 +1 @@
+target

Modified: labs/alike/trunk/src/java/org/apache/alike/AlikeConfig.java
URL: 
http://svn.apache.org/viewvc/labs/alike/trunk/src/java/org/apache/alike/AlikeConfig.java?rev=1613030&r1=1613029&r2=1613030&view=diff
==============================================================================
--- labs/alike/trunk/src/java/org/apache/alike/AlikeConfig.java (original)
+++ labs/alike/trunk/src/java/org/apache/alike/AlikeConfig.java Thu Jul 24 
07:54:52 2014
@@ -17,8 +17,8 @@
 
 package org.apache.alike;
 
+import java.io.InputStream;
 import java.lang.reflect.Constructor;
-import java.lang.reflect.InvocationTargetException;
 
 import javax.xml.xpath.XPath;
 import javax.xml.xpath.XPathConstants;
@@ -33,7 +33,15 @@ public final class AlikeConfig {
   private final InputSource is;
   
   public AlikeConfig(String confFile){
-    is = new InputSource(confFile);
+    this(new InputSource(confFile));
+  }
+  
+  public AlikeConfig(InputStream inSt){ // builds the config from an already-open stream
+    this(new InputSource(inSt)); // wrap the stream and delegate to the InputSource constructor
+  }
+  
+  private AlikeConfig(InputSource is){ // common initialisation the public constructors delegate to
+    this.is = is;
     xpath = XPathFactory.newInstance().newXPath(); // evaluator for the XPath lookups
   }
   
@@ -72,10 +80,14 @@ public final class AlikeConfig {
     return getStringValue("/config/clustering/outDir/text()");
   }
   
-  public String getClusterMaxIter(){
+  public String getClusterMaxIterStr(){
     return 
getStringValue("/config/clustering/cluster[@method='kmeans']/param[@name='maxIter']/text()");
   }
   
+  public int getClusterMaxIter(){ // integer form of getClusterMaxIterStr()
+    return Integer.parseInt(getClusterMaxIterStr()); // NumberFormatException if the configured value is not an integer
+  }
+  
   public String getClusterConvergenceDelta(){
     return 
getStringValue("/config/clustering/cluster[@method='kmeans']/param[@name='cd']/text()");
   }

Modified: labs/alike/trunk/src/java/org/apache/alike/KMeansLauncher.java
URL: 
http://svn.apache.org/viewvc/labs/alike/trunk/src/java/org/apache/alike/KMeansLauncher.java?rev=1613030&r1=1613029&r2=1613030&view=diff
==============================================================================
--- labs/alike/trunk/src/java/org/apache/alike/KMeansLauncher.java (original)
+++ labs/alike/trunk/src/java/org/apache/alike/KMeansLauncher.java Thu Jul 24 
07:54:52 2014
@@ -50,7 +50,7 @@ public final class KMeansLauncher {
     arguments[6] = "-k";
     arguments[7] = config.getNumOfClustersStr();
     arguments[8] = "--maxIter";
-    arguments[9] = config.getClusterMaxIter();
+    arguments[9] = config.getClusterMaxIterStr();
     arguments[10] = "-cd";
     arguments[11] = config.getClusterConvergenceDelta();
     

Added: 
labs/alike/trunk/src/main/scala/org/apache/alike/KMeansClusteringExecutor.scala
URL: 
http://svn.apache.org/viewvc/labs/alike/trunk/src/main/scala/org/apache/alike/KMeansClusteringExecutor.scala?rev=1613030&view=auto
==============================================================================
--- 
labs/alike/trunk/src/main/scala/org/apache/alike/KMeansClusteringExecutor.scala 
(added)
+++ 
labs/alike/trunk/src/main/scala/org/apache/alike/KMeansClusteringExecutor.scala 
Thu Jul 24 07:54:52 2014
@@ -0,0 +1,50 @@
+package org.apache.alike
+
+import java.io.File
+import java.io.InputStream
+import java.io.PrintWriter
+import org.apache.alike.FileUtil.Executor
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.clustering.KMeans
+import org.apache.spark.mllib.linalg.Vectors
+import scala.io.Source
+
+
+/** Executor that appends the data lines of each ".txt" file it visits to "all-vectors.txt". */
+class MyExec extends Executor {
+  val writer = new PrintWriter("all-vectors.txt")
+  override def isExecutable(theFile: File) : Boolean = theFile.getName.endsWith(".txt")
+  /** Copies every line after the two header lines (filename, count); shorter files add nothing. */
+  def execute(theFile : File){
+    val source = Source.fromFile(theFile)
+    try source.getLines.drop(2).foreach(line => writer.println(line)) finally source.close()
+  }
+  /** Closes the combined output file; call once after all files have been processed. */
+  def close() { writer.close() }
+}
+
+/** Spark MLlib k-means driver: merges descriptor vectors, trains, writes the cluster centers. */
+object KMeansClusteringExecutor {
+  /** Settings come from "conf.xml" on the classpath; `args` is not used. */
+  def main(args: Array[String]){
+    val is = this.getClass.getClassLoader.getResourceAsStream("conf.xml")
+    require(is != null, "conf.xml not found on the classpath")  // fail fast with a clear message
+    val config = new AlikeConfig(is)
+    val inDir = config.getDescNormalFSDir
+    val numClusters = config.getNumOfClusters
+    val maxIter = config.getClusterMaxIter
+    val resultFile = config.getClusterDumpFile
+
+    val exec = new MyExec  // gathers every vector line under inDir into all-vectors.txt
+    try FileUtil.executeRecursively(exec, inDir) finally exec.close()
+
+    val sc = new SparkContext("local", "KMeansClusteringExecutor")
+    val centers = try {  // stop the context even when parsing or training fails
+      val vectors = sc.textFile("all-vectors.txt").map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
+      KMeans.train(vectors, numClusters, maxIter).clusterCenters
+    } finally sc.stop()
+
+    val writer = new PrintWriter(resultFile)  // one center per line
+    try centers.foreach(center => writer.println(center)) finally writer.close()
+  }
+}



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@labs.apache.org
For additional commands, e-mail: commits-h...@labs.apache.org

Reply via email to