Author: koji Date: Thu Jul 24 07:54:52 2014 New Revision: 1613030 URL: http://svn.apache.org/r1613030 Log: add another kmeans program by using Apache Spark
Added: labs/alike/trunk/build.sbt labs/alike/trunk/demo/conf.xml - copied unchanged from r1612742, labs/alike/trunk/demo/demo-conf.xml labs/alike/trunk/project/ (with props) labs/alike/trunk/src/main/ labs/alike/trunk/src/main/scala/ labs/alike/trunk/src/main/scala/org/ labs/alike/trunk/src/main/scala/org/apache/ labs/alike/trunk/src/main/scala/org/apache/alike/ labs/alike/trunk/src/main/scala/org/apache/alike/KMeansClusteringExecutor.scala Removed: labs/alike/trunk/demo/demo-conf.xml Modified: labs/alike/trunk/README.txt labs/alike/trunk/build.properties labs/alike/trunk/build.xml labs/alike/trunk/demo/ (props changed) labs/alike/trunk/demo/README.txt labs/alike/trunk/demo/build.xml labs/alike/trunk/src/java/org/apache/alike/AlikeConfig.java labs/alike/trunk/src/java/org/apache/alike/KMeansLauncher.java Modified: labs/alike/trunk/README.txt URL: http://svn.apache.org/viewvc/labs/alike/trunk/README.txt?rev=1613030&r1=1613029&r2=1613030&view=diff ============================================================================== --- labs/alike/trunk/README.txt (original) +++ labs/alike/trunk/README.txt Thu Jul 24 07:54:52 2014 @@ -10,8 +10,8 @@ Alike is a framework for searching simil Prerequisites ------------- -As alike is written by Python and Java, these runtime environments are needed. -Ant and Ivy are needed to build alike, too. +As alike is written by Python, Scala and Java, these runtime environments are needed. +Ant, Ivy and sbt are needed to build alike, too. OpenCV should be preinstalled with Python Client API. Just for reference, I did the following procedure to install OpenCV and Python in my Mac OS X 10.6.8 (Snow Leopard). @@ -28,8 +28,8 @@ The procedure may vary depending on your is the first wall to use alike, sharing your successful procedures will be great contribution to community! -The rest of alike dependencies, such as Mahout and Lucene, are available for download via -ivy:retrieve ant task. 
+The rest of alike dependencies, such as Mahout, Spark and Lucene, are available for download via +ivy:retrieve ant task or sbt compile/package. Solrpy is needed if you want to try the demo. Just for your reference, I did the following to install solrpy on my Mac OS X 10.6.8. Modified: labs/alike/trunk/build.properties URL: http://svn.apache.org/viewvc/labs/alike/trunk/build.properties?rev=1613030&r1=1613029&r2=1613030&view=diff ============================================================================== --- labs/alike/trunk/build.properties (original) +++ labs/alike/trunk/build.properties Thu Jul 24 07:54:52 2014 @@ -10,7 +10,7 @@ test.cls.dir = test-classes test.result.dir = test-result tools.dir = tools prettify.dir = ${tools.dir}/prettify -samples.dir = samples +demo.dir = demo d3.js = d3.v2.min.js product.jar = ${PRODUCT_NAME}-${PRODUCT_VERSION}.jar product.job = ${PRODUCT_NAME}-${PRODUCT_VERSION}.job Added: labs/alike/trunk/build.sbt URL: http://svn.apache.org/viewvc/labs/alike/trunk/build.sbt?rev=1613030&view=auto ============================================================================== --- labs/alike/trunk/build.sbt (added) +++ labs/alike/trunk/build.sbt Thu Jul 24 07:54:52 2014 @@ -0,0 +1,18 @@ +organization := "org.apache.alike" + +name := "alike spark" + +version := "0.2" + +//scalaVersion := "2.11.1" +crossScalaVersions := Seq("2.10", "2.11.1") + +libraryDependencies += "org.apache.spark" %% "spark-core" % "1.0.1" + +libraryDependencies += "org.apache.spark" %% "spark-mllib" % "1.0.1" + +managedClasspath in Compile += file("classes") + +mainClass := Some("org.apache.alike.KMeansClusteringExecutor") + +resolvers += "Akka Repository" at "http://repo.akka.io/releases/" Modified: labs/alike/trunk/build.xml URL: http://svn.apache.org/viewvc/labs/alike/trunk/build.xml?rev=1613030&r1=1613029&r2=1613030&view=diff ============================================================================== --- labs/alike/trunk/build.xml (original) +++ 
labs/alike/trunk/build.xml Thu Jul 24 07:54:52 2014 @@ -24,7 +24,7 @@ <property file="${mybase.dir}/build.properties"/> <path id="common.path.lib"> - <fileset dir="${lib.dir}" includes="*.jar" excludes="lucene-*-3.6.0.jar"/> + <fileset dir="${lib.dir}" includes="*.jar" excludes="lucene-*-4.9..0.jar"/> </path> <!-- ================================================================== --> @@ -49,7 +49,7 @@ <target name="alike-javadoc" description="build alike javadoc"> <property name="javadoc.link.java" value="http://docs.oracle.com/javase/jp/6/api/"/> <property name="javadoc.link.mahout" value="https://builds.apache.org/job/Mahout-Quality/javadoc/"/> - <property name="javadoc.link.lucene.core" value="http://lucene.apache.org/core/4_0_0/core/"/> + <property name="javadoc.link.lucene.core" value="http://lucene.apache.org/core/4_9_0/core/"/> <delete dir="${javadoc.dir}"/> <mkdir dir="${javadoc.dir}"/> <copy todir="${javadoc.dir}/prettify"> @@ -74,13 +74,13 @@ windowtitle="${PRODUCT_NAME} ${PRODUCT_VERSION} API" doctitle="${PRODUCT_NAME} ${PRODUCT_VERSION} API" stylesheetfile="${javadoc.dir}/prettify/stylesheet+prettify.css" - bottom="Copyright &copy; 2012 <a href="http://apache.org/">Apache Software Foundation</a> All Rights Reserved."> + bottom="Copyright &copy; 2012-2014 <a href="http://apache.org/">Apache Software Foundation</a> All Rights Reserved."> <classpath refid="common.path.lib"/> <link href="${javadoc.link.java}"/> <link href="${javadoc.link.mahout}"/> <link href="${javadoc.link.lucene.core}"/> <header><![CDATA[ -<a href="http://labs.apache.org/labs.html">Apache alike Copyright (c) 2012 Apache Software Foundation</a> +<a href="http://labs.apache.org/labs.html">Apache alike Copyright (c) 2012-2014 Apache Software Foundation</a> <script src="{@docRoot}/prettify/prettify.js" type="text/javascript"></script> <script language="JavaScript">window.onload=function(){windowTitle();prettyPrint();}</script> ]]></header> @@ -92,12 +92,15 @@ <mkdir dir="${job.dir}"/> <copy 
todir="${job.dir}"> <fileset dir="${cls.dir}"/> + <fileset dir="${demo.dir}"> + <include name="conf.xml"/> + </fileset> </copy> <unjar dest="${job.dir}"> <fileset dir="${lib.dir}"> <include name="**/*.jar"/> <exclude name="hadoop-*.jar"/> - <exclude name="lucene-*-3.6.0.jar"/> + <exclude name="lucene-*-4.9.0.jar"/> <exclude name="*-javadoc.jar"/> <exclude name="*-sources.jar"/> </fileset> Propchange: labs/alike/trunk/demo/ ------------------------------------------------------------------------------ --- svn:ignore (original) +++ svn:ignore Thu Jul 24 07:54:52 2014 @@ -11,3 +11,4 @@ solr-demo-data.xml .input-vectors.crc hadoop-conf desc +all-vectors.txt Modified: labs/alike/trunk/demo/README.txt URL: http://svn.apache.org/viewvc/labs/alike/trunk/demo/README.txt?rev=1613030&r1=1613029&r2=1613030&view=diff ============================================================================== --- labs/alike/trunk/demo/README.txt (original) +++ labs/alike/trunk/demo/README.txt Thu Jul 24 07:54:52 2014 @@ -18,6 +18,7 @@ 1. build alike (in the parent directory of this directory) $ ant + $ sbt clean package 2. change to this directory @@ -67,7 +68,7 @@ $ python run_desc_extractor.py 7. if you want to run mahout on hadoop distributed environment, do the following, - otherwise, go to 8. + otherwise, go to 9. (1) install Hadoop, set HADOOP_HOME environment variable (2) copy $HADOOP_HOME/conf to demo/hadoop-conf @@ -80,17 +81,32 @@ (5) set HADOOP_CONF_DIR environment variable to demo/hadoop-conf (6) execute hadoop namenode -format and start-all.sh -8. run clustering and vector quantization programs +8. provide conf.xml file. As there is a sample conf file "conf.xml.sample", copy it to conf.xml + and modify conf.xml appropriately. - $ ant piv + $ cp conf.xml.sample conf.xml + $ vi conf.xml + +9. run clustering and vector quantization programs. 
You can choose use of (1) Mahout or (2) Spark + for clustering + # + # (1) If you prefer Mahout + # + $ ant piv # kmeans may take tens of minutes $ ant kmeans - $ ant clusterdump + + # + # (2) If you prefer Spark (um, it takes 4 and half hours on my MacBook) + # + $ spark-submit --jars ../apache-alike-0.2.job --class org.apache.alike.KMeansClusteringExecutor ../target/scala-2.10/alike-spark_2.10-0.2.jar + + # After (1) or (2), do the following for vector quantization $ ant qv -9. goto Solr site, download Solr 4.9.0 or superior and unzip +10. goto Solr site, download Solr 4.9.0 or superior and unzip # Apache Solr web site and download solr-4.9.0.tgz or superior http://lucene.apache.org/solr/ @@ -98,17 +114,17 @@ # unzip $ tar xvzf solr-4.9.0.tgz -10. startup Solr server +11. startup Solr server $ cd solr-4.9.0/example $ java -Dsolr.solr.home=../../solrhome -jar start.jar -11. index demo vector quantization data +12. index demo vector quantization data $ ./post.sh solr-demo-data.xml -12. startup demo web server +13. startup demo web server $ python demoserver.py -13. access to http://localhost:8080/ in your web browser and enjoy the demo! +14. access to http://localhost:8080/ in your web browser and enjoy the demo! 
Modified: labs/alike/trunk/demo/build.xml URL: http://svn.apache.org/viewvc/labs/alike/trunk/demo/build.xml?rev=1613030&r1=1613029&r2=1613030&view=diff ============================================================================== --- labs/alike/trunk/demo/build.xml (original) +++ labs/alike/trunk/demo/build.xml Thu Jul 24 07:54:52 2014 @@ -21,7 +21,7 @@ default="piv" basedir=".."> <import file="../build.xml"/> - <property name="conf" value="demo-conf.xml"/> + <property name="conf" value="conf.xml"/> <target name="check-hadoop"> <available file="demo/hadoop-conf" type="dir" property="hadoop.conf.dir" value="demo/hadoop-conf"/> Propchange: labs/alike/trunk/project/ ------------------------------------------------------------------------------ --- svn:ignore (added) +++ svn:ignore Thu Jul 24 07:54:52 2014 @@ -0,0 +1 @@ +target Modified: labs/alike/trunk/src/java/org/apache/alike/AlikeConfig.java URL: http://svn.apache.org/viewvc/labs/alike/trunk/src/java/org/apache/alike/AlikeConfig.java?rev=1613030&r1=1613029&r2=1613030&view=diff ============================================================================== --- labs/alike/trunk/src/java/org/apache/alike/AlikeConfig.java (original) +++ labs/alike/trunk/src/java/org/apache/alike/AlikeConfig.java Thu Jul 24 07:54:52 2014 @@ -17,8 +17,8 @@ package org.apache.alike; +import java.io.InputStream; import java.lang.reflect.Constructor; -import java.lang.reflect.InvocationTargetException; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; @@ -33,7 +33,15 @@ public final class AlikeConfig { private final InputSource is; public AlikeConfig(String confFile){ - is = new InputSource(confFile); + this(new InputSource(confFile)); + } + + public AlikeConfig(InputStream inSt){ + this(new InputSource(inSt)); + } + + private AlikeConfig(InputSource is){ + this.is = is; xpath = XPathFactory.newInstance().newXPath(); } @@ -72,10 +80,14 @@ public final class AlikeConfig { return 
getStringValue("/config/clustering/outDir/text()"); } - public String getClusterMaxIter(){ + public String getClusterMaxIterStr(){ return getStringValue("/config/clustering/cluster[@method='kmeans']/param[@name='maxIter']/text()"); } + public int getClusterMaxIter(){ + return Integer.parseInt(getClusterMaxIterStr()); + } + public String getClusterConvergenceDelta(){ return getStringValue("/config/clustering/cluster[@method='kmeans']/param[@name='cd']/text()"); } Modified: labs/alike/trunk/src/java/org/apache/alike/KMeansLauncher.java URL: http://svn.apache.org/viewvc/labs/alike/trunk/src/java/org/apache/alike/KMeansLauncher.java?rev=1613030&r1=1613029&r2=1613030&view=diff ============================================================================== --- labs/alike/trunk/src/java/org/apache/alike/KMeansLauncher.java (original) +++ labs/alike/trunk/src/java/org/apache/alike/KMeansLauncher.java Thu Jul 24 07:54:52 2014 @@ -50,7 +50,7 @@ public final class KMeansLauncher { arguments[6] = "-k"; arguments[7] = config.getNumOfClustersStr(); arguments[8] = "--maxIter"; - arguments[9] = config.getClusterMaxIter(); + arguments[9] = config.getClusterMaxIterStr(); arguments[10] = "-cd"; arguments[11] = config.getClusterConvergenceDelta(); Added: labs/alike/trunk/src/main/scala/org/apache/alike/KMeansClusteringExecutor.scala URL: http://svn.apache.org/viewvc/labs/alike/trunk/src/main/scala/org/apache/alike/KMeansClusteringExecutor.scala?rev=1613030&view=auto ============================================================================== --- labs/alike/trunk/src/main/scala/org/apache/alike/KMeansClusteringExecutor.scala (added) +++ labs/alike/trunk/src/main/scala/org/apache/alike/KMeansClusteringExecutor.scala Thu Jul 24 07:54:52 2014 @@ -0,0 +1,50 @@ +package org.apache.alike + +import java.io.File +import java.io.InputStream +import java.io.PrintWriter +import org.apache.alike.FileUtil.Executor +import org.apache.spark.SparkContext +import 
org.apache.spark.mllib.clustering.KMeans +import org.apache.spark.mllib.linalg.Vectors +import scala.io.Source + + +class MyExec extends Executor { + val writer = new PrintWriter("all-vectors.txt") + override def isExecutable(theFile: File) : Boolean = theFile.getName.endsWith(".txt") + def execute(theFile : File){ + val ite = Source.fromFile(theFile).getLines + ite.next(); ite.next() // skip filename and count lines + for(line <- ite){ + writer.println(line) + } + } + def close() { writer.close } +} + +object KMeansClusteringExecutor { + def main(args: Array[String]){ + val is = this.getClass.getClassLoader.getResourceAsStream("conf.xml") + val config = new AlikeConfig(is) + val inDir = config.getDescNormalFSDir + val numClusters = config.getNumOfClusters + val maxIter = config.getClusterMaxIter + val resultFile = config.getClusterDumpFile + + val exec = new MyExec + FileUtil.executeRecursively(exec, inDir) + exec.close() + + val sc = new SparkContext("local", "KMeansClusteringExecutor") + val data = sc.textFile("all-vectors.txt") + val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))) + val clusters = KMeans.train(parsedData, numClusters, maxIter) + + val writer = new PrintWriter(resultFile) + for(center <- clusters.clusterCenters){ + writer.println(center) + } + writer.close + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@labs.apache.org For additional commands, e-mail: commits-h...@labs.apache.org