Repository: mahout Updated Branches: refs/heads/master d9e26c64d -> e24c4afb6
MAHOUT-1604, MAHOUT-1541 changes all references to position in the CLI to columns Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/e24c4afb Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/e24c4afb Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/e24c4afb Branch: refs/heads/master Commit: e24c4afb699c2930d372c701fe2de874a2a2f6c0 Parents: d9e26c6 Author: pferrel <[email protected]> Authored: Thu Sep 4 09:44:00 2014 -0700 Committer: pferrel <[email protected]> Committed: Thu Sep 4 09:55:17 2014 -0700 ---------------------------------------------------------------------- .../mahout/drivers/ItemSimilarityDriver.scala | 10 ++-- .../mahout/drivers/MahoutOptionParser.scala | 24 ++++---- .../org/apache/mahout/drivers/Schema.scala | 4 +- .../drivers/TextDelimitedReaderWriter.scala | 10 ++-- .../drivers/ItemSimilarityDriverSuite.scala | 60 ++++++++++---------- 5 files changed, 54 insertions(+), 54 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/e24c4afb/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala ---------------------------------------------------------------------- diff --git a/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala index b05b55d..0b8ded6 100644 --- a/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala +++ b/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala @@ -128,13 +128,13 @@ object ItemSimilarityDriver extends MahoutDriver { val readSchema1 = new Schema("delim" -> parser.opts("inDelim").asInstanceOf[String], "filter" -> parser.opts("filter1").asInstanceOf[String], - "rowIDPosition" -> parser.opts("rowIDPosition").asInstanceOf[Int], - "columnIDPosition" -> parser.opts("itemIDPosition").asInstanceOf[Int], - 
"filterPosition" -> parser.opts("filterPosition").asInstanceOf[Int]) + "rowIDColumn" -> parser.opts("rowIDColumn").asInstanceOf[Int], + "columnIDPosition" -> parser.opts("itemIDColumn").asInstanceOf[Int], + "filterColumn" -> parser.opts("filterColumn").asInstanceOf[Int]) reader1 = new TextDelimitedIndexedDatasetReader(readSchema1) - if ((parser.opts("filterPosition").asInstanceOf[Int] != -1 && parser.opts("filter2").asInstanceOf[String] != null) + if ((parser.opts("filterColumn").asInstanceOf[Int] != -1 && parser.opts("filter2").asInstanceOf[String] != null) || (parser.opts("input2").asInstanceOf[String] != null && !parser.opts("input2").asInstanceOf[String].isEmpty )){ // only need to change the filter used compared to readSchema1 val readSchema2 = new Schema(readSchema1) += ("filter" -> parser.opts("filter2").asInstanceOf[String]) @@ -180,7 +180,7 @@ object ItemSimilarityDriver extends MahoutDriver { datasetB - } else if (parser.opts("filterPosition").asInstanceOf[Int] != -1 + } else if (parser.opts("filterColumn").asInstanceOf[Int] != -1 && parser.opts("filter2").asInstanceOf[String] != null) { // get cross-cooccurrences interactions by using two filters on a single set of files http://git-wip-us.apache.org/repos/asf/mahout/blob/e24c4afb/spark/src/main/scala/org/apache/mahout/drivers/MahoutOptionParser.scala ---------------------------------------------------------------------- diff --git a/spark/src/main/scala/org/apache/mahout/drivers/MahoutOptionParser.scala b/spark/src/main/scala/org/apache/mahout/drivers/MahoutOptionParser.scala index 6908bd2..ad7a76b 100644 --- a/spark/src/main/scala/org/apache/mahout/drivers/MahoutOptionParser.scala +++ b/spark/src/main/scala/org/apache/mahout/drivers/MahoutOptionParser.scala @@ -41,9 +41,9 @@ object MahoutOptionParser { "filenamePattern" -> "^part-.*") final val TextDelimitedElementsOptions = immutable.HashMap[String, Any]( - "rowIDPosition" -> 0, - "itemIDPosition" -> 1, - "filterPosition" -> -1, + "rowIDColumn" -> 0, + 
"itemIDColumn" -> 1, + "filterColumn" -> -1, "filter1" -> null.asInstanceOf[String], "filter2" -> null.asInstanceOf[String], "inDelim" -> "[,\t ]") @@ -135,20 +135,20 @@ class MahoutOptionParser(programName: String) extends OptionParser[Map[String, A options + ("filter2" -> x) } text ("String (or regex) whose presence indicates a datum for the secondary item set (optional). If not present no secondary dataset is collected") - opt[Int]("rowIDPosition") abbr ("rc") action { (x, options) => - options + ("rowIDPosition" -> x) + opt[Int]("rowIDColumn") abbr ("rc") action { (x, options) => + options + ("rowIDColumn" -> x) } text ("Column number (0 based Int) containing the row ID string (optional). Default: 0") validate { x => if (x >= 0) success else failure("Option --rowIDColNum must be >= 0") } - opt[Int]("itemIDPosition") abbr ("ic") action { (x, options) => - options + ("itemIDPosition" -> x) + opt[Int]("itemIDColumn") abbr ("ic") action { (x, options) => + options + ("itemIDColumn" -> x) } text ("Column number (0 based Int) containing the item ID string (optional). Default: 1") validate { x => if (x >= 0) success else failure("Option --itemIDColNum must be >= 0") } - opt[Int]("filterPosition") abbr ("fc") action { (x, options) => - options + ("filterPosition" -> x) + opt[Int]("filterColumn") abbr ("fc") action { (x, options) => + options + ("filterColumn" -> x) } text ("Column number (0 based Int) containing the filter string (optional). 
Default: -1 for no filter") validate { x => if (x >= -1) success else failure("Option --filterColNum must be >= -1") } @@ -156,9 +156,9 @@ class MahoutOptionParser(programName: String) extends OptionParser[Map[String, A note("\nUsing all defaults the input is expected of the form: \"userID<tab>itemId\" or \"userID<tab>itemID<tab>any-text...\" and all rows will be used") checkConfig { options: Map[String, Any] => - if (options("filterPosition").asInstanceOf[Int] == options("itemIDPosition").asInstanceOf[Int] - || options("filterPosition").asInstanceOf[Int] == options("rowIDPosition").asInstanceOf[Int] - || options("rowIDPosition").asInstanceOf[Int] == options("itemIDPosition").asInstanceOf[Int]) + if (options("filterColumn").asInstanceOf[Int] == options("itemIDColumn").asInstanceOf[Int] + || options("filterColumn").asInstanceOf[Int] == options("rowIDColumn").asInstanceOf[Int] + || options("rowIDColumn").asInstanceOf[Int] == options("itemIDColumn").asInstanceOf[Int]) failure("The row, item, and filter positions must be unique.") else success } http://git-wip-us.apache.org/repos/asf/mahout/blob/e24c4afb/spark/src/main/scala/org/apache/mahout/drivers/Schema.scala ---------------------------------------------------------------------- diff --git a/spark/src/main/scala/org/apache/mahout/drivers/Schema.scala b/spark/src/main/scala/org/apache/mahout/drivers/Schema.scala index 42b2658..92163be 100644 --- a/spark/src/main/scala/org/apache/mahout/drivers/Schema.scala +++ b/spark/src/main/scala/org/apache/mahout/drivers/Schema.scala @@ -48,9 +48,9 @@ class Schema(params: Tuple2[String, Any]*) extends HashMap[String, Any] { class DefaultElementReadSchema extends Schema( "delim" -> "[,\t ]", //comma, tab or space "filter" -> "", - "rowIDPosition" -> 0, + "rowIDColumn" -> 0, "columnIDPosition" -> 1, - "filterPosition" -> -1) + "filterColumn" -> -1) /** Default Schema for text delimited drm file output * This tells the writer to write a DRM of the default form: 
http://git-wip-us.apache.org/repos/asf/mahout/blob/e24c4afb/spark/src/main/scala/org/apache/mahout/drivers/TextDelimitedReaderWriter.scala ---------------------------------------------------------------------- diff --git a/spark/src/main/scala/org/apache/mahout/drivers/TextDelimitedReaderWriter.scala b/spark/src/main/scala/org/apache/mahout/drivers/TextDelimitedReaderWriter.scala index 53a36a5..274ad98 100644 --- a/spark/src/main/scala/org/apache/mahout/drivers/TextDelimitedReaderWriter.scala +++ b/spark/src/main/scala/org/apache/mahout/drivers/TextDelimitedReaderWriter.scala @@ -43,9 +43,9 @@ trait TDIndexedDatasetReader extends Reader[IndexedDataset]{ existingRowIDs: BiMap[String, Int] = HashBiMap.create()): IndexedDataset = { try { val delimiter = readSchema("delim").asInstanceOf[String] - val rowIDPosition = readSchema("rowIDPosition").asInstanceOf[Int] + val rowIDColumn = readSchema("rowIDColumn").asInstanceOf[Int] val columnIDPosition = readSchema("columnIDPosition").asInstanceOf[Int] - val filterPosition = readSchema("filterPosition").asInstanceOf[Int] + val filterColumn = readSchema("filterColumn").asInstanceOf[Int] val filterBy = readSchema("filter").asInstanceOf[String] // instance vars must be put into locally scoped vals when used in closures that are executed but Spark @@ -57,15 +57,15 @@ trait TDIndexedDatasetReader extends Reader[IndexedDataset]{ var columns = mc.textFile(source).map { line => line.split(delimiter) } // -1 means no filter in the input text, take them all - if(filterPosition != -1) { + if(filterColumn != -1) { // get the rows that have a column matching the filter - columns = columns.filter { tokens => tokens(filterPosition) == filterBy } + columns = columns.filter { tokens => tokens(filterColumn) == filterBy } } // get row and column IDs //val m = columns.collect val interactions = columns.map { tokens => - tokens(rowIDPosition) -> tokens(columnIDPosition) + tokens(rowIDColumn) -> tokens(columnIDPosition) } interactions.cache() 
http://git-wip-us.apache.org/repos/asf/mahout/blob/e24c4afb/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala ---------------------------------------------------------------------- diff --git a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala index 0a73469..79cd6d9 100644 --- a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala +++ b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala @@ -94,9 +94,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite { "--filter1", "purchase", "--filter2", "view", "--inDelim", ",", - "--itemIDPosition", "2", - "--rowIDPosition", "0", - "--filterPosition", "1" + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1" )) */ // local multi-threaded Spark with HDFS using large dataset @@ -108,9 +108,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite { "--filter1", "purchase", "--filter2", "view", "--inDelim", ",", - "--itemIDPosition", "2", - "--rowIDPosition", "0", - "--filterPosition", "1" + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1" )) */ @@ -153,9 +153,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite { "--filter1", "purchase", "--filter2", "view", "--inDelim", ",", - "--itemIDPosition", "2", - "--rowIDPosition", "0", - "--filterPosition", "1", + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1", "--writeAllDatasets")) // todo: these comparisons rely on a sort producing the same lines, which could possibly @@ -207,9 +207,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite { "--filter1", "purchase", "--filter2", "view", "--inDelim", "[,\t]", - "--itemIDPosition", "2", - "--rowIDPosition", "0", - "--filterPosition", "1")) + "--itemIDColumn", "2", + "--rowIDColumn", 
"0", + "--filterColumn", "1")) // todo: a better test would be to get sorted vectors and compare rows instead of tokens, this might miss // some error cases @@ -259,9 +259,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite { "--filter1", "purchase", "--filter2", "view", "--inDelim", "\t", - "--itemIDPosition", "4", - "--rowIDPosition", "1", - "--filterPosition", "2")) + "--itemIDColumn", "4", + "--rowIDColumn", "1", + "--filterColumn", "2")) val indicatorLines = mahoutCtx.textFile(OutPath + "/indicator-matrix/").collect.toIterable @@ -420,9 +420,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite { "--filter1", "purchase", "--filter2", "view", "--inDelim", "\t", - "--itemIDPosition", "2", - "--rowIDPosition", "0", - "--filterPosition", "1", + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1", "--filenamePattern", "m..tsv", "--recursive")) @@ -475,9 +475,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite { "--filter1", "purchase", "--filter2", "view", "--inDelim", ",", - "--itemIDPosition", "2", - "--rowIDPosition", "0", - "--filterPosition", "1")) + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1")) val indicatorLines = mahoutCtx.textFile(OutPath + "/indicator-matrix/").collect.toIterable tokenize(indicatorLines) should contain theSameElementsAs SelfSimilairtyTokens @@ -541,9 +541,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite { "--filter1", "purchase", "--filter2", "view", "--inDelim", ",", - "--itemIDPosition", "2", - "--rowIDPosition", "0", - "--filterPosition", "1")) + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1")) val indicatorLines = mahoutCtx.textFile(OutPath + "/indicator-matrix/").collect.toIterable val crossIndicatorLines = mahoutCtx.textFile(OutPath + "/cross-indicator-matrix/").collect.toIterable @@ -603,9 +603,9 @@ class ItemSimilarityDriverSuite extends FunSuite 
with DistributedSparkSuite { "--filter1", "purchase", "--filter2", "view", "--inDelim", ",", - "--itemIDPosition", "2", - "--rowIDPosition", "0", - "--filterPosition", "1", + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1", "--writeAllDatasets")) val indicatorLines = mahoutCtx.textFile(OutPath + "/indicator-matrix/").collect.toIterable @@ -697,9 +697,9 @@ removed ==> u3 0 0 1 0 "--filter1", "purchase", "--filter2", "view", "--inDelim", ",", - "--itemIDPosition", "2", - "--rowIDPosition", "0", - "--filterPosition", "1", + "--itemIDColumn", "2", + "--rowIDColumn", "0", + "--filterColumn", "1", "--writeAllDatasets")) val indicatorLines = mahoutCtx.textFile(OutPath + "/indicator-matrix/").collect.toIterable
