[
https://issues.apache.org/jira/browse/FLINK-1719?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14908156#comment-14908156
]
ASF GitHub Bot commented on FLINK-1719:
---------------------------------------
Github user sachingoel0101 commented on a diff in the pull request:
https://github.com/apache/flink/pull/1156#discussion_r40441936
--- Diff:
flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/classification/MultinomialNaiveBayes.scala
---
@@ -0,0 +1,900 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.ml.classification
+
+import java.{lang, util}
+
+import org.apache.flink.api.common.functions._
+import org.apache.flink.api.scala._
+import org.apache.flink.configuration.Configuration
+import org.apache.flink.core.fs.FileSystem.WriteMode
+import org.apache.flink.ml.common.{ParameterMap, Parameter}
+import org.apache.flink.ml.pipeline.{PredictDataSetOperation,
FitOperation, Predictor}
+import org.apache.flink.util.Collector
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+import scala.collection.mutable.ListBuffer
+import scala.collection.mutable.Map
+
+/**
+ * While building the model different approaches need to be compared.
+ * For that purpose the fitParameters are used. Every possibility that
might enhance
+ * the implementation can be chosen separately by using the following list
of parameters:
+ *
+ * Possibility 1: way of calculating document count
+ * P1 = 0 -> use .count() to get count of all documents
+ * P1 = 1 -> use a reducer and a mapper to create a broadcast data set
containing the count of
+ * all documents
+ *
+ * Possibility 2: all words in class (order of operators)
+ * If p2 = 1 improves the speed, many other calculations must switch
their operators, too.
+ * P2 = 0 -> first the reducer, then the mapper
+ * P2 = 1 -> first the mapper, then the reducer
+ *
+ * Possibility 3: way of calculating pwc
+ * P3 = 0 -> join singleWordsInClass and allWordsInClass to wordsInClass
data set
+ * P3 = 1 -> work on singleWordsInClass data set and broadcast
allWordsInClass data set
+ *
+ * Schneider/Rennie 1: ignore/reduce word frequency information
+ * SR1 = 0 -> word frequency information is not ignored
+ * SR1 = 1 -> word frequency information is ignored (Schneider's approach)
+ * SR1 = 2 -> word frequency information is reduced (Rennie's approach)
+ *
+ * Schneider1: ignore P(c_j) in cMAP formula
+ * S1 = 0 -> normal cMAP formula
+ * S1 = 1 -> cMAP without P(c_j)
+ *
+ * Rennie1: transform document frequency
+ * R1 = 0 -> normal formula
+ * R1 = 1 -> apply inverse document frequency
+ * Note: if R1 = 1 and SR1 = 2, both approaches get applied.
+ *
+ */
+class MultinomialNaiveBayes extends Predictor[MultinomialNaiveBayes] {
+
+ import MultinomialNaiveBayes._
+
+ //The model, that stores all needed information that is related to one
specific word
+ var wordRelatedModelData: Option[DataSet[(String, String, Double)]] =
+ None // (class name -> word -> log P(w|c))
+
+ //The model, that stores all needed information that is related to one
specific class
+ var classRelatedModelData: Option[DataSet[(String, Double, Double)]] =
+ None // (class name -> p(c) -> log p(w|c) not in class)
+
+ //A data set that stores additional needed information for some of the
improvements
+ var improvementData: Option[DataSet[(String, Double)]] =
+ None // (word -> log number of documents in all classes / word
frequency in all classes)
+
+ // ============================== Parameter configuration
========================================
+
+ def setP1(value: Int): MultinomialNaiveBayes = {
+ parameters.add(P1, value)
+ this
+ }
+
+ def setP2(value: Int): MultinomialNaiveBayes = {
+ parameters.add(P2, value)
+ this
+ }
+
+ def setP3(value: Int): MultinomialNaiveBayes = {
+ parameters.add(P3, value)
+ this
+ }
+
+ def setSR1(value: Int): MultinomialNaiveBayes = {
+ parameters.add(SR1, value)
+ this
+ }
+
+ def setS1(value: Int): MultinomialNaiveBayes = {
+ parameters.add(S1, value)
+ this
+ }
+
+ def setR1(value: Int): MultinomialNaiveBayes = {
+ parameters.add(R1, value)
+ this
+ }
+
+ // =============================================== Methods
=======================================
+
+ /**
+ * Save already existing model data created by the NaiveBayes algorithm.
Requires the designated
+ * locations. The saved data is a representation of the
[[wordRelatedModelData]] and
+ * [[classRelatedModelData]].
+ * @param wordRelated, the save location for the wordRelated data
+ * @param classRelated, the save location for the classRelated data
+ */
+ def saveModelDataSet(wordRelated: String, classRelated: String) : Unit =
{
+ wordRelatedModelData.get.writeAsCsv(wordRelated, "\n", "|",
WriteMode.OVERWRITE)
+ classRelatedModelData.get.writeAsCsv(classRelated, "\n", "|",
WriteMode.OVERWRITE)
+ }
+
+ /**
+ * Save the improvement data set. Requires the designated save location.
The saved data is a
+ * representation of the [[improvementData]] data set.
+ * @param path, the save location for the improvement data
+ */
+ def saveImprovementDataSet(path: String) : Unit = {
+ improvementData.get.writeAsCsv(path, "\n", "|", WriteMode.OVERWRITE)
+ }
+
+ /**
+ * Sets the [[wordRelatedModelData]] and the [[classRelatedModelData]]
to the given data sets.
+ * @param wordRelated, the data set representing the wordRelated model
+ * @param classRelated, the data set representing the classRelated model
+ */
+ def setModelDataSet(wordRelated : DataSet[(String, String, Double)],
+ classRelated: DataSet[(String, Double, Double)]) :
Unit = {
+ this.wordRelatedModelData = Some(wordRelated)
+ this.classRelatedModelData = Some(classRelated)
+ }
+
+ def setImprovementDataSet(impSet : DataSet[(String, Double)]) : Unit = {
+ this.improvementData = Some(impSet)
+ }
+
+}
+
+object MultinomialNaiveBayes {
+
+ // ========================================== Parameters
=========================================
+ case object P1 extends Parameter[Int] {
+ override val defaultValue: Option[Int] = Some(0)
+ }
+
+ case object P2 extends Parameter[Int] {
+ override val defaultValue: Option[Int] = Some(0)
+ }
+
+ case object P3 extends Parameter[Int] {
+ override val defaultValue: Option[Int] = Some(0)
+ }
+
+ case object SR1 extends Parameter[Int] {
+ override val defaultValue: Option[Int] = Some(0)
+ }
+
+ case object S1 extends Parameter[Int] {
+ override val defaultValue: Option[Int] = Some(0)
+ }
+
+ case object R1 extends Parameter[Int] {
+ override val defaultValue: Option[Int] = Some(0)
+ }
+
+ // ======================================== Factory Methods
======================================
+ def apply(): MultinomialNaiveBayes = {
+ new MultinomialNaiveBayes()
+ }
+
+ // ====================================== Operations
=============================================
+ /**
+ * Trains the models to fit the training data. The resulting
+ * [[MultinomialNaiveBayes.wordRelatedModelData]] and
+ * [[MultinomialNaiveBayes.classRelatedModelData]] are stored in the
[[MultinomialNaiveBayes]]
+ * instance.
+ */
+
+ implicit val fitNNB = new FitOperation[MultinomialNaiveBayes, (String,
String)] {
+ /**
+ * The [[FitOperation]] used to create the model. Requires an instance
of
+ * [[MultinomialNaiveBayes]], a [[ParameterMap]] and the input data
set. This data set
+ * maps (string -> string) containing (label -> text, words separated
by ",")
+ * @param instance of [[MultinomialNaiveBayes]]
+ * @param fitParameters, additional parameters
+ * @param input, the data set to be processed
+ */
+ override def fit(instance: MultinomialNaiveBayes,
+ fitParameters: ParameterMap,
+ input: DataSet[(String, String)]): Unit = {
+
+ val resultingParameters = instance.parameters ++ fitParameters
+
+ //Count the amount of documents for each class.
+ // 1. Map: replace the document text by a 1
+ // 2. Group-Reduce: sum the 1s by class
+ val documentsPerClass: DataSet[(String, Int)] = input.map { input =>
(input._1, 1)}
--- End diff --
move the map to next line.
> Add naive Bayes classification algorithm to machine learning library
> --------------------------------------------------------------------
>
> Key: FLINK-1719
> URL: https://issues.apache.org/jira/browse/FLINK-1719
> Project: Flink
> Issue Type: New Feature
> Components: Machine Learning Library
> Reporter: Till Rohrmann
> Assignee: Jonathan Hasenburg
> Labels: ML
>
> Add naive Bayes algorithm to Flink's machine learning library as a basic
> classification algorithm. Maybe we can incorporate some of the improvements
> developed by [Karl-Michael
> Schneider|http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.59.2085&rep=rep1&type=pdf],
> [Sang-Bum Kim et
> al.|http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=1704799] or
> [Jason Rennie et
> al.|http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf] into the
> implementation.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)