Taner,

It seems that, to get tf-idf vectors later, you need to create the tf vectors
(DictionaryVectorizer.createTermFrequencyVectors) with the logNormalize option
set to false and the normPower option set to -1.0f. This applies to
HighDFWordsPruner.pruneVectors, too.
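
Reusing the fields and calls from the code you posted, that would look roughly
like this (just a sketch; if I remember right, -1.0f is
PartialVectorMerger.NO_NORMALIZING, so normalization and log normalization are
applied only once, in the tf-idf pass):

        DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, new Path(outputDir),
                DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER, conf,
                minSupport, maxNGramSize, minLLRValue,
                -1.0f,   // normPower: no normalization at the tf stage
                false,   // logNormalize: off at the tf stage
                numReducers, chunkSizeInMegabytes, sequentialAccess, namedVectors);

        Pair<Long[], List<Path>> features = TFIDFConverter.calculateDF(
                new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                new Path(outputDir), conf, chunkSizeInMegabytes);
        TFIDFConverter.processTfIdf(
                new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                new Path(outputDir), conf, features, minDf, maxDF,
                normPower, logNormalize,   // keep your 2 / true here
                sequentialAccess, false, numReducers);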

I believe that solves your problem.

Best

Gokhan


On Wed, Sep 4, 2013 at 4:54 PM, Taner Diler <taner.di...@gmail.com> wrote:

> Actually, my real motivation was to visualize the Reuters vectors the way
> the DisplayKMeans example does, then apply the same approach to the web
> content that I've collected, and additionally to discover what I can do
> with the generated tfidf vectors. But TFIDF doesn't work, and I don't see why.
>
> There is one main class that doesn't extend AbstractJob. Yes, it has a main
> method that executes all the steps. And I'm trying to implement the sample
> that is in the Mahout wiki and all over the net. In Eclipse, I've just added
> mahout-0.8-job.jar (which, as you know, includes all dependent packages) and
> hadoop-core-1.2.0.jar.
>
> import java.io.IOException;
> import java.util.ArrayList;
> import java.util.Collections;
> import java.util.Comparator;
> import java.util.HashMap;
> import java.util.List;
> import java.util.Map;
> import java.util.Set;
>
> import org.apache.hadoop.conf.Configuration;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.io.IntWritable;
> import org.apache.hadoop.io.LongWritable;
> import org.apache.hadoop.io.SequenceFile;
> import org.apache.hadoop.io.Text;
> import org.apache.hadoop.mapred.SequenceFileAsBinaryInputFormat;
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.standard.StandardAnalyzer;
> import org.apache.lucene.util.Version;
> import org.apache.mahout.clustering.Cluster;
> import org.apache.mahout.clustering.canopy.CanopyDriver;
> import org.apache.mahout.clustering.classify.WeightedVectorWritable;
> import org.apache.mahout.clustering.display.DisplayKMeans;
> import org.apache.mahout.clustering.kmeans.KMeansDriver;
> import org.apache.mahout.common.HadoopUtil;
> import org.apache.mahout.common.Pair;
> import org.apache.mahout.common.StringTuple;
> import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
> import org.apache.mahout.common.distance.TanimotoDistanceMeasure;
> import org.apache.mahout.math.Vector.Element;
> import org.apache.mahout.math.VectorWritable;
> import org.apache.mahout.vectorizer.DictionaryVectorizer;
> import org.apache.mahout.vectorizer.DocumentProcessor;
> import org.apache.mahout.vectorizer.tfidf.TFIDFConverter;
>
>
> public class MahoutReutersKMeans {
>
>     private static int minSupport = 2;
>     private static int maxNGramSize = 2;
>     private static float minLLRValue = 1;
>     private static float normPower = 2;
>     private static boolean logNormalize = true;
>     private static int numReducers = 1;
>     private static int chunkSizeInMegabytes = 200;
>     private static boolean sequentialAccess = true;
>     private static boolean namedVectors = false;
>
>     private static int minDf = 5;
>
>     private static long maxDF = 95;
>
>     /**
>      * @param args
>      * @throws IOException
>      * @throws InterruptedException
>      * @throws ClassNotFoundException
>      */
>     public static void main(String[] args) throws IOException,
> ClassNotFoundException, InterruptedException {
>
>         Configuration conf = new Configuration();
>         String HADOOP_HOME = System.getenv("HADOOP_PREFIX");
>
>         conf.addResource(new Path(HADOOP_HOME, "conf/core-site.xml"));
>         conf.addResource(new Path(HADOOP_HOME, "conf/hdfs-site.xml"));
>         conf.addResource(new Path(HADOOP_HOME, "conf/mapred-site.xml"));
>
>         FileSystem fs  = FileSystem.get(conf);
>
>         Path inputDir = new Path("reuters-seqfiles");
>         String outputDir = "reuters-kmeans-try";
>         HadoopUtil.delete(conf, new Path(outputDir));
>         StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
>         Path tokenizedPath = new Path(DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
>         DocumentProcessor.tokenizeDocuments(inputDir,
>                 analyzer.getClass().asSubclass(Analyzer.class), tokenizedPath, conf);
>
>         DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, new Path(outputDir),
>                 DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER, conf,
>                 minSupport, maxNGramSize, minLLRValue, normPower, logNormalize,
>                 numReducers, chunkSizeInMegabytes, sequentialAccess, namedVectors);
>
>         Pair<Long[], List<Path>> features = TFIDFConverter.calculateDF(
>                 new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
>                 new Path(outputDir), conf, chunkSizeInMegabytes);
>         TFIDFConverter.processTfIdf(
>                 new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
>                 new Path(outputDir), conf, features, minDf, maxDF, normPower, logNormalize,
>                 sequentialAccess, false, numReducers);
>
>
> //        Path tfidfVectorsPath = new Path(outputDir, "tfidf-vectors");
> //        Path canopyCentroidsPath = new Path(outputDir, "canopy-centroids");
> //        Path clustersPath = new Path(outputDir, "clusters");
> //
> //        CanopyDriver.run(conf, tfidfVectorsPath, canopyCentroidsPath,
> //                new EuclideanDistanceMeasure(), 250, 120, false, 0.01, false);
> //        KMeansDriver.run(conf, tfidfVectorsPath,
> //                new Path(canopyCentroidsPath, "clusters-0-final"), clustersPath,
> //                new TanimotoDistanceMeasure(), 0.01, 20, true, 0.01, false);
> //
> //        SequenceFile.Reader reader = new SequenceFile.Reader(fs,
> //                new Path("reuters-clusters/" + Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000"), conf);
> //
> //        IntWritable key = new IntWritable();
> //        WeightedVectorWritable value = new WeightedVectorWritable();
> //        while (reader.next(key, value)) {
> //            System.out.println(key.toString() + " belongs to cluster " + value.toString());
> //        }
> //        reader.close();
>
>     }
>
> }
>
>
>
>
> On Wed, Sep 4, 2013 at 3:29 PM, Gokhan Capan <gkhn...@gmail.com> wrote:
>
> > Taner,
> >
> > A few questions:
> >
> > Is there a specific reason not to consider using seq2sparse directly?
> > (You can edit seq2sparse.props to avoid passing command-line arguments
> > every time you run it, if that is the case.)
> >
> > The Java code you attached seems to do the same thing as
> > SparseVectorsFromSequenceFiles#run(String[]), which is also the method
> > called when you run seq2sparse. I'm gonna debug it anyway.
> >
> > And I would like to know how you run the Java code. Does your main class
> > extend AbstractJob to make it "runnable" using bin/mahout? And does it
> > have a main method that submits your job to your Hadoop cluster? Are you
> > using the hadoop jar command to run it?
> >
> > Best
> >
> > Gokhan
> >
> >
> > On Wed, Sep 4, 2013 at 1:15 PM, Taner Diler <taner.di...@gmail.com> wrote:
> >
> > > Suneel, samples from generated seqfiles:
> > >
> > > df-count
> > >
> > > Key: -1: Value: 21578
> > > Key: 0: Value: 43
> > > Key: 1: Value: 2
> > > Key: 2: Value: 2
> > > Key: 3: Value: 2
> > > ...
> > >
> > > tf-vectors
> > >
> > > Key class: class org.apache.hadoop.io.Text Value Class: class org.apache.mahout.math.VectorWritable
> > > Key: /reut2-000.sgm-0.txt: Value:
> > > {62:0.024521886354905213,222:0.024521886354905213,291:0.024521886354905213,1411:0.024521886354905213,1421:0.024521886354905213,1451:0.024521886354905213,1456:0.024521886354905213....
> > >
> > > wordcount/ngrams
> > >
> > > Key class: class org.apache.hadoop.io.Text Value Class: class
> > > org.apache.hadoop.io.DoubleWritable
> > > Key: 0: Value: 166.0
> > > Key: 0.003: Value: 2.0
> > > Key: 0.006913: Value: 2.0
> > > Key: 0.007050: Value: 2.0
> > >
> > > wordcount/subgrams
> > >
> > > Key class: class org.apache.mahout.vectorizer.collocations.llr.Gram Value Class: class org.apache.mahout.vectorizer.collocations.llr.Gram
> > > Key: '0 0'[n]:12: Value: '0'[h]:166
> > > Key: '0 25'[n]:2: Value: '0'[h]:166
> > > Key: '0 92'[n]:107: Value: '0'[h]:166
> > >
> > > frequency.file-0
> > >
> > > Key class: class org.apache.hadoop.io.IntWritable Value Class: class
> > > org.apache.hadoop.io.LongWritable
> > > Key: 0: Value: 43
> > > Key: 1: Value: 2
> > > Key: 2: Value: 2
> > > Key: 3: Value: 2
> > > Key: 4: Value: 9
> > > Key: 5: Value: 4
> > >
> > >
> > > dictionary.file-0
> > >
> > > Key class: class org.apache.hadoop.io.Text Value Class: class
> > > org.apache.hadoop.io.IntWritable
> > > Key: 0: Value: 0
> > > Key: 0.003: Value: 1
> > > Key: 0.006913: Value: 2
> > > Key: 0.007050: Value: 3
> > > Key: 0.01: Value: 4
> > > Key: 0.02: Value: 5
> > > Key: 0.025: Value: 6
> > >
> > >
> > >
> > >
> > >
> > > On Wed, Sep 4, 2013 at 12:45 PM, Taner Diler <taner.di...@gmail.com>
> > > wrote:
> > >
> > > > mahout seq2sparse -i reuters-seqfiles/ -o reuters-kmeans-try -chunk 200
> > > > -wt tfidf -s 2 -md 5 -x 95 -ng 2 -ml 50 -n 2 -seq
> > > >
> > > > this command works well.
> > > >
> > > > Gokhan, I changed the minLLR value to 1.0 in Java, but the result is
> > > > the same: empty tfidf-vectors.
> > > >
> > > >
> > > > On Tue, Sep 3, 2013 at 10:47 AM, Taner Diler <taner.di...@gmail.com> wrote:
> > > >
> > > >> Gokhan, I tried it from the command line and it works. I will send the
> > > >> command so you can compare the command-line parameters to the
> > > >> TFIDFConverter params.
> > > >>
> > > >> Suneel, I had checked the seqfiles. I didn't see any problem in the
> > > >> other generated seqfiles, but I will check again and send samples from
> > > >> each of them.
> > > >>
> > > >>
> > > >> On Sun, Sep 1, 2013 at 11:02 PM, Gokhan Capan <gkhn...@gmail.com> wrote:
> > > >>
> > > >>> Suneel is right indeed. I assumed that everything performed prior to
> > > >>> vector generation was done correctly.
> > > >>>
> > > >>> By the way, if the suggestions do not work, could you try running
> > > >>> seq2sparse from the command line with the same arguments and see if
> > > >>> that works well?
> > > >>>
> > > >>> On Sun, Sep 1, 2013 at 7:23 PM, Suneel Marthi <suneel_mar...@yahoo.com> wrote:
> > > >>>
> > > >>> > I would first check to see if the input 'seqfiles' for
> > > >>> > TFIDFGenerator have any meat in them.
> > > >>> > This could also happen if the input seqfiles are empty.
> > > >>>
> > > >>>
> > > >>> >
> > > >>> >
> > > >>> > ________________________________
> > > >>> >  From: Taner Diler <taner.di...@gmail.com>
> > > >>> > To: user@mahout.apache.org
> > > >>> > Sent: Sunday, September 1, 2013 2:24 AM
> > > >>> > Subject: TFIDFConverter generates empty tfidf-vectors
> > > >>> >
> > > >>> >
> > > >>> > Hi all,
> > > >>> >
> > > >>> > I try to run the Reuters KMeans example in Java, but TFIDFConverter
> > > >>> > generates empty tfidf-vectors. How can I fix that?
> > > >>> >
> > > >>> >     private static int minSupport = 2;
> > > >>> >     private static int maxNGramSize = 2;
> > > >>> >     private static float minLLRValue = 50;
> > > >>> >     private static float normPower = 2;
> > > >>> >     private static boolean logNormalize = true;
> > > >>> >     private static int numReducers = 1;
> > > >>> >     private static int chunkSizeInMegabytes = 200;
> > > >>> >     private static boolean sequentialAccess = true;
> > > >>> >     private static boolean namedVectors = false;
> > > >>> >     private static int minDf = 5;
> > > >>> >     private static long maxDF = 95;
> > > >>> >
> > > >>> >         Path inputDir = new Path("reuters-seqfiles");
> > > >>> >         String outputDir = "reuters-kmeans-try";
> > > >>> >         HadoopUtil.delete(conf, new Path(outputDir));
> > > >>> >         StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
> > > >>> >         Path tokenizedPath = new Path(DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
> > > >>> >         DocumentProcessor.tokenizeDocuments(inputDir,
> > > >>> >                 analyzer.getClass().asSubclass(Analyzer.class), tokenizedPath, conf);
> > > >>> >
> > > >>> >
> > > >>> >         DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, new Path(outputDir),
> > > >>> >                 DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER, conf,
> > > >>> >                 minSupport, maxNGramSize, minLLRValue, normPower, logNormalize,
> > > >>> >                 numReducers, chunkSizeInMegabytes, sequentialAccess, namedVectors);
> > > >>> >
> > > >>> >
> > > >>> >         Pair<Long[], List<Path>> features = TFIDFConverter.calculateDF(
> > > >>> >                 new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
> > > >>> >                 new Path(outputDir), conf, chunkSizeInMegabytes);
> > > >>> >         TFIDFConverter.processTfIdf(
> > > >>> >                 new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
> > > >>> >                 new Path(outputDir), conf, features, minDf, maxDF, normPower, logNormalize,
> > > >>> >                 sequentialAccess, false, numReducers);
> > > >>> >
> > > >>>
> > > >>
> > > >>
> > > >
> > >
> >
>
