Author: srowen
Date: Fri Apr  6 14:44:05 2012
New Revision: 1310357

URL: http://svn.apache.org/viewvc?rev=1310357&view=rev
Log:
MAHOUT-973 one more file needed for fix to compute maxDF as a percent of total count

Modified:
    
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java?rev=1310357&r1=1310356&r2=1310357&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java Fri Apr  6 14:44:05 2012
@@ -284,8 +284,9 @@ public final class SparseVectorsFromSequ
          Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);
 
          // Calculate the standard deviation
-         double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0D, conf);
-         maxDF = (int) (maxDFSigma * stdDev);
+         double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
+         long vectorCount = docFrequenciesFeatures.getFirst()[1];
+         maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);
 
          // Prune the term frequency vectors
          Path tfDir = new Path(outputDir, tfDirName);


Reply via email to