Hi all,
I am trying to run the Reuters KMeans example in Java, but TFIDFConverter generates
empty tfidf-vectors. How can I fix that?
// Tuning parameters for the vectorization pipeline that follows.
//
// The first group is passed, in this order, to
// DictionaryVectorizer.createTermFrequencyVectors(...).
private static int minSupport = 2;
private static int maxNGramSize = 2;
private static float minLLRValue = 50;
// normPower and logNormalize are passed to BOTH createTermFrequencyVectors
// and TFIDFConverter.processTfIdf, i.e. the same normalization settings are
// requested for the TF step and again for the TF-IDF step.
// NOTE(review): confirm that normalizing at both stages is intended.
private static float normPower = 2;
private static boolean logNormalize = true;
// Also passed to TFIDFConverter.processTfIdf.
private static int numReducers = 1;
// Also passed to TFIDFConverter.calculateDF.
private static int chunkSizeInMegabytes = 200;
// Also passed to TFIDFConverter.processTfIdf.
private static boolean sequentialAccess = true;
// Only used by createTermFrequencyVectors; the processTfIdf call below passes
// a literal false in the corresponding position instead of this field.
private static boolean namedVectors = false;
// Document-frequency bounds passed to TFIDFConverter.processTfIdf.
// NOTE(review): minDf is presumably an absolute document count and maxDF a
// percentage of documents (0-100) -- verify against the Mahout Javadoc.
private static int minDf = 5;
private static long maxDF = 95;
// Locations used by the pipeline: sequence files in, vectorizer artifacts out.
Path inputDir = new Path("reuters-seqfiles");
String outputDir = "reuters-kmeans-try";
Path outputPath = new Path(outputDir);
String tfFolderName = DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;
Path tfVectorsPath = new Path(outputDir, tfFolderName);

// Remove any previous output directory before running.
HadoopUtil.delete(conf, outputPath);

// Step 1: tokenize the input documents with Lucene's StandardAnalyzer.
// The tokenized path is built without outputDir as its parent, so it is
// resolved independently of the output directory deleted above.
StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
Class<? extends Analyzer> analyzerClass = analyzer.getClass().asSubclass(Analyzer.class);
Path tokenizedPath = new Path(DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

// Step 2: build term-frequency vectors from the tokenized documents.
DictionaryVectorizer.createTermFrequencyVectors(
    tokenizedPath,
    outputPath,
    tfFolderName,
    conf,
    minSupport,
    maxNGramSize,
    minLLRValue,
    normPower,
    logNormalize,
    numReducers,
    chunkSizeInMegabytes,
    sequentialAccess,
    namedVectors);

// Step 3: compute document frequencies over the TF vectors.
Pair<Long[], List<Path>> features =
    TFIDFConverter.calculateDF(tfVectorsPath, outputPath, conf, chunkSizeInMegabytes);

// Step 4: convert the TF vectors to TF-IDF vectors.
// The named-vector argument is the literal false here, not the namedVectors field.
TFIDFConverter.processTfIdf(
    tfVectorsPath,
    outputPath,
    conf,
    features,
    minDf,
    maxDF,
    normPower,
    logNormalize,
    sequentialAccess,
    false,
    numReducers);