Hi all,

I am trying to run the Reuters KMeans example in Java, but TFIDFConverter generates
empty tfidf-vectors. How can I fix that?

    // Dictionary / n-gram settings handed to
    // DictionaryVectorizer.createTermFrequencyVectors.
    private static int minSupport = 2;
    private static int maxNGramSize = 2;
    private static float minLLRValue = 50.0f;

    // Vector normalisation settings.
    private static float normPower = 2.0f;
    private static boolean logNormalize = true;

    // Job and output-format settings for the vectorizer calls.
    private static int numReducers = 1;
    private static int chunkSizeInMegabytes = 200;
    private static boolean sequentialAccess = true;
    private static boolean namedVectors = false;

    // Document-frequency limits handed to TFIDFConverter.processTfIdf.
    // NOTE(review): maxDF is presumably a percentage of documents - confirm
    // against the Mahout TFIDFConverter documentation.
    private static int minDf = 5;
    private static long maxDF = 95L;

        // Pipeline: tokenize the Reuters sequence files, build term-frequency
        // (TF) vectors, compute document frequencies, then write TF-IDF vectors.
        Path inputDir = new Path("reuters-seqfiles");
        String outputDir = "reuters-kmeans-try";
        Path outputPath = new Path(outputDir);
        Path tfVectorsPath = new Path(outputPath,
                DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);

        // Start from a clean output directory.
        HadoopUtil.delete(conf, outputPath);

        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        Path tokenizedPath = new Path(DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        DocumentProcessor.tokenizeDocuments(inputDir,
                analyzer.getClass().asSubclass(Analyzer.class), tokenizedPath, conf);

        // The TF vectors must keep raw term counts when a TF-IDF pass follows,
        // so normalisation is switched off here (-1.0f = no normalisation,
        // logNormalize = false) and applied only in processTfIdf below. This
        // mirrors what Mahout's SparseVectorsFromSequenceFiles driver does.
        // NOTE(review): normalising at this stage yields fractional TF values,
        // which the TF-IDF reducer appears to truncate to 0 - the likely cause
        // of the empty tfidf-vectors. Confirm against the Mahout version in use.
        DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputPath,
                DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER, conf,
                minSupport, maxNGramSize, minLLRValue, -1.0f, false,
                numReducers, chunkSizeInMegabytes, sequentialAccess, namedVectors);

        // Document frequencies are computed from the (un-normalised) TF vectors.
        Pair<Long[], List<Path>> features = TFIDFConverter.calculateDF(
                tfVectorsPath, outputPath, conf, chunkSizeInMegabytes);

        // Normalisation (normPower / logNormalize) is applied once, here.
        // namedVectors is passed through instead of a hard-coded false so both
        // stages agree on the vector type.
        TFIDFConverter.processTfIdf(tfVectorsPath, outputPath, conf, features,
                minDf, maxDF, normPower, logNormalize,
                sequentialAccess, namedVectors, numReducers);

Reply via email to