Revision: 17417
http://sourceforge.net/p/gate/code/17417
Author: adamfunk
Date: 2014-02-25 08:36:20 +0000 (Tue, 25 Feb 2014)
Log Message:
-----------
Successfully integrated DFB into TF.IDF system.
Some spaghetti still needs to be untangled.
Modified Paths:
--------------
gate/trunk/plugins/TermRaider/src/gate/termraider/PMIExample.java
gate/trunk/plugins/TermRaider/src/gate/termraider/TermRaiderEnglish.java
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/AbstractTermbank.java
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/AnnotationTermbank.java
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/HyponymyTermbank.java
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/modes/IdfCalculation.java
gate/trunk/plugins/TermRaider/src/gate/termraider/gui/TermbankViewer.java
gate/trunk/plugins/TermRaider/src/gate/termraider/output/CsvGenerator.java
Modified: gate/trunk/plugins/TermRaider/src/gate/termraider/PMIExample.java
===================================================================
--- gate/trunk/plugins/TermRaider/src/gate/termraider/PMIExample.java
2014-02-25 02:22:21 UTC (rev 17416)
+++ gate/trunk/plugins/TermRaider/src/gate/termraider/PMIExample.java
2014-02-25 08:36:20 UTC (rev 17417)
@@ -1,10 +1,7 @@
package gate.termraider;
import gate.creole.PackagedController;
-import gate.creole.metadata.AutoInstance;
-import gate.creole.metadata.AutoInstanceParam;
-import gate.creole.metadata.CreoleParameter;
-import gate.creole.metadata.CreoleResource;
+import gate.creole.metadata.*;
@CreoleResource(name = "PMI Example (English)",
icon = "TermRaiderApp",
@@ -12,5 +9,5 @@
@AutoInstanceParam(name="pipelineURL",
value="applications/pmi-example.gapp"),
@AutoInstanceParam(name="menu", value="TermRaider")}))
public class PMIExample extends PackagedController {
-
+ private static final long serialVersionUID = -4725697168124226331L;
}
Modified:
gate/trunk/plugins/TermRaider/src/gate/termraider/TermRaiderEnglish.java
===================================================================
--- gate/trunk/plugins/TermRaider/src/gate/termraider/TermRaiderEnglish.java
2014-02-25 02:22:21 UTC (rev 17416)
+++ gate/trunk/plugins/TermRaider/src/gate/termraider/TermRaiderEnglish.java
2014-02-25 08:36:20 UTC (rev 17417)
@@ -1,10 +1,7 @@
package gate.termraider;
import gate.creole.PackagedController;
-import gate.creole.metadata.AutoInstance;
-import gate.creole.metadata.AutoInstanceParam;
-import gate.creole.metadata.CreoleParameter;
-import gate.creole.metadata.CreoleResource;
+import gate.creole.metadata.*;
@CreoleResource(name = "TermRaider English Term Extraction",
icon = "TermRaiderApp",
@@ -12,5 +9,5 @@
@AutoInstanceParam(name="pipelineURL",
value="applications/termraider-eng.gapp"),
@AutoInstanceParam(name="menu", value="TermRaider")}))
public class TermRaiderEnglish extends PackagedController {
-
+ private static final long serialVersionUID = -1599367292323903155L;
}
Modified:
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/AbstractTermbank.java
===================================================================
---
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/AbstractTermbank.java
2014-02-25 02:22:21 UTC (rev 17416)
+++
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/AbstractTermbank.java
2014-02-25 08:36:20 UTC (rev 17417)
@@ -149,7 +149,7 @@
boolean wasLoaded = corpus.isDocumentLoaded(i);
Document document = (Document) corpus.get(i);
- addData(document);
+ processDocument(document);
// datastore safety
if (! wasLoaded) {
@@ -160,7 +160,7 @@
}
- private void scanTypesLanguagesDocFreq() {
+ protected void scanTypesLanguagesDocFreq() {
this.types = new TreeSet<String>();
this.languages = new TreeSet<String>();
for (Term term : this.termFrequencies.keySet()) {
@@ -171,12 +171,12 @@
}
- /* BEHOLD THE GUBBINS to distinguish the various types of Termbanks*/
+ /* BEHOLD THE GUBBINS to distinguish the various types of Termbanks */
/**
* This method needs to call incrementTermFreq(...)!
*/
- protected abstract void addData(Document document);
+ protected abstract void processDocument(Document document);
protected abstract void calculateScores();
@@ -288,5 +288,11 @@
public Set<String> getInputAnnotationTypes() {
return this.inputAnnotationTypes;
}
+
+
+ public abstract String getCsvHeader();
+
+
+ public abstract String getCsvLine(Term term);
}
Modified:
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/AnnotationTermbank.java
===================================================================
---
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/AnnotationTermbank.java
2014-02-25 02:22:21 UTC (rev 17416)
+++
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/AnnotationTermbank.java
2014-02-25 08:36:20 UTC (rev 17417)
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2008--2012, The University of Sheffield. See the file
+ * Copyright (c) 2008--2014, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
@@ -17,6 +17,7 @@
import gate.termraider.util.*;
import gate.termraider.bank.modes.*;
import java.util.*;
+import org.apache.commons.lang.StringEscapeUtils;
@@ -36,7 +37,7 @@
- protected void addData(Document document) {
+ protected void processDocument(Document document) {
String documentSource = Utilities.sourceOrName(document);
AnnotationSet candidates =
document.getAnnotations(inputASName).get(inputAnnotationTypes);
@@ -105,8 +106,39 @@
docFrequencies = new HashMap<Term, Integer>();
}
+
+ public String getCsvHeader() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(StringEscapeUtils.escapeCsv("Term"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Lang"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Type"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("ScoreType"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Score"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Document_Count"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Term_Frequency"));
+ return sb.toString();
+ }
+ public String getCsvLine(Term term) {
+ StringBuilder sb = new StringBuilder();
+ sb.append(StringEscapeUtils.escapeCsv(term.getTermString()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(term.getLanguageCode()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(term.getType()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(this.getScoreProperty()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(this.getScore(term).toString()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(Integer.toString(this.getDocFrequency(term))));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(Integer.toString(this.getTermFrequency(term))));
+ return sb.toString();
+ }
+
+
/***** CREOLE PARAMETERS *****/
@CreoleParameter(comment = "annotation feature containing the score to index",
Modified:
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java
===================================================================
---
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java
2014-02-25 02:22:21 UTC (rev 17416)
+++
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java
2014-02-25 08:36:20 UTC (rev 17417)
@@ -16,6 +16,8 @@
import javax.swing.Action;
+import org.apache.commons.lang.StringEscapeUtils;
+
import gate.Annotation;
import gate.AnnotationSet;
import gate.Corpus;
@@ -34,7 +36,7 @@
@CreoleResource(name = "DocumentFrequencyBank",
icon = "termbank-lr.png",
comment = "Document frequency counter derived from corpora and other DFBs")
-public class DocumentFrequencyBank extends AbstractBank
+public class DocumentFrequencyBank extends AbstractTermbank
implements ActionsPublisher{
private static final long serialVersionUID = 5149075094060830331L;
@@ -56,9 +58,11 @@
public Resource init() throws ResourceInstantiationException {
prepare();
+ resetScores();
processInputBanks();
processCorpora();
- churnData();
+ scanTypesLanguagesDocFreq();
+ calculateScores();
return this;
}
@@ -76,9 +80,12 @@
if (inputBanks == null) {
inputBanks = new HashSet<DocumentFrequencyBank>();
}
-
+ }
+
+ protected void resetScores() {
documentTotal = 0;
documentFrequencies = new HashMap<Term, Integer>();
+ termFrequencies = new HashMap<Term, Integer>();
languages = new HashSet<String>();
types = new HashSet<String>();
stringLookupTable = new HashMap<String, Set<Term>>();
@@ -140,7 +147,7 @@
}
- private void churnData() {
+ protected void calculateScores() {
if (this.getTerms().size() > 0) {
minFrequency =
this.getFrequencyStrict(this.getTerms().iterator().next());
}
@@ -192,7 +199,13 @@
}
+ @Override
+ public int getDocFrequency(Term term) {
+ return getFrequencyLax(term);
+ }
+
+
@CreoleParameter(comment = "Other DFBs to compile into the new one")
public void setInputBanks(Set<DocumentFrequencyBank> inputBanks) {
this.inputBanks = inputBanks;
@@ -287,6 +300,8 @@
}
+
+
private void increment(Term term, int i) {
int count = i;
if (documentFrequencies.containsKey(term)) {
@@ -317,4 +332,32 @@
public int getTotalDocs() {
return this.documentTotal;
}
+
+
+ public String getCsvLine(Term term) {
+ StringBuilder sb = new StringBuilder();
+ sb.append(StringEscapeUtils.escapeCsv(term.getTermString()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(term.getLanguageCode()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(term.getType()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(Integer.toString(this.getDocFrequency(term))));
+ return sb.toString();
+ }
+
+
+ public String getCsvHeader() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(StringEscapeUtils.escapeCsv("Term"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Lang"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Type"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("DocFrequency"));
+ sb.append('\n');
+ sb.append(',').append(StringEscapeUtils.escapeCsv("_TOTAL_DOCS_"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv(""));
+ sb.append(',').append(StringEscapeUtils.escapeCsv(""));
+ sb.append(',').append(StringEscapeUtils.escapeCsv(Integer.toString(this.getTotalDocs())));
+ return sb.toString();
+ }
}
Modified:
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/HyponymyTermbank.java
===================================================================
---
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/HyponymyTermbank.java
2014-02-25 02:22:21 UTC (rev 17416)
+++
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/HyponymyTermbank.java
2014-02-25 08:36:20 UTC (rev 17417)
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2008--2012, The University of Sheffield. See the file
+ * Copyright (c) 2008--2014, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
@@ -15,6 +15,7 @@
import gate.gui.ActionsPublisher;
import gate.*;
import gate.termraider.util.*;
+import org.apache.commons.lang.StringEscapeUtils;
import java.util.*;
@@ -56,7 +57,7 @@
}
- protected void addData(Document document) {
+ protected void processDocument(Document document) {
String documentSource = Utilities.sourceOrName(document);
AnnotationSet candidates =
document.getAnnotations(inputASName).get(inputAnnotationTypes);
@@ -167,6 +168,36 @@
}
+ public String getCsvHeader() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(StringEscapeUtils.escapeCsv("Term"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Lang"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Type"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("ScoreType"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Score"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Document_Count"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Term_Frequency"));
+ return sb.toString();
+ }
+
+ public String getCsvLine(Term term) {
+ StringBuilder sb = new StringBuilder();
+ sb.append(StringEscapeUtils.escapeCsv(term.getTermString()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(term.getLanguageCode()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(term.getType()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(this.getScoreProperty()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(this.getScore(term).toString()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(Integer.toString(this.getDocFrequency(term))));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(Integer.toString(this.getTermFrequency(term))));
+ return sb.toString();
+ }
+
/***** CREOLE PARAMETERS *****/
@CreoleParameter(comment = "Annotation features (in order) to be scanned as terms' heads")
Modified:
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java
===================================================================
--- gate/trunk/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java
2014-02-25 02:22:21 UTC (rev 17416)
+++ gate/trunk/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java
2014-02-25 08:36:20 UTC (rev 17417)
@@ -11,16 +11,20 @@
*/
package gate.termraider.bank;
+import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.*;
import gate.gui.ActionsPublisher;
import gate.*;
import gate.termraider.bank.modes.IdfCalculation;
import gate.termraider.bank.modes.TfCalculation;
import gate.termraider.util.*;
+
import java.util.*;
+import org.apache.commons.lang.StringEscapeUtils;
+
@CreoleResource(name = "TfIdfTermbank",
icon = "termbank-lr.png",
comment = "TermRaider Termbank derived from vectors in document features")
@@ -33,12 +37,13 @@
/* EXTRA CREOLE PARAMETERS */
private TfCalculation tfCalculation;
private IdfCalculation idfCalculation;
+ private DocumentFrequencyBank docFreqSource;
/* EXTRA DATA */
private int documentCount;
- protected void addData(Document document) {
+ protected void processDocument(Document document) {
documentCount++;
String documentSource = Utilities.sourceOrName(document);
AnnotationSet candidates =
document.getAnnotations(inputASName).get(inputAnnotationTypes);
@@ -62,8 +67,9 @@
protected void calculateScores() {
for (Term term : termFrequencies.keySet()) {
int tf = termFrequencies.get(term);
- int df = termDocuments.get(term).size();
- double score = TfCalculation.calculate(tfCalculation, tf) * IdfCalculation.calculate(idfCalculation, df, documentCount);
+ int df = docFreqSource.getDocFrequency(term);
+ int n = docFreqSource.getTotalDocs();
+ double score = TfCalculation.calculate(tfCalculation, tf) * IdfCalculation.calculate(idfCalculation, df, n);
rawTermScores.put(term, Double.valueOf(score));
termScores.put(term, Utilities.normalizeScore(score));
}
@@ -90,8 +96,21 @@
}
+ public int getDocCount() {
+ return this.documentCount;
+ }
/***** CREOLE PARAMETERS *****/
+
+ @CreoleParameter(comment = "document frequency bank (unset = create from these corpora)")
+ public void setDocFreqSource(DocumentFrequencyBank dfb) {
+ this.docFreqSource = dfb;
+ }
+
+ public DocumentFrequencyBank getDocFreqSource() {
+ return this.docFreqSource;
+ }
+
@CreoleParameter(comment = "term frequency calculation",
defaultValue = "Logarithmic")
@@ -122,4 +141,59 @@
super.setScoreProperty(name);
}
+
+ public String getCsvHeader() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(StringEscapeUtils.escapeCsv("Term"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Lang"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Type"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("ScoreType"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Score"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Document_Count"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Ref_Doc_Frequency"));
+ sb.append(',').append(StringEscapeUtils.escapeCsv("Term_Frequency"));
+ return sb.toString();
+ }
+
+
+ public String getCsvLine(Term term) {
+ StringBuilder sb = new StringBuilder();
+ sb.append(StringEscapeUtils.escapeCsv(term.getTermString()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(term.getLanguageCode()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(term.getType()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(this.getScoreProperty()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(this.getScore(term).toString()));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(Integer.toString(this.getDocFrequency(term))));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(Integer.toString(this.docFreqSource.getDocFrequency(term))));
+ sb.append(',');
+ sb.append(StringEscapeUtils.escapeCsv(Integer.toString(this.getTermFrequency(term))));
+ return sb.toString();
+ }
+
+
+ protected void prepare() throws ResourceInstantiationException {
+ if ( (corpora == null) || (corpora.size() == 0) ) {
+ throw new ResourceInstantiationException("No corpora given");
+ }
+
+ // If no DFB is specified, we create one from the given corpora
+ if (this.docFreqSource == null) {
+ FeatureMap dfbParameters = Factory.newFeatureMap();
+ dfbParameters.put("inputASName", this.inputASName);
+ dfbParameters.put("languageFeature", this.languageFeature);
+ dfbParameters.put("inputAnnotationFeature", this.inputAnnotationFeature);
+ dfbParameters.put("corpora", this.corpora);
+ dfbParameters.put("debugMode", this.debugMode);
+
+ DocumentFrequencyBank dfb = (DocumentFrequencyBank) Factory.createResource(DocumentFrequencyBank.class.getName(), dfbParameters);
+ this.setDocFreqSource(dfb);
+ }
+ }
+
}
Modified:
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/modes/IdfCalculation.java
===================================================================
---
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/modes/IdfCalculation.java
2014-02-25 02:22:21 UTC (rev 17416)
+++
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/modes/IdfCalculation.java
2014-02-25 08:36:20 UTC (rev 17417)
@@ -13,15 +13,10 @@
public enum IdfCalculation {
Natural,
- Logarithmic,
- LogarithmicPlus1;
+ Logarithmic;
/* These calculations are from Manning & Schütze, Foundations of
* Statistical NLP, section 15.2 (p.544).
- *
- * TODO: Use (df + 1) normalization methods so we can handle
- * terms not found in the IDF table (to allow for external
- * IDF sources in future use).
*/
public static double calculate(IdfCalculation mode, int rawDF, int corpusSize) {
@@ -29,15 +24,14 @@
double n = (double) corpusSize;
if (mode == Logarithmic) {
- return logarithm(n / df);
+ return 1.0 + logarithm(n / (df + 1.0));
}
- if (mode == LogarithmicPlus1) {
- return 1.0 + logarithm(n / df);
- }
+ // TODO: review the df calculation modes; they must always return
+ // something > 0.
// must be Natural
- return 1.0 / df;
+ return 1.0 / (df + 1.0);
}
public static final double logBase = 2.0;
Modified:
gate/trunk/plugins/TermRaider/src/gate/termraider/gui/TermbankViewer.java
===================================================================
--- gate/trunk/plugins/TermRaider/src/gate/termraider/gui/TermbankViewer.java
2014-02-25 02:22:21 UTC (rev 17416)
+++ gate/trunk/plugins/TermRaider/src/gate/termraider/gui/TermbankViewer.java
2014-02-25 08:36:20 UTC (rev 17417)
@@ -313,7 +313,8 @@
}
public void setTarget(Object target) {
- if(target == null || ! (target instanceof AbstractTermbank)) {
+ if(target == null || ! (target instanceof AbstractTermbank)
+ || (target instanceof DocumentFrequencyBank) ) {
throw new IllegalArgumentException("This Viewer cannot show a "
+ (target == null ? "null" : target.getClass().toString()));
}
Modified:
gate/trunk/plugins/TermRaider/src/gate/termraider/output/CsvGenerator.java
===================================================================
--- gate/trunk/plugins/TermRaider/src/gate/termraider/output/CsvGenerator.java
2014-02-25 02:22:21 UTC (rev 17416)
+++ gate/trunk/plugins/TermRaider/src/gate/termraider/output/CsvGenerator.java
2014-02-25 08:36:20 UTC (rev 17417)
@@ -15,54 +15,27 @@
import java.io.*;
import java.util.*;
-
-import org.apache.commons.lang.*;
-
import gate.termraider.bank.*;
import gate.termraider.util.*;
+
public class CsvGenerator {
- public static void generateAndSaveCsv(AbstractBank bank,
+ public static void generateAndSaveCsv(AbstractTermbank bank,
Number threshold, File outputFile) throws GateException {
PrintWriter writer = initializeWriter(outputFile);
-
- if (bank instanceof AbstractTermbank) {
- String scorePropertyName = bank.getScoreProperty();
- generateTermbankCsv((AbstractTermbank) bank, writer, threshold.doubleValue(), scorePropertyName);
- }
- else if (bank instanceof DocumentFrequencyBank) {
- generateDFCsv((DocumentFrequencyBank) bank, writer, threshold.intValue());
- }
-
- writer.flush();
- writer.close();
- if (bank.getDebugMode()) {
- System.out.println("Saved CSV to " + outputFile.getAbsolutePath() +
- " from " + bank.getName() + " (" + bank.getClass().getName() + ")");
- }
- }
-
-
- private static void generateTermbankCsv(AbstractTermbank bank, PrintWriter writer,
- double threshold, String scorePropertyName) {
Map<Term, Double> termScores = bank.getTermScores();
- Map<Term, Set<String>> termDocuments = bank.getTermDocuments();
- Map<Term, Integer> termFrequencies = null;
- termFrequencies = bank.getTermFrequencies();
addComment(bank, "threshold = " + threshold);
List<Term> sortedTerms = bank.getTermsByDescendingScore();
addComment(bank, "Unfiltered nbr of terms = " + sortedTerms.size());
int written = 0;
- writeTermbankHeader(writer);
+ writer.println(bank.getCsvHeader());
for (Term term : sortedTerms) {
Double score = termScores.get(term);
- if (score >= threshold) {
- Set<String> documents = termDocuments.get(term);
- Integer frequency = termFrequencies.get(term);
- writeTermBankContent(writer, term, score, documents, frequency, scorePropertyName);
+ if (score >= threshold.doubleValue()) {
+ writer.println(bank.getCsvLine(term));
written++;
}
else { // the rest must be lower
@@ -73,30 +46,6 @@
}
- private static void generateDFCsv(DocumentFrequencyBank bank, PrintWriter writer, int threshold) {
- Map<Term, Integer> frequencies = bank.getDocFrequencies();
- addComment(bank, "threshold = " + threshold);
- List<Term> sortedTerms = bank.getTermsByDescendingFreq();
-
- addComment(bank, "Unfiltered nbr of terms = " + sortedTerms.size());
- int written = 0;
- writeDFHeader(writer);
- writeDFContent(writer, "_TOTAL_DOCS_", bank.getTotalDocs());
-
- for (Term term : sortedTerms) {
- Integer freq = frequencies.get(term);
- if (freq >= threshold) {
- writeDFContent(writer, term, freq);
- written++;
- }
- else { // the rest must be lower
- break;
- }
- }
- addComment(bank, "Filtered nbr of terms = " + written);
- }
-
-
private static void addComment(AbstractBank termbank, String commentStr) {
if (termbank.getDebugMode()) {
System.out.println(commentStr);
@@ -113,74 +62,4 @@
}
}
-
- private static void writeTermBankContent(PrintWriter writer, Term term, Double score,
- Set<String> documents, Integer frequency, String scorePropertyName) {
- StringBuilder sb = new StringBuilder();
- sb.append(StringEscapeUtils.escapeCsv(term.getTermString()));
- sb.append(',');
- sb.append(StringEscapeUtils.escapeCsv(term.getLanguageCode()));
- sb.append(',');
- sb.append(StringEscapeUtils.escapeCsv(term.getType()));
- sb.append(',');
- sb.append(StringEscapeUtils.escapeCsv(scorePropertyName));
- sb.append(',');
- sb.append(StringEscapeUtils.escapeCsv(score.toString()));
- sb.append(',');
- sb.append(StringEscapeUtils.escapeCsv(Integer.toString(documents.size())));
- sb.append(',');
- sb.append(StringEscapeUtils.escapeCsv(frequency.toString()));
- writer.println(sb.toString());
- }
-
-
- private static void writeTermbankHeader(PrintWriter writer) {
- StringBuilder sb = new StringBuilder();
- sb.append(StringEscapeUtils.escapeCsv("Term"));
- sb.append(',').append(StringEscapeUtils.escapeCsv("Lang"));
- sb.append(',').append(StringEscapeUtils.escapeCsv("Type"));
- sb.append(',').append(StringEscapeUtils.escapeCsv("ScoreType"));
- sb.append(',').append(StringEscapeUtils.escapeCsv("Score"));
- sb.append(',').append(StringEscapeUtils.escapeCsv("Document_Count"));
- sb.append(',').append(StringEscapeUtils.escapeCsv("Term_Frequency"));
- writer.println(sb.toString());
- }
-
-
- private static void writeDFContent(PrintWriter writer, Term term, Integer frequency) {
- StringBuilder sb = new StringBuilder();
- sb.append(StringEscapeUtils.escapeCsv(term.getTermString()));
- sb.append(',');
- sb.append(StringEscapeUtils.escapeCsv(term.getLanguageCode()));
- sb.append(',');
- sb.append(StringEscapeUtils.escapeCsv(term.getType()));
- sb.append(',');
- sb.append(StringEscapeUtils.escapeCsv(frequency.toString()));
- writer.println(sb.toString());
- }
-
-
-
- private static void writeDFContent(PrintWriter writer, String string, Integer frequency) {
- StringBuilder sb = new StringBuilder();
- sb.append(StringEscapeUtils.escapeCsv(string));
- sb.append(',');
- sb.append(StringEscapeUtils.escapeCsv(""));
- sb.append(',');
- sb.append(StringEscapeUtils.escapeCsv(""));
- sb.append(',');
- sb.append(StringEscapeUtils.escapeCsv(frequency.toString()));
- writer.println(sb.toString());
- }
-
-
- private static void writeDFHeader(PrintWriter writer) {
- StringBuilder sb = new StringBuilder();
- sb.append(StringEscapeUtils.escapeCsv("Term"));
- sb.append(',').append(StringEscapeUtils.escapeCsv("Lang"));
- sb.append(',').append(StringEscapeUtils.escapeCsv("Type"));
- sb.append(',').append(StringEscapeUtils.escapeCsv("DocFrequency"));
- writer.println(sb.toString());
- }
-
}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Flow-based real-time traffic analytics software. Cisco certified tool.
Monitor traffic, SLAs, QoS, Medianet, WAAS etc. with NetFlow Analyzer
Customize your own dashboards, set traffic alerts and generate reports.
Network behavioral analysis & security monitoring. All-in-one tool.
http://pubads.g.doubleclick.net/gampad/clk?id=126839071&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs