Revision: 17445
http://sourceforge.net/p/gate/code/17445
Author: adamfunk
Date: 2014-02-26 16:44:09 +0000 (Wed, 26 Feb 2014)
Log Message:
-----------
More juggling & decrufting
Modified Paths:
--------------
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/AbstractTermbank.java
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/AnnotationTermbank.java
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/HyponymyTermbank.java
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/modes/IdfCalculation.java
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/modes/TfCalculation.java
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/util/Utilities.java
Added Paths:
-----------
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/modes/Normalization.java
Modified:
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/AbstractTermbank.java
===================================================================
---
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/AbstractTermbank.java
2014-02-26 16:19:55 UTC (rev 17444)
+++
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/AbstractTermbank.java
2014-02-26 16:44:09 UTC (rev 17445)
@@ -205,32 +205,20 @@
/* BEHOLD THE GUBBINS to distinguish the various types of Termbanks */
- /**
- * This method needs to call incrementTermFreq(...)!
- */
+ protected abstract void resetScores();
+
protected abstract void processDocument(Document document);
protected abstract void calculateScores();
- protected abstract void resetScores();
-
protected int incrementTermFreq(Term term, int increment) {
- return incrementMap(termFrequencies, term, increment);
+ return Utilities.incrementMap(termFrequencies, term, increment);
}
- protected int incrementMap(Map<Term, Integer> map, Term key, int increment) {
- int count = 0;
- if (map.containsKey(key)) {
- count = map.get(key).intValue();
- }
- count += increment;
- map.put(key, Integer.valueOf(count));
- return count;
- }
public Double getScore(Term term) {
Modified:
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/AnnotationTermbank.java
===================================================================
---
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/AnnotationTermbank.java
2014-02-26 16:19:55 UTC (rev 17444)
+++
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/AnnotationTermbank.java
2014-02-26 16:44:09 UTC (rev 17445)
@@ -81,7 +81,7 @@
}
rawTermScores.put(term, score);
- termScores.put(term, Utilities.normalizeScore(score));
+ termScores.put(term, Normalization.normalizeScore(score));
}
termsByDescendingScore = new ArrayList<Term>(termScores.keySet());
Modified:
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/HyponymyTermbank.java
===================================================================
---
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/HyponymyTermbank.java
2014-02-26 16:19:55 UTC (rev 17444)
+++
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/HyponymyTermbank.java
2014-02-26 16:44:09 UTC (rev 17445)
@@ -14,6 +14,7 @@
import gate.creole.metadata.*;
import gate.gui.ActionsPublisher;
import gate.*;
+import gate.termraider.bank.modes.*;
import gate.termraider.util.*;
import org.apache.commons.lang.StringEscapeUtils;
import java.util.*;
@@ -135,7 +136,7 @@
for (Term term : terms) {
double rawScore = calculateOneRawScore(term);
rawTermScores.put(term, rawScore);
- double score = Utilities.normalizeScore(rawScore);
+ double score = Normalization.normalizeScore(rawScore);
termScores.put(term, score);
}
Modified:
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java
===================================================================
---
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java
2014-02-26 16:19:55 UTC (rev 17444)
+++
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java
2014-02-26 16:44:09 UTC (rev 17445)
@@ -15,8 +15,7 @@
import gate.creole.metadata.*;
import gate.gui.ActionsPublisher;
import gate.*;
-import gate.termraider.bank.modes.IdfCalculation;
-import gate.termraider.bank.modes.TfCalculation;
+import gate.termraider.bank.modes.*;
import gate.termraider.util.*;
import java.util.*;
import org.apache.commons.lang.StringEscapeUtils;
@@ -87,7 +86,7 @@
int n = docFreqSource.getTotalDocs();
double score = TfCalculation.calculate(tfCalculation, tf) *
IdfCalculation.calculate(idfCalculation, df, n);
rawTermScores.put(term, Double.valueOf(score));
- termScores.put(term, Utilities.normalizeScore(score));
+ termScores.put(term, Normalization.normalizeScore(score));
}
termsByDescendingScore = new ArrayList<Term>(termScores.keySet());
Modified:
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/modes/IdfCalculation.java
===================================================================
---
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/modes/IdfCalculation.java
2014-02-26 16:19:55 UTC (rev 17444)
+++
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/modes/IdfCalculation.java
2014-02-26 16:44:09 UTC (rev 17445)
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2012, The University of Sheffield. See the file
+ * Copyright (c) 2012--2014, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
@@ -11,12 +11,15 @@
*/
package gate.termraider.bank.modes;
+import gate.termraider.util.Utilities;
+
public enum IdfCalculation {
- Natural,
- Logarithmic;
+ Logarithmic,
+ Scaled,
+ Natural;
- /* These calculations are from Manning & Schütze, Foundations of
- * Statistical NLP, section 15.2 (p.544).
+ /* These calculations are partly based on Manning & Schütze,
+ * Foundations of Statistical NLP, section 15.2 (p.544).
*/
public static double calculate(IdfCalculation mode, int rawDF, int corpusSize) {
@@ -24,25 +27,15 @@
double n = (double) corpusSize;
if (mode == Logarithmic) {
- return 1.0 + logarithm(n / (df + 1.0));
+ return 1.0 + Utilities.log2(n / (df + 1.0));
}
+
+ if (mode == Scaled) {
+ return (1.0 + n )/ (df + 1.0);
+ }
- // TODO: review the df calculation modes; they must always return
- // something > 0.
-
// must be Natural
return 1.0 / (df + 1.0);
}
- public static final double logBase = 2.0;
- private static double conversion;
-
- static {
- conversion = Math.log10(logBase);
- }
-
- public static double logarithm(double input) {
- return Math.log10(input) / conversion;
- }
-
}
Added:
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/modes/Normalization.java
===================================================================
---
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/modes/Normalization.java
(rev 0)
+++
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/modes/Normalization.java
2014-02-26 16:44:09 UTC (rev 17445)
@@ -0,0 +1,37 @@
+package gate.termraider.bank.modes;
+
+public enum Normalization {
+ None,
+ Sigmoid;
+
+
+ private static double xScale = 4.8;
+
+
+ public static double calculate(Normalization mode, Number raw) {
+ if (mode == None) {
+ return raw.doubleValue();
+ }
+
+ // must be sigmoid
+ return normalizeScore(raw.doubleValue());
+ }
+
+
+ // TODO: make the following private and add normalization
+ // options to the termbanks (except DFB)
+
+ /**
+ * The following produces the right half of a sigmoid
+ * curve adjusted so that
+ * f(0) = 0; f(inf) = 100; f(x>0) > 0
+ * @param score from 0 to inf
+ * @return score from 0 to 100
+ */
+ public static double normalizeScore(double score) {
+ double norm = 2.0 / (1.0 + Math.exp(-score / xScale)) - 1.0;
+ return (double) (100.0F * norm);
+ }
+
+
+}
Property changes on:
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/modes/Normalization.java
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+Id
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Modified:
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/modes/TfCalculation.java
===================================================================
---
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/modes/TfCalculation.java
2014-02-26 16:19:55 UTC (rev 17444)
+++
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/modes/TfCalculation.java
2014-02-26 16:44:09 UTC (rev 17445)
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2012, The University of Sheffield. See the file
+ * Copyright (c) 2012--2014, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
@@ -11,8 +11,11 @@
*/
package gate.termraider.bank.modes;
+import gate.termraider.util.Utilities;
+
public enum TfCalculation {
Natural,
+ Sqrt,
Logarithmic;
@@ -20,9 +23,13 @@
double tf = (double) rawTF;
if (mode == Logarithmic) {
- return 1.0 + IdfCalculation.logarithm(tf);
+ return 1.0 + Utilities.log2(tf);
}
+ else if (mode == Sqrt) {
+ return Math.sqrt(tf);
+ }
+
// must be Natural
return tf;
}
Modified:
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/util/Utilities.java
===================================================================
---
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/util/Utilities.java
2014-02-26 16:19:55 UTC (rev 17444)
+++
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/util/Utilities.java
2014-02-26 16:44:09 UTC (rev 17445)
@@ -25,13 +25,23 @@
public static final String EXTENSION_CSV = "csv";
private static double log10of2;
- private static double xScale = 4.8;
static {
log10of2 = Math.log10(2.0);
}
+
+ public static int incrementMap(Map<Term, Integer> map, Term key, int increment) {
+ int count = 0;
+ if (map.containsKey(key)) {
+ count = map.get(key).intValue();
+ }
+ count += increment;
+ map.put(key, Integer.valueOf(count));
+ return count;
+ }
+
public static double meanDoubleList(List<Double> list) {
if (list.isEmpty()) {
return 0.0;
@@ -44,20 +54,7 @@
return total / ((double) list.size());
}
- /**
- * The following produces the right half of a sigmoid
- * curve adjusted so that
- * f(0) = 0; f(inf) = 100; f(x>0) > 0
- * @param score from 0 to inf
- * @return score from 0 to 100
- */
- public static double normalizeScore(double score) {
- double norm = 2.0 / (1.0 + Math.exp(-score / xScale)) - 1.0;
- return (double) (100.0F * norm);
- }
-
-
public static Double convertToDouble(Object x) {
if (x instanceof Number) {
return ((Number) x).doubleValue();
@@ -131,10 +128,7 @@
return url.toString();
}
-
-
-
-
+
public static File addExtensionIfNotExtended(File file, String extension) {
String name = file.getName();
if (name.contains(".")) {
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Flow-based real-time traffic analytics software. Cisco certified tool.
Monitor traffic, SLAs, QoS, Medianet, WAAS etc. with NetFlow Analyzer
Customize your own dashboards, set traffic alerts and generate reports.
Network behavioral analysis & security monitoring. All-in-one tool.
http://pubads.g.doubleclick.net/gampad/clk?id=126839071&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs