Revision: 17369
          http://sourceforge.net/p/gate/code/17369
Author:   adamfunk
Date:     2014-02-20 15:12:35 +0000 (Thu, 20 Feb 2014)
Log Message:
-----------
Roughing in the DocumentFrequencyBank

Modified Paths:
--------------
    gate/trunk/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java
    gate/trunk/plugins/TermRaider/src/gate/termraider/util/AbstractBank.java

Added Paths:
-----------
    
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java

Added: 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java
===================================================================
--- gate/trunk/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java   (rev 0)
+++ gate/trunk/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java   2014-02-20 15:12:35 UTC (rev 17369)
@@ -0,0 +1,248 @@
+/*
+ *  Copyright (c) 2008-2014, The University of Sheffield. See the file
+ *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
+ *
+ *  This file is part of GATE (see http://gate.ac.uk/), and is free
+ *  software, licenced under the GNU Library General Public License,
+ *  Version 2, June 1991 (in the distribution as file licence.html,
+ *  and also available at http://gate.ac.uk/gate/licence.html).
+ *
+ *  $Id$
+ */
+package gate.termraider.bank;
+
+import java.io.File;
+import java.util.*;
+import javax.swing.Action;
+import gate.Annotation;
+import gate.AnnotationSet;
+import gate.Corpus;
+import gate.Document;
+import gate.Factory;
+import gate.Resource;
+import gate.creole.ResourceInstantiationException;
+import gate.creole.metadata.CreoleParameter;
+import gate.gui.ActionsPublisher;
+import gate.termraider.gui.ActionSaveCsv;
+import gate.termraider.util.*;
+import gate.util.GateException;
+
+public class DocumentFrequencyBank extends AbstractBank
+implements ActionsPublisher{
+  
+  private static final long serialVersionUID = 5149075094060830331L;
+  
+  
+  private Set<DocumentFrequencyBank> inputBanks;
+  private boolean debugMode;
+  protected String inputASName;
+  protected Set<String> inputAnnotationTypes;
+  
+  private int documentTotal;
+  private Map<Term, Integer> documentFrequencies;
+  private int minFrequency, maxFrequency;
+
+  // transient to allow serialization
+  protected transient List<Action> actionsList;
+
+
+  public Resource init() throws ResourceInstantiationException {
+    prepare();
+    processInputBanks();
+    processCorpora();
+    churnData();
+    return this;
+  }
+  
+
+  public void cleanup() {
+    super.cleanup();
+  }
+  
+  
+  
+  protected void prepare() throws ResourceInstantiationException {
+    if (corpora == null) {
+      corpora = new HashSet<Corpus>();
+    }
+    if (inputBanks == null) {
+      inputBanks = new HashSet<DocumentFrequencyBank>();
+    }
+    
+    documentTotal = 0;
+    documentFrequencies = new HashMap<Term, Integer>();
+    languages = new HashSet<String>();
+    types = new HashSet<String>();
+  }
+
+  
+  protected void createActions() {
+    actionsList = new ArrayList<Action>();
+    actionsList.add(new ActionSaveCsv("Save as CSV...", this));
+  }
+  
+  
+  protected void processCorpora() {
+    for (Corpus corpus : corpora) {
+      processCorpus(corpus);
+      if (debugMode) {
+        System.out.println("Termbank: added corpus " + corpus.getName() + " 
with " + corpus.size() + " documents");
+      }
+    }
+  }
+  
+  
+  protected void processInputBanks() {
+    for (DocumentFrequencyBank bank : inputBanks) {
+      this.documentTotal += bank.documentTotal;
+      for (Term term : bank.getTerms()) {
+        increment(term, bank.getFrequency(term));
+      }
+    }
+  }
+  
+  
+  protected void processCorpus(Corpus corpus) {
+    for (int i=0 ; i < corpus.size() ; i++) {
+      boolean wasLoaded = corpus.isDocumentLoaded(i);
+      Document document = (Document) corpus.get(i);
+      addData(document);
+      // datastore safety
+      if (! wasLoaded) {
+        corpus.unloadDocument(document);
+        Factory.deleteResource(document);
+      }
+    }
+  }
+
+  
+  protected void addData(Document document) {
+    documentTotal++;
+    AnnotationSet candidates = 
document.getAnnotations(inputASName).get(inputAnnotationTypes);
+
+    Set<Term> documentTerms = new HashSet<Term>();
+    for (Annotation candidate : candidates) {
+      documentTerms.add(makeTerm(candidate, document));
+    }
+    
+    for (Term term : documentTerms) {
+      increment(term, 1);
+    }
+  }
+
+  
+  private void churnData() {
+    minFrequency = this.getFrequency(this.getTerms().iterator().next());
+    maxFrequency = 0;
+    for (Term term : this.getTerms()) {
+      int freq = this.getFrequency(term);
+      maxFrequency = Math.max(maxFrequency, freq);
+      minFrequency = Math.min(minFrequency, freq);
+      this.types.add(term.getType());
+      this.languages.add(term.getLanguageCode());
+    }
+  }
+  
+  
+  public Set<Term> getTerms() {
+    return documentFrequencies.keySet();
+  }
+  
+  public int getFrequency(Term term) {
+    if (documentFrequencies.containsKey(term)) {
+      return documentFrequencies.get(term).intValue();
+    }
+    
+    return 0;
+  }
+  
+  
+  
+  
+  @CreoleParameter(comment = "Other DFBs to compile into the new one")
+  public void setInputBanks(Set<DocumentFrequencyBank> inputBanks) {
+    this.inputBanks = inputBanks;
+  }
+  
+  public Set<DocumentFrequencyBank> getInputBanks() {
+    return this.inputBanks;
+  }
+
+
+  @Override
+  public List<Action> getActions() {
+    // lazy instantiation because actionsList is transient
+    if (actionsList == null) {
+      createActions();
+    }
+    
+    return this.actionsList;
+  }
+
+
+  @Override
+  public Double getMinScore() {
+    return new Double(this.minFrequency);
+  }
+
+
+  @Override
+  public Double getMaxScore() {
+    return new Double(this.maxFrequency);
+  }
+
+
+  @Override
+  public void saveAsCsv(double threshold, File file) throws GateException {
+    // TODO Auto-generated method stub
+    
+  }
+
+
+  @Override
+  public void saveAsCsv(File file) throws GateException {
+    // TODO Auto-generated method stub
+  }
+
+  
+  @CreoleParameter(comment = "print debugging information during 
initialization",
+          defaultValue = "false")
+  public void setDebugMode(Boolean debug) {
+    this.debugMode = debug;
+  }
+
+  public Boolean getDebugMode() {
+    return this.debugMode;
+  }
+
+  
+  @CreoleParameter(comment = "input AS name",
+          defaultValue = "")
+  public void setInputASName(String name) {
+    this.inputASName = name;
+  }
+  public String getInputASName() {
+    return this.inputASName;
+  }
+  
+  
+  @CreoleParameter(comment = "input annotation types",
+          defaultValue = "SingleWord;MultiWord")
+  public void setInputAnnotationTypes(Set<String> names) {
+    this.inputAnnotationTypes = names;
+  }
+  
+  public Set<String> getInputAnnotationTypes() {
+    return this.inputAnnotationTypes;
+  }
+  
+  
+  private void increment(Term term, int i) {
+    int count = i;
+    if (documentFrequencies.containsKey(term)) {
+      count += documentFrequencies.get(term).intValue();
+    }
+    documentFrequencies.put(term, count);
+  }
+  
+}


Property changes on: 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+Id
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Modified: 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java
===================================================================
--- gate/trunk/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java   2014-02-20 15:00:45 UTC (rev 17368)
+++ gate/trunk/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java   2014-02-20 15:12:35 UTC (rev 17369)
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2008-2012, The University of Sheffield. See the file
+ *  Copyright (c) 2008-2014, The University of Sheffield. See the file
  *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
  *
  *  This file is part of GATE (see http://gate.ac.uk/), and is free

Modified: 
gate/trunk/plugins/TermRaider/src/gate/termraider/util/AbstractBank.java
===================================================================
--- gate/trunk/plugins/TermRaider/src/gate/termraider/util/AbstractBank.java    2014-02-20 15:00:45 UTC (rev 17368)
+++ gate/trunk/plugins/TermRaider/src/gate/termraider/util/AbstractBank.java    2014-02-20 15:12:35 UTC (rev 17369)
@@ -110,7 +110,7 @@
     return this.inputAnnotationFeature;
   }
   
-  @CreoleParameter(comment = "Processed corpora to analyse for pairs of terms")
+  @CreoleParameter(comment = "Processed corpora to analyse")
   public void setCorpora(Set<Corpus> corpora) {
     this.corpora = corpora;
   }

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Managing the Performance of Cloud-Based Applications
Take advantage of what the Cloud has to offer - Avoid Common Pitfalls.
Read the Whitepaper.
http://pubads.g.doubleclick.net/gampad/clk?id=121054471&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to