Author: laylaoesper
Date: 2011-01-04 09:16:58 -0800 (Tue, 04 Jan 2011)
New Revision: 23293
Added:
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/src/cytoscape/csplugins/wordcloud/Stemmer.java
Modified:
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/jars/WordCloud.jar
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/src/cytoscape/csplugins/wordcloud/CloudParameters.java
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/src/cytoscape/csplugins/wordcloud/SemanticSummaryInputPanel.java
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/src/cytoscape/csplugins/wordcloud/SemanticSummaryParameters.java
Log:
Adding stemming capabilities to the WordCloud plugin.
Modified:
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/jars/WordCloud.jar
===================================================================
(Binary files differ)
Modified:
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/src/cytoscape/csplugins/wordcloud/CloudParameters.java
===================================================================
---
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/src/cytoscape/csplugins/wordcloud/CloudParameters.java
2011-01-04 16:49:43 UTC (rev 23292)
+++
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/src/cytoscape/csplugins/wordcloud/CloudParameters.java
2011-01-04 17:16:58 UTC (rev 23293)
@@ -1055,6 +1055,22 @@
while (token.hasMoreTokens())
{
String a = token.nextToken();
+
+
+ //Stem the word if parameter is set
+ if (this.getNetworkParams().getIsStemming()) //Check for
stemming
+ {
+ Stemmer stem = new Stemmer();
+ for (int i = 0; i < a.length(); i++)
+ {
+ char ch = a.charAt(i);
+ stem.add(ch);
+ }
+ stem.stem();
+ a = stem.toString();
+ }
+
+
if (!wordSet.contains(a))
wordSet.add(a);
}
Modified:
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/src/cytoscape/csplugins/wordcloud/SemanticSummaryInputPanel.java
===================================================================
---
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/src/cytoscape/csplugins/wordcloud/SemanticSummaryInputPanel.java
2011-01-04 16:49:43 UTC (rev 23292)
+++
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/src/cytoscape/csplugins/wordcloud/SemanticSummaryInputPanel.java
2011-01-04 17:16:58 UTC (rev 23293)
@@ -126,6 +126,7 @@
//Checkbox
private JCheckBox numExclusion;
private JCheckBox useNetworkCounts;
+ private JCheckBox stemmer;
//SliderBar
private SliderBarPanel sliderPanel;
@@ -283,6 +284,12 @@
networkPanel.add(tokenizationPanel);
+ //Stemmer Panel
+ CollapsiblePanel stemmingPanel = createStemmingPanel();
+ stemmingPanel.setCollapsed(true);
+
+ networkPanel.add(stemmingPanel);
+
//Add to collapsible panel
collapsiblePanel2.getContentPane().add(networkPanel,
BorderLayout.NORTH);
@@ -735,7 +742,55 @@
return collapsiblePanel;
}
+ /**
+ * Creates a CollapsiblePanel that holds the word stemming information.
+ * @return CollapsiblePanel - word stemming panel interface.
+ */
+ private CollapsiblePanel createStemmingPanel()
+ {
+ CollapsiblePanel collapsiblePanel = new CollapsiblePanel("Word
Stemming");
+
+ JPanel panel = new JPanel();
+ panel.setLayout(new GridLayout(0,1));
+
+ StringBuffer buf = new StringBuffer();
+
+
+ //Word panel
+ JPanel wordPanel = new JPanel();
+ //wordPanel.setLayout(new GridBagLayout());
+ wordPanel.setLayout(new BorderLayout());
+
+ //Create Checkbox
+ stemmer = new JCheckBox("Enable Stemming");
+
+ buf = new StringBuffer();
+ buf.append("<html>" + "Causes all words to be stemmed using the
Porter Stemmer algorithm." + "<br>");
+ buf.append("<b>Notice:</b> This will allow words with a similar
stem to map to the same word." + "<br>");
+ buf.append("However, words stems may not be what you expect." +
"</html>");
+ stemmer.setToolTipText(buf.toString());
+ stemmer.addActionListener(this);
+ stemmer.setSelected(false);
+ stemmer.setEnabled(false);
+
+ //GridBagConstraints gridBagConstraints = new
GridBagConstraints();
+ //gridBagConstraints.gridx = 0;
+ //gridBagConstraints.gridy = 0;
+ //gridBagConstraints.gridwidth = 1;
+ //gridBagConstraints.anchor = GridBagConstraints.WEST;
+ //gridBagConstraints.insets = new Insets(5,0,0,0);
+ wordPanel.add(stemmer, BorderLayout.WEST);
+
+ //Add components to main panel
+ panel.add(wordPanel);
+
+ //Add to collapsible panel
+ collapsiblePanel.getContentPane().add(panel,
BorderLayout.NORTH);
+
+ return collapsiblePanel;
+ }
+
/**
* Creates a CollapsiblePanel that holds the Cloud Layout information.
* @return CollapsiblePanel - cloud Layout panel interface.
@@ -936,6 +991,7 @@
addWordButton.setEnabled(false);
numExclusion.setEnabled(false);
useNetworkCounts.setEnabled(false);
+ stemmer.setEnabled(false);
}
else
{
@@ -943,6 +999,7 @@
addWordButton.setEnabled(true);
numExclusion.setEnabled(true);
useNetworkCounts.setEnabled(true);
+ stemmer.setEnabled(true);
}
CloudDisplayPanel displayPanel =
SemanticSummaryManager.getInstance().getCloudWindow();
@@ -1182,6 +1239,16 @@
}
/**
+ * Sets the stemming checkbox based on the current network.
+ */
+ private void updateStemmingBox()
+ {
+ SemanticSummaryParameters networkParams =
SemanticSummaryManager.getInstance().getCurNetwork();
+ boolean val = networkParams.getIsStemming();
+ stemmer.setSelected(val);
+ }
+
+ /**
* Refreshes everything in the input panel that is on the network level.
*/
public void refreshNetworkSettings()
@@ -1189,6 +1256,7 @@
this.refreshRemovalCMB();
this.updateNumExclusionBox();
this.updateDelimiterCMBs();
+ this.updateStemmingBox();
}
@@ -1586,6 +1654,18 @@
}
}//end collapse Network Normalization panel
}//end of useNetworkCounts
+
+ if (_box == stemmer)
+ {
+ Boolean selected = stemmer.isSelected();
+
+ //add value to networkParams and update
+ SemanticSummaryParameters networkParams =
SemanticSummaryManager.getInstance().getCurNetwork();
+ networkParams.setIsStemming(selected);
+
+ //Reset flags
+ networkParams.networkChanged();
+ }
}//end checkboxes
}
@@ -1758,7 +1838,12 @@
return saveCloudButton;
}
+ public JCheckBox getStemmerCheckBox()
+ {
+ return stemmer;
+ }
+
/**
* Private Class to ensure that text fields are being set properly
*/
Modified:
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/src/cytoscape/csplugins/wordcloud/SemanticSummaryParameters.java
===================================================================
---
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/src/cytoscape/csplugins/wordcloud/SemanticSummaryParameters.java
2011-01-04 16:49:43 UTC (rev 23292)
+++
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/src/cytoscape/csplugins/wordcloud/SemanticSummaryParameters.java
2011-01-04 17:16:58 UTC (rev 23293)
@@ -65,6 +65,9 @@
private WordFilter filter;
private WordDelimiters delimiters;
+ //Stemming stuff
+ private boolean isStemming = false;
+
//CONSTRUCTORS
/**
@@ -114,6 +117,13 @@
nodeNameList.add(nodeName);
}
this.nodeList = nodeNameList;
+
+ //Backwards compatible isStemming
+ String val = props.get("isStemming");
+ if (val == null)
+ {this.isStemming = false;}
+ else
+ {this.isStemming =
Boolean.parseBoolean(props.get("isStemming"));}
}
@@ -220,6 +230,7 @@
}
paramVariables.append("NodeList\t" + output.toString() + "\n");
+ paramVariables.append("isStemming\t" + isStemming + "\n");
return paramVariables.toString();
}
@@ -390,4 +401,14 @@
delimiters = aDelimiter;
}
+ public boolean getIsStemming()
+ {
+ return isStemming;
+ }
+
+ public void setIsStemming(boolean val)
+ {
+ isStemming = val;
+ }
+
}
Added:
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/src/cytoscape/csplugins/wordcloud/Stemmer.java
===================================================================
---
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/src/cytoscape/csplugins/wordcloud/Stemmer.java
(rev 0)
+++
csplugins/trunk/soc/layla/WordCloudPlugin/trunk/WordCloud/src/cytoscape/csplugins/wordcloud/Stemmer.java
2011-01-04 17:16:58 UTC (rev 23293)
@@ -0,0 +1,450 @@
+/*
+ File: Stemmer.java
+
+ Copyright 2010 - The Cytoscape Consortium (www.cytoscape.org)
+
+ Code written by: Layla Oesper
+ Authors: Layla Oesper, Ruth Isserlin, Daniele Merico
+
+ This library is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this project. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package cytoscape.csplugins.wordcloud;
+
+/*
+
+Porter stemmer in Java. The original paper is in
+
+ Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+ no. 3, pp 130-137,
+
+See also http://www.tartarus.org/~martin/PorterStemmer
+
+History:
+
+Release 1
+
+Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
+The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
+is then out outside the bounds of b.
+
+Release 2
+
+Similarly,
+
+Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
+'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
+b[j] is then outside the bounds of b.
+
+Release 3
+
+Considerably revised 4/9/00 in the light of many helpful suggestions
+from Brian Goetz of Quiotix Corporation ([email protected]).
+
+Release 4
+
+*/
+
+import java.io.*;
+
+/**
+* Stemmer, implementing the Porter Stemming Algorithm
+*
+* The Stemmer class transforms a word into its root form. The input
+* word can be provided a character at time (by calling add()), or at once
+* by calling one of the various stem(something) methods.
+*/
+
+class Stemmer
+{ private char[] b;
+private int i, /* offset into b */
+ i_end, /* offset to end of stemmed word */
+ j, k;
+private static final int INC = 50;
+ /* unit of size whereby b is increased */
+public Stemmer()
+{ b = new char[INC];
+ i = 0;
+ i_end = 0;
+}
+
+/**
+ * Add a character to the word being stemmed. When you are finished
+ * adding characters, you can call stem(void) to stem the word.
+ */
+
+public void add(char ch)
+{ if (i == b.length)
+ { char[] new_b = new char[i+INC];
+ for (int c = 0; c < i; c++) new_b[c] = b[c];
+ b = new_b;
+ }
+ b[i++] = ch;
+}
+
+
+/** Adds wLen characters to the word being stemmed contained in a portion
+ * of a char[] array. This is like repeated calls of add(char ch), but
+ * faster.
+ */
+
+public void add(char[] w, int wLen)
+{ if (i+wLen >= b.length)
+ { char[] new_b = new char[i+wLen+INC];
+ for (int c = 0; c < i; c++) new_b[c] = b[c];
+ b = new_b;
+ }
+ for (int c = 0; c < wLen; c++) b[i++] = w[c];
+}
+
+/**
+ * After a word has been stemmed, it can be retrieved by toString(),
+ * or a reference to the internal buffer can be retrieved by getResultBuffer
+ * and getResultLength (which is generally more efficient.)
+ */
+public String toString() { return new String(b,0,i_end); }
+
+/**
+ * Returns the length of the word resulting from the stemming process.
+ */
+public int getResultLength() { return i_end; }
+
+/**
+ * Returns a reference to a character buffer containing the results of
+ * the stemming process. You also need to consult getResultLength()
+ * to determine the length of the result.
+ */
+public char[] getResultBuffer() { return b; }
+
+/* cons(i) is true <=> b[i] is a consonant. */
+
+private final boolean cons(int i)
+{ switch (b[i])
+ { case 'a': case 'e': case 'i': case 'o': case 'u': return false;
+ case 'y': return (i==0) ? true : !cons(i-1);
+ default: return true;
+ }
+}
+
+/* m() measures the number of consonant sequences between 0 and j. if c is
+ a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
+ presence,
+
+ <c><v> gives 0
+ <c>vc<v> gives 1
+ <c>vcvc<v> gives 2
+ <c>vcvcvc<v> gives 3
+ ....
+*/
+
+private final int m()
+{ int n = 0;
+ int i = 0;
+ while(true)
+ { if (i > j) return n;
+ if (! cons(i)) break; i++;
+ }
+ i++;
+ while(true)
+ { while(true)
+ { if (i > j) return n;
+ if (cons(i)) break;
+ i++;
+ }
+ i++;
+ n++;
+ while(true)
+ { if (i > j) return n;
+ if (! cons(i)) break;
+ i++;
+ }
+ i++;
+ }
+}
+
+/* vowelinstem() is true <=> 0,...j contains a vowel */
+
+private final boolean vowelinstem()
+{ int i; for (i = 0; i <= j; i++) if (! cons(i)) return true;
+ return false;
+}
+
+/* doublec(j) is true <=> j,(j-1) contain a double consonant. */
+
+private final boolean doublec(int j)
+{ if (j < 1) return false;
+ if (b[j] != b[j-1]) return false;
+ return cons(j);
+}
+
+/* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
+ and also if the second c is not w,x or y. this is used when trying to
+ restore an e at the end of a short word. e.g.
+
+ cav(e), lov(e), hop(e), crim(e), but
+ snow, box, tray.
+
+*/
+
+private final boolean cvc(int i)
+{ if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false;
+ { int ch = b[i];
+ if (ch == 'w' || ch == 'x' || ch == 'y') return false;
+ }
+ return true;
+}
+
+private final boolean ends(String s)
+{ int l = s.length();
+ int o = k-l+1;
+ if (o < 0) return false;
+ for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false;
+ j = k-l;
+ return true;
+}
+
+/* setto(s) sets (j+1),...k to the characters in the string s, readjusting
+ k. */
+
+private final void setto(String s)
+{ int l = s.length();
+ int o = j+1;
+ for (int i = 0; i < l; i++) b[o+i] = s.charAt(i);
+ k = j+l;
+}
+
+/* r(s) is used further down. */
+
+private final void r(String s) { if (m() > 0) setto(s); }
+
+/* step1() gets rid of plurals and -ed or -ing. e.g.
+
+ caresses -> caress
+ ponies -> poni
+ ties -> ti
+ caress -> caress
+ cats -> cat
+
+ feed -> feed
+ agreed -> agree
+ disabled -> disable
+
+ matting -> mat
+ mating -> mate
+ meeting -> meet
+ milling -> mill
+ messing -> mess
+
+ meetings -> meet
+
+*/
+
+private final void step1()
+{ if (b[k] == 's')
+ { if (ends("sses")) k -= 2; else
+ if (ends("ies")) setto("i"); else
+ if (b[k-1] != 's') k--;
+ }
+ if (ends("eed")) { if (m() > 0) k--; } else
+ if ((ends("ed") || ends("ing")) && vowelinstem())
+ { k = j;
+ if (ends("at")) setto("ate"); else
+ if (ends("bl")) setto("ble"); else
+ if (ends("iz")) setto("ize"); else
+ if (doublec(k))
+ { k--;
+ { int ch = b[k];
+ if (ch == 'l' || ch == 's' || ch == 'z') k++;
+ }
+ }
+ else if (m() == 1 && cvc(k)) setto("e");
+ }
+}
+
+/* step2() turns terminal y to i when there is another vowel in the stem. */
+
+private final void step2() { if (ends("y") && vowelinstem()) b[k] = 'i'; }
+
+/* step3() maps double suffices to single ones. so -ization ( = -ize plus
+ -ation) maps to -ize etc. note that the string before the suffix must give
+ m() > 0. */
+
+private final void step3() { if (k == 0) return; /* For Bug 1 */ switch
(b[k-1])
+{
+ case 'a': if (ends("ational")) { r("ate"); break; }
+ if (ends("tional")) { r("tion"); break; }
+ break;
+ case 'c': if (ends("enci")) { r("ence"); break; }
+ if (ends("anci")) { r("ance"); break; }
+ break;
+ case 'e': if (ends("izer")) { r("ize"); break; }
+ break;
+ case 'l': if (ends("bli")) { r("ble"); break; }
+ if (ends("alli")) { r("al"); break; }
+ if (ends("entli")) { r("ent"); break; }
+ if (ends("eli")) { r("e"); break; }
+ if (ends("ousli")) { r("ous"); break; }
+ break;
+ case 'o': if (ends("ization")) { r("ize"); break; }
+ if (ends("ation")) { r("ate"); break; }
+ if (ends("ator")) { r("ate"); break; }
+ break;
+ case 's': if (ends("alism")) { r("al"); break; }
+ if (ends("iveness")) { r("ive"); break; }
+ if (ends("fulness")) { r("ful"); break; }
+ if (ends("ousness")) { r("ous"); break; }
+ break;
+ case 't': if (ends("aliti")) { r("al"); break; }
+ if (ends("iviti")) { r("ive"); break; }
+ if (ends("biliti")) { r("ble"); break; }
+ break;
+ case 'g': if (ends("logi")) { r("log"); break; }
+} }
+
+/* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
+
+private final void step4() { switch (b[k])
+{
+ case 'e': if (ends("icate")) { r("ic"); break; }
+ if (ends("ative")) { r(""); break; }
+ if (ends("alize")) { r("al"); break; }
+ break;
+ case 'i': if (ends("iciti")) { r("ic"); break; }
+ break;
+ case 'l': if (ends("ical")) { r("ic"); break; }
+ if (ends("ful")) { r(""); break; }
+ break;
+ case 's': if (ends("ness")) { r(""); break; }
+ break;
+} }
+
+/* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
+
+private final void step5()
+{ if (k == 0) return; /* for Bug 1 */ switch (b[k-1])
+ { case 'a': if (ends("al")) break; return;
+ case 'c': if (ends("ance")) break;
+ if (ends("ence")) break; return;
+ case 'e': if (ends("er")) break; return;
+ case 'i': if (ends("ic")) break; return;
+ case 'l': if (ends("able")) break;
+ if (ends("ible")) break; return;
+ case 'n': if (ends("ant")) break;
+ if (ends("ement")) break;
+ if (ends("ment")) break;
+ /* element etc. not stripped before the m */
+ if (ends("ent")) break; return;
+ case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't'))
break;
+ /* j >= 0 fixes Bug 2 */
+ if (ends("ou")) break; return;
+ /* takes care of -ous */
+ case 's': if (ends("ism")) break; return;
+ case 't': if (ends("ate")) break;
+ if (ends("iti")) break; return;
+ case 'u': if (ends("ous")) break; return;
+ case 'v': if (ends("ive")) break; return;
+ case 'z': if (ends("ize")) break; return;
+ default: return;
+ }
+ if (m() > 1) k = j;
+}
+
+/* step6() removes a final -e if m() > 1. */
+
+private final void step6()
+{ j = k;
+ if (b[k] == 'e')
+ { int a = m();
+ if (a > 1 || a == 1 && !cvc(k-1)) k--;
+ }
+ if (b[k] == 'l' && doublec(k) && m() > 1) k--;
+}
+
+/** Stem the word placed into the Stemmer buffer through calls to add().
+ * Returns true if the stemming process resulted in a word different
+ * from the input. You can retrieve the result with
+ * getResultLength()/getResultBuffer() or toString().
+ */
+public void stem()
+{ k = i - 1;
+ if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); }
+ i_end = k+1; i = 0;
+}
+
+/** Test program for demonstrating the Stemmer. It reads text from a
+ * a list of files, stems each word, and writes the result to standard
+ * output. Note that the word stemmed is expected to be in lower case:
+ * forcing lower case must be done outside the Stemmer class.
+ * Usage: Stemmer file-name file-name ...
+ */
+public static void main(String[] args)
+{
+ char[] w = new char[501];
+ Stemmer s = new Stemmer();
+ for (int i = 0; i < args.length; i++)
+ try
+ {
+ FileInputStream in = new FileInputStream(args[i]);
+
+ try
+ { while(true)
+
+ { int ch = in.read();
+ if (Character.isLetter((char) ch))
+ {
+ int j = 0;
+ while(true)
+ { ch = Character.toLowerCase((char) ch);
+ w[j] = (char) ch;
+ if (j < 500) j++;
+ ch = in.read();
+ if (!Character.isLetter((char) ch))
+ {
+ /* to test add(char ch) */
+ for (int c = 0; c < j; c++) s.add(w[c]);
+
+ /* or, to test add(char[] w, int j) */
+ /* s.add(w, j); */
+
+ s.stem();
+ { String u;
+
+ /* and now, to test toString() : */
+ u = s.toString();
+
+ /* to test getResultBuffer(), getResultLength() : */
+ /* u = new String(s.getResultBuffer(), 0,
s.getResultLength()); */
+
+ System.out.print(u);
+ }
+ break;
+ }
+ }
+ }
+ if (ch < 0) break;
+ System.out.print((char)ch);
+ }
+ }
+ catch (IOException e)
+ { System.out.println("error reading " + args[i]);
+ break;
+ }
+ }
+ catch (FileNotFoundException e)
+ { System.out.println("file " + args[i] + " not found");
+ break;
+ }
+}
+}
+
--
You received this message because you are subscribed to the Google Groups
"cytoscape-cvs" group.
To post to this group, send email to [email protected].
To unsubscribe from this group, send email to
[email protected].
For more options, visit this group at
http://groups.google.com/group/cytoscape-cvs?hl=en.