Revision: 17691
http://sourceforge.net/p/gate/code/17691
Author: markagreenwood
Date: 2014-03-18 14:34:18 +0000 (Tue, 18 Mar 2014)
Log Message:
-----------
so that Leon and friends don't need to RTFM :)
Modified Paths:
--------------
gate/trunk/plugins/Twitter/resources/twitie-en.xgapp
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java
gate/trunk/plugins/Twitter/src/gate/twitter/HashtagTokenizer.java
gate/trunk/plugins/Twitter/src/gate/twitter/Normaliser.java
Added Paths:
-----------
gate/trunk/plugins/Twitter/resources/tokeniser/DefaultTokeniser.rules
gate/trunk/plugins/Twitter/src/gate/twitter/tokenizer/
gate/trunk/plugins/Twitter/src/gate/twitter/tokenizer/TokenizerEN.java
Added: gate/trunk/plugins/Twitter/resources/tokeniser/DefaultTokeniser.rules
===================================================================
--- gate/trunk/plugins/Twitter/resources/tokeniser/DefaultTokeniser.rules
(rev 0)
+++ gate/trunk/plugins/Twitter/resources/tokeniser/DefaultTokeniser.rules
2014-03-18 14:34:18 UTC (rev 17691)
@@ -0,0 +1,104 @@
+#DefaultTokeniser.rules#
+#diana 28/6/00#
+#update 9/7/00#
+
+#Tokeniser rule file
+#Each rule should be on one line
+#Lines that end with "\" are appended with the next one. This facility \
+ is used for longer rules that cannot be written on a single line
+#
+#Lines starting with "#" are treated as comment
+//Lines starting with "//" are treated as comment
+# Empty lines are ignored.
+
+#A rule has a left hand side (LHS) and a right hand side (RHS);
+#the LHS is a regular expression that has to be matched on the input
+#the RHS describes the annotations to be added to the AnnotationSet.
+#LHS is separated from the RHS by '>'
+#LHS knows about the following operators:
+# + (1..n)
+# * (0..n)
+# | (boolean OR)
+#
+#RHS uses as separator ';' and has the following format
+#{LHS} > {Annotation type};{attribute1}={value1};...;{attribute n}={value n}
+
+
+#The primitive constructs are:
+#UNASSIGNED
+#UPPERCASE_LETTER
+#LOWERCASE_LETTER
+#TITLECASE_LETTER
+#MODIFIER_LETTER
+#OTHER_LETTER
+#NON_SPACING_MARK
+#ENCLOSING_MARK
+#COMBINING_SPACING_MARK
+#DECIMAL_DIGIT_NUMBER
+#LETTER_NUMBER
+#OTHER_NUMBER
+#SPACE_SEPARATOR
+#LINE_SEPARATOR
+#PARAGRAPH_SEPARATOR
+#CONTROL
+#FORMAT
+#PRIVATE_USE
+#SURROGATE
+#DASH_PUNCTUATION
+#START_PUNCTUATION
+#END_PUNCTUATION
+#CONNECTOR_PUNCTUATION
+#OTHER_PUNCTUATION
+#MATH_SYMBOL
+#CURRENCY_SYMBOL
+#MODIFIER_SYMBOL
+#OTHER_SYMBOL
+#...representing the corresponding enumerated Unicode category types
+# See java.lang.Character for the Java version you are using
+
+#------- The rules start here -----------------
+
+#words#
+// a word can be any combination of letters, including hyphens,
+// but excluding symbols and punctuation, e.g. apostrophes
+// Note that there is an alternative version of the tokeniser that
+// treats hyphens as separate tokens
+
+
+"UPPERCASE_LETTER" (LOWERCASE_LETTER (LOWERCASE_LETTER|DASH_PUNCTUATION|FORMAT)*)* > Token;orth=upperInitial;kind=word;
+"UPPERCASE_LETTER" (DASH_PUNCTUATION|FORMAT)* (UPPERCASE_LETTER|DASH_PUNCTUATION|FORMAT)+ > Token;orth=allCaps;kind=word;
+"LOWERCASE_LETTER" (LOWERCASE_LETTER|DASH_PUNCTUATION|FORMAT)* > Token;orth=lowercase;kind=word;
+
+// MixedCaps is any mixture of caps and small letters that doesn't
+// fit in the preceding categories
+
+("LOWERCASE_LETTER" "LOWERCASE_LETTER"+"UPPERCASE_LETTER"+ \
+ (UPPERCASE_LETTER|LOWERCASE_LETTER)*)|\
+("LOWERCASE_LETTER" "LOWERCASE_LETTER"*"UPPERCASE_LETTER"+\
+ (UPPERCASE_LETTER|LOWERCASE_LETTER|DASH_PUNCTUATION|FORMAT)*)|\
+("UPPERCASE_LETTER" (DASH_PUNCTUATION)* "UPPERCASE_LETTER" (UPPERCASE_LETTER|LOWERCASE_LETTER|DASH_PUNCTUATION|FORMAT)*\
+ ("LOWERCASE_LETTER")+ (UPPERCASE_LETTER|LOWERCASE_LETTER|DASH_PUNCTUATION|FORMAT)*)|\
+("UPPERCASE_LETTER" "LOWERCASE_LETTER"+ ("UPPERCASE_LETTER"+ "LOWERCASE_LETTER"+)+)|\
+ ((UPPERCASE_LETTER)+ (LOWERCASE_LETTER)+ (UPPERCASE_LETTER)+)\
+> Token;orth=mixedCaps;kind=word;
+
+(OTHER_LETTER|COMBINING_SPACING_MARK|NON_SPACING_MARK)+ >Token;kind=word;type=other;
+
+#numbers#
+// a number is any combination of digits
+"DECIMAL_DIGIT_NUMBER"+ >Token;kind=number;
+"OTHER_NUMBER"+ >Token;kind=number;
+
+#whitespace#
+(SPACE_SEPARATOR) >SpaceToken;kind=space;
+(CONTROL) >SpaceToken;kind=control;
+
+#symbols#
+(MODIFIER_SYMBOL|MATH_SYMBOL|OTHER_SYMBOL) > Token;kind=symbol;
+CURRENCY_SYMBOL > Token;kind=symbol;symbolkind=currency;
+
+#punctuation#
+(DASH_PUNCTUATION|FORMAT) >Token;kind=punctuation;subkind=dashpunct;
+(CONNECTOR_PUNCTUATION|OTHER_PUNCTUATION)>Token;kind=punctuation;
+("START_PUNCTUATION"|"INITIAL_QUOTE_PUNCTUATION") >Token;kind=punctuation;position=startpunct;
+("END_PUNCTUATION"|"FINAL_QUOTE_PUNCTUATION") >Token;kind=punctuation;position=endpunct;
Modified: gate/trunk/plugins/Twitter/resources/twitie-en.xgapp
===================================================================
--- gate/trunk/plugins/Twitter/resources/twitie-en.xgapp 2014-03-18
13:04:42 UTC (rev 17690)
+++ gate/trunk/plugins/Twitter/resources/twitie-en.xgapp 2014-03-18
14:34:18 UTC (rev 17691)
@@ -235,28 +235,11 @@
</entry>
</localMap>
</runtimeParams>
- <resourceType>gate.creole.tokeniser.DefaultTokeniser</resourceType>
+ <resourceType>gate.twitter.tokenizer.TokenizerEN</resourceType>
<resourceName>Twitter Tokenizer (EN)</resourceName>
<initParams class="gate.util.persistence.MapPersistence">
<mapType>gate.util.SimpleFeatureMapImpl</mapType>
- <localMap>
- <entry>
- <string>encoding</string>
- <string>UTF-8</string>
- </entry>
- <entry>
- <string>tokeniserRulesURL</string>
- <gate.util.persistence.PersistenceManager-URLHolder>
-
<urlString>$relpath$../../ANNIE/resources/tokeniser/DefaultTokeniser.rules</urlString>
- </gate.util.persistence.PersistenceManager-URLHolder>
- </entry>
- <entry>
- <string>transducerGrammarURL</string>
- <gate.util.persistence.PersistenceManager-URLHolder>
-
<urlString>$relpath$tokeniser/twitter+English.jape</urlString>
- </gate.util.persistence.PersistenceManager-URLHolder>
- </entry>
- </localMap>
+ <localMap/>
</initParams>
<features class="gate.util.persistence.MapPersistence">
<mapType>gate.util.SimpleFeatureMapImpl</mapType>
Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
2014-03-18 13:04:42 UTC (rev 17690)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
2014-03-18 14:34:18 UTC (rev 17691)
@@ -63,7 +63,7 @@
// For now, we assume the streaming API format (concatenated maps, not
in a list)
List<Tweet> tweets = TweetUtils.readTweetStrings(lines, contentKeys,
featureKeys);
- int digits = (int) Math.ceil(Math.log10((double) tweets.size()));
+ int digits = (int) Math.ceil(Math.log10(tweets.size()));
int tweetCounter = 0;
Document document = newDocument(inputUrl, tweetCounter, digits);
StringBuilder content = new StringBuilder();
Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java
2014-03-18 13:04:42 UTC (rev 17690)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java
2014-03-18 14:34:18 UTC (rev 17691)
@@ -188,7 +188,7 @@
// value to a normal object (possibly FeatureMap)
return process(value);
}
- else if (value instanceof JsonNode){
+ else if (value != null){
// Found current key; keep digging for the rest
return dig(value, keySequence, index + 1);
}
Modified: gate/trunk/plugins/Twitter/src/gate/twitter/HashtagTokenizer.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/twitter/HashtagTokenizer.java
2014-03-18 13:04:42 UTC (rev 17690)
+++ gate/trunk/plugins/Twitter/src/gate/twitter/HashtagTokenizer.java
2014-03-18 14:34:18 UTC (rev 17691)
@@ -506,7 +506,7 @@
if(annotations == null || annotations.isEmpty()) return sorted;
sorted.addAll(annotations);
- Collections.sort(sorted, (Comparator<Annotation>)lengthComparator);
+ Collections.sort(sorted, lengthComparator);
// TODO filter out annotations with the same span
Modified: gate/trunk/plugins/Twitter/src/gate/twitter/Normaliser.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/twitter/Normaliser.java 2014-03-18
13:04:42 UTC (rev 17690)
+++ gate/trunk/plugins/Twitter/src/gate/twitter/Normaliser.java 2014-03-18
14:34:18 UTC (rev 17691)
@@ -239,7 +239,7 @@
new ArrayList<Annotation>(annotations.get());
boolean found = false;
for(int i = 0; i < tempList.size(); i++) {
- Annotation annotation = (Annotation)tempList.get(i);
+ Annotation annotation = tempList.get(i);
if(annotation.getStartNode().getOffset().intValue() == start &&
annotation.getEndNode().getOffset().intValue() == end) {
// this is the one
Added: gate/trunk/plugins/Twitter/src/gate/twitter/tokenizer/TokenizerEN.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/twitter/tokenizer/TokenizerEN.java
(rev 0)
+++ gate/trunk/plugins/Twitter/src/gate/twitter/tokenizer/TokenizerEN.java
2014-03-18 14:34:18 UTC (rev 17691)
@@ -0,0 +1,32 @@
+package gate.twitter.tokenizer;
+
+import gate.creole.metadata.CreoleParameter;
+import gate.creole.metadata.CreoleResource;
+import gate.creole.tokeniser.DefaultTokeniser;
+
+import java.net.URL;
+
+/**
+ * English Twitter tokeniser: a {@link DefaultTokeniser} whose transducer
+ * grammar parameter defaults to the Twitter-specific JAPE grammar, so that an
+ * application does not have to supply that URL explicitly.
+ */
+@CreoleResource(name = "Twitter Tokenizer (EN)")
+public class TokenizerEN extends DefaultTokeniser {
+
+  private static final long serialVersionUID = -8104798447326556796L;
+
+  /**
+   * Sets the transducer grammar URL by delegating to the superclass. The
+   * method is overridden only to re-declare the CREOLE parameter metadata
+   * with a Twitter-specific default value.
+   *
+   * @param url location of the JAPE grammar
+   */
+  @Override
+  @CreoleParameter(comment="The URL to the postprocessing transducer grammar",
+      suffixes="jape",
+      defaultValue = "resources/tokeniser/twitter+English.jape")
+  public void setTransducerGrammarURL(URL url) {
+    super.setTransducerGrammarURL(url);
+  }
+}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Learn Graph Databases - Download FREE O'Reilly Book
"Graph Databases" is the definitive new guide to graph databases and their
applications. Written by three acclaimed leaders in the field,
this first edition is now available. Download your free book today!
http://p.sf.net/sfu/13534_NeoTech
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs