http://www.mediawiki.org/wiki/Special:Code/MediaWiki/73299
Revision: 73299
Author: daniel
Date: 2010-09-18 20:03:48 +0000 (Sat, 18 Sep 2010)
Log Message:
-----------
detection of proper nouns
Modified Paths:
--------------
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/LanguageConfiguration_en.java
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
Added Paths:
-----------
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/TitleGlue_en.properties
Modified:
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java
===================================================================
---
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java
2010-09-18 19:17:39 UTC (rev 73298)
+++
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java
2010-09-18 20:03:48 UTC (rev 73299)
@@ -72,6 +72,18 @@
*/
public Pattern wordPartPattern;
+ /**
+ * A pattern matching parts of names, for detecting proper nouns. This
is usually
+ * set to match any sequence of letters that starts with an upper case
letter.
+ */
+ public Pattern namePartPattern;
+
+ /**
+ * A pattern matching words that may occur inside a proper noun
(name), but do not match
+ * namePartPattern. A common example for English would be "of", as in
"Marquess of Dorset".
+ */
+ public Pattern nameGluePattern;
+
protected String languageName;
/**
@@ -108,6 +120,8 @@
public void defaults() throws IOException {
if (this.wordPattern==null) this.wordPattern =
Pattern.compile("[\\p{L}']+(?:[\\p{Pc}\\p{Pd}][\\p{L}']+)*|\\p{Nd}+(?:.\\p{Nd}+)?");
if (this.wordPartPattern==null) this.wordPartPattern =
Pattern.compile("[\\p{L}]+|\\p{Nd}+");
+ if (this.namePartPattern==null) this.namePartPattern =
Pattern.compile("\\p{Lu}[-\\p{L}]+");
+ //this.nameGluePattern is null by default!
this.sentenceManglers.add( new
RegularExpressionMangler("\\s+\\(.*?\\)", "", 0) ); //strip parentacized blocks
this.sentenceManglers.add( new
RegularExpressionMangler("^([^\\p{L}]*(\\r\\n|\\r|\\n))+[^\\p{L}0-9]*\\s*", "",
0) ); //strip leading cruft (lines without any characters)
@@ -136,6 +150,8 @@
if (with.wordPattern!=null) wordPattern = with.wordPattern;
if (with.wordPartPattern!=null) wordPartPattern =
with.wordPartPattern;
+ if (with.namePartPattern!=null) namePartPattern =
with.namePartPattern;
+ if (with.nameGluePattern!=null) nameGluePattern =
with.nameGluePattern;
if (with.phraseBreakerPattern!=null) phraseBreakerPattern =
with.phraseBreakerPattern;
if (with.stopwords!=null) stopwords.addAll(with.stopwords);
Modified:
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
===================================================================
---
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
2010-09-18 19:17:39 UTC (rev 73298)
+++
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
2010-09-18 20:03:48 UTC (rev 73299)
@@ -32,6 +32,8 @@
private Matcher sentenceFollowGlueMatcher;
private Matcher wordMatcher;
private Matcher wordPartMatcher;
+ private Matcher namePartMatcher;
+ private Matcher nameGlueMatcher;
protected Filter<String> stopwordFilter;
protected Matcher phraseBreakeMatcher;
@@ -48,28 +50,46 @@
public static PlainTextAnalyzer getPlainTextAnalyzer(Corpus corpus,
TweakSet tweaks) throws InstantiationException {
Class[] acc = getSpecializedClasses(corpus,
PlainTextAnalyzer.class, "PlainTextAnalyzer");
+
+ try {
+ Constructor<PlainTextAnalyzer> ctor =
acc[0].getConstructor(new Class[] { Corpus.class });
+ PlainTextAnalyzer analyzer = ctor.newInstance(new
Object[] { corpus } );
+
+ analyzer.configure(corpus, tweaks);
+ return analyzer;
+ } catch (SecurityException e) {
+ throw (InstantiationException)new
InstantiationException().initCause(e);
+ } catch (IllegalArgumentException e) {
+ throw (InstantiationException)new
InstantiationException().initCause(e);
+ } catch (NoSuchMethodException e) {
+ throw (InstantiationException)new
InstantiationException().initCause(e);
+ } catch (InvocationTargetException e) {
+ throw (InstantiationException)new
InstantiationException().initCause(e);
+ } catch (IllegalAccessException e) {
+ throw (InstantiationException)new
InstantiationException().initCause(e);
+ }
+ }
+
+ protected void configure(Corpus corpus, TweakSet tweaks) throws
InstantiationException {
Class[] ccc = getSpecializedClasses(corpus,
LanguageConfiguration.class, "LanguageConfiguration",
corpus.getConfigPackages());
try {
- Constructor ctor = acc[0].getConstructor(new Class[] {
Corpus.class });
- PlainTextAnalyzer analyzer =
(PlainTextAnalyzer)ctor.newInstance(new Object[] { corpus } );
-
for (int i = ccc.length-1; i >= 0; i--) { //NOTE: most
specific last, because last write wins.
LanguageConfiguration conf ;
-
+ Constructor<LanguageConfiguration> ctor;
+
try {
ctor = ccc[i].getConstructor(new
Class[] { });
- conf =
(LanguageConfiguration)ctor.newInstance(new Object[] { } );
+ conf = ctor.newInstance(new Object[] {
} );
}
catch (NoSuchMethodException ex) {
ctor = ccc[i].getConstructor(new
Class[] { String.class });
- conf =
(LanguageConfiguration)ctor.newInstance(new Object[] { corpus.getLanguage() } );
+ conf = ctor.newInstance(new Object[] {
corpus.getLanguage() } );
}
- analyzer.configure(conf, tweaks);
+ this.configure(conf, tweaks);
}
- return analyzer;
} catch (SecurityException e) {
throw (InstantiationException)new
InstantiationException().initCause(e);
} catch (IllegalArgumentException e) {
@@ -96,6 +116,8 @@
sentenceFollowGlueMatcher =
config.sentenceFollowGluePattern.matcher("");
wordMatcher = config.wordPattern.matcher("");
wordPartMatcher = config.wordPartPattern.matcher("");
+ namePartMatcher = config.namePartPattern.matcher("");
+ nameGlueMatcher = config.nameGluePattern == null ? null :
config.nameGluePattern.matcher("");
phraseBreakeMatcher = config.phraseBreakerPattern.matcher("");
stopwordFilter = new FixedSetFilter<String>(config.stopwords);
@@ -109,15 +131,20 @@
* @return
*/
public CharSequence extractFirstSentence(CharSequence text) {
- return extractNextSentence(text, null, true);
+ if (text==null || text.length()==0) return "";
+
+ text = applyManglers(config.sentenceManglers, text);
+ if (text.length()==0) return "";
+
+ CharSequence s = scanNextSentence(text, null);
+ s = AnalyzerUtils.trim(s);
+
+ return s;
}
- public CharSequence extractNextSentence(CharSequence text,
ParsePosition position, boolean mangle) {
+ public CharSequence scanNextSentence(CharSequence text, ParsePosition
position) {
if (text==null || text.length()==0) return "";
- if (mangle) text = applyManglers(config.sentenceManglers, text);
- if (text.length()==0) return "";
-
sentenceMatcher.reset(text);
sentenceTailGlueMatcher.reset(text);
sentenceFollowGlueMatcher.reset(text);
@@ -132,13 +159,14 @@
sentenceFollowGlueMatcher.region(ofs, text.length());
}
- StringBuilder s = new StringBuilder();
+ StringBuilder s = null;
boolean add = false;
while (sentenceMatcher.find()) {
int start = ofs;
ofs = sentenceMatcher.end();
if (position!=null) position.setIndex(ofs);
+ if (s==null) s = new StringBuilder();
s.append(text, start, sentenceMatcher.end());
if (sentenceMatcher.group(1)!=null) {
@@ -168,10 +196,16 @@
if (position!=null) position.setIndex(ofs);
}
- if (ofs!=0) text = AnalyzerUtils.trim(s);
- else {
- if (position!=null) position.setIndex(text.length());
- AnalyzerUtils.trim(text);
+ if (s!=null) {
+ text = s;
+ } else {
+ int end= text.length();
+
+ if (position!=null && position.getIndex()>0) {
+ text = text.subSequence(position.getIndex(),
end);
+ }
+
+ if (position!=null) position.setIndex(end);
}
return text;
@@ -189,6 +223,22 @@
return words;
}
+ public List<PhraseOccurance> extractWordOccurrance(CharSequence s) {
+ ArrayList<PhraseOccurance> words = new
ArrayList<PhraseOccurance>();
+
+ wordMatcher.reset(s);
+ while (wordMatcher.find()) {
+ int g;
+ if (wordMatcher.groupCount()>0) g = 1;
+ else g = 0;
+
+ PhraseOccurance w = new
PhraseOccurance(wordMatcher.group(g), 1, wordMatcher.start(g),
wordMatcher.end(g) - wordMatcher.start(g));
+ words.add(w);
+ }
+
+ return words;
+ }
+
public Corpus getCorpus() {
return corpus;
}
@@ -271,15 +321,128 @@
}
} */
+ public PhraseOccuranceSet extractNames(CharSequence text, int
maxWeight) {
+ text = applyManglers(config.sentenceManglers, text);
+ PhraseOccuranceSet names= new
PhraseOccuranceSet(text.toString(), new ArrayList<PhraseOccurance>());
+
+ ParsePosition pos = new ParsePosition(0);
+ while (pos.getIndex() < text.length()) {
+ int ofs = pos.getIndex();
+ CharSequence s = scanNextSentence(text, pos);
+ if (s==null || s.length()==0) break;
+
+ extractNamesFromSentence(s, ofs, names, maxWeight);
+ }
+
+ names.prune(1);
+ return names;
+ }
+
+ private void extractNamesFromSentence(CharSequence s, int ofs,
PhraseOccuranceSet names, int maxWeight) {
+ StringBuilder n = new StringBuilder();
+ StringBuilder glue = new StringBuilder();
+
+ int start = -1;
+ int weight = 0;
+ int pos = 0;
+
+ wordMatcher.reset(s);
+ while (wordMatcher.find()) {
+ int g;
+ if (wordMatcher.groupCount()>0) g = 1;
+ else g = 0;
+
+ String w = wordMatcher.group(g);
+ int i = wordMatcher.start(g);
+ CharSequence space = s.subSequence(pos, i);
+
+ if (start>=0) { //in name
+ boolean doEndName = false;
+
+ if ( weight >= maxWeight ) {
+ doEndName = true;
+ } else {
+
phraseBreakeMatcher.reset(space);
+
+ if
(phraseBreakeMatcher.matches()) { // phrase break encountered, end name
+ doEndName = true;
+ }
+ }
+
+ if ( doEndName ) {
+ names.add( new
PhraseOccurance(n.toString(), weight, start, n.length()));
+
+ n.setLength(0);
+ start = -1;
+ weight = 0;
+
+ glue.setLength(0);
+ }
+ }
+
+ namePartMatcher.reset(w);
+
+ if (namePartMatcher.matches()) { // looks like a name
part (capitalized word)
+ boolean doConcat;
+ if (start>=0) { //already in a name
+ doConcat = true;
+ } else { //new name starts
+ doConcat = true;
+
+ if ( i == 0 && stopwordFilter!=null ) {
//start of sentence, other stuff also gets capitalized. filter at least stop
words.
+ String t = w.toLowerCase();
+ if ( stopwordFilter.matches(t)
) {
+ doConcat = false;
+ }
+ }
+ }
+
+ if ( doConcat ) {
+ if (start<0) { // start new name
+ start = i;
+ } else { // in name, process glue
+ n.append(glue);
+ n.append(space);
+ }
+
+ n.append(w);
+ weight += 1;
+
+ glue.setLength(0);
+ }
+ } else if (start>=0) { //not a name part, but already
in a name
+ if (nameGlueMatcher!=null)
nameGlueMatcher.reset(w);
+
+ if (nameGlueMatcher!=null &&
nameGlueMatcher.matches()) { // is glue word
+ glue.append(space);
+ glue.append(w);
+ } else { //name finished
+ names.add( new
PhraseOccurance(n.toString(), weight, ofs + start, n.length()));
+
+ n.setLength(0);
+ start = -1;
+ weight = 0;
+
+ glue.setLength(0);
+ }
+ }
+
+ pos = i + w.length();
+ }
+
+ if (start>=0) { //name finished
+ names.add( new PhraseOccurance(n.toString(), weight,
ofs + start, n.length()));
+ }
+ }
+
public PhraseOccuranceSet extractPhrases(CharSequence text, int
maxWeight, int maxDepth) {
+ text = applyManglers(config.sentenceManglers, text);
PhraseOccuranceSet phrases = new
PhraseOccuranceSet(text.toString(), new ArrayList<PhraseOccurance>());
- text = applyManglers(config.sentenceManglers, text);
-
ParsePosition pos = new ParsePosition(0);
while (pos.getIndex() < text.length()) {
int ofs = pos.getIndex();
- CharSequence s = extractNextSentence(text, pos, false);
+ CharSequence s = scanNextSentence(text, pos);
if (s==null || s.length()==0) break;
buildPhrases(s, ofs, phrases, maxWeight);
Modified:
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/LanguageConfiguration_en.java
===================================================================
---
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/LanguageConfiguration_en.java
2010-09-18 19:17:39 UTC (rev 73298)
+++
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/LanguageConfiguration_en.java
2010-09-18 20:03:48 UTC (rev 73299)
@@ -25,6 +25,8 @@
"\\p{L}(?:\\.\\p{L})+|\\p{L}*\\p{Lu}"+ //XXX: gives a lot of
false positives!
")$"
);
+
+ this.nameGluePattern = Pattern.compile("of|on|in|the"); //
common non-capitalized components of proper nouns
}
}
Added:
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/TitleGlue_en.properties
===================================================================
---
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/TitleGlue_en.properties
(rev 0)
+++
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/TitleGlue_en.properties
2010-09-18 20:03:48 UTC (rev 73299)
@@ -0,0 +1,216 @@
+all
+another
+any
+anybody
+anyone
+anything
+both
+each
+each
+other
+either
+everybody
+everyone
+everything
+few
+he
+her
+hers
+herself
+him
+himself
+his
+it
+its
+itself
+little
+many
+me
+mine
+more
+most
+much
+myself
+neither
+no
+one
+nobody
+none
+nothing
+one
+one
+other
+others
+ours
+ourselves
+several
+she
+some
+somebody
+someone
+something
+that
+theirs
+them
+themselves
+these
+they
+this
+those
+us
+we
+what
+whatever
+which
+whichever
+who
+whoever
+whom
+whomever
+whose
+you
+yours
+yourself
+yourselves
+
+aboard
+about
+above
+absent
+across
+after
+against
+along
+alongside
+amid
+amidst
+among
+amongst
+around
+as
+aside
+astride
+at
+athwart
+atop
+barring
+before
+behind
+below
+beneath
+beside
+besides
+between
+betwixt
+beyond
+but
+by
+circa
+concerning
+despite
+down
+during
+except
+excluding
+failing
+following
+for
+from
+given
+in
+including
+inside
+into
+like
+mid
+midst
+minus
+near
+next
+notwithstanding
+of
+off
+on
+onto
+opposite
+out
+outside
+over
+pace
+past
+per
+plus
+pro
+qua
+regarding
+round
+save
+since
+than
+through
+throughout
+till
+times
+to
+toward
+towards
+under
+underneath
+unlike
+until
+up
+upon
+versus
+vs.
+via
+vice
+with
+within
+without
+worth
+
+for
+and
+no
+nor
+not
+but
+or
+yet
+either
+neither
+whether
+both
+only
+
+and
+so
+be
+am
+are
+is
+been
+has
+have
+had
+will
+won't
+arn't
+was
+were
+shall
+would
+
+ago
+apart
+aside
+away
+hence
+notwithstanding
+on
+through
+withal
+
+the
+a
+an
\ No newline at end of file
Modified:
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
===================================================================
---
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
2010-09-18 19:17:39 UTC (rev 73298)
+++
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
2010-09-18 20:03:48 UTC (rev 73299)
@@ -2,6 +2,7 @@
import java.io.IOException;
import java.net.URISyntaxException;
+import java.text.ParsePosition;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -32,14 +33,52 @@
}
public void testExtractFirstSentence() {
- String text = "Foo (also abc. cde.) is the 2. Quux in
Xyzzy. Its not a Barf.\n";
+ String text = "Hello World";
CharSequence s = extractFirstSentence(text);
+ assertEquals("simple sentence", "Hello World",
s.toString());
+
+ //-------------------
+ text = "Hello World.";
+
+ s = extractFirstSentence(text);
+ assertEquals("simple sentence", "Hello World.",
s.toString());
+
+ //-------------------
+ text = "Hello World.\n";
+
+ s = extractFirstSentence(text);
+ assertEquals("simple sentence", "Hello World.",
s.toString());
+
+ //-------------------
+ text = "Foo (also abc. cde.) is the 2. Quux in Xyzzy.
Its not a Barf.\n";
+
+ s = extractFirstSentence(text);
assertEquals("simple sentence", "Foo is the 2. Quux in
Xyzzy.", s.toString());
//TODO: all the nasty stuff...
}
+ public void testScanNextSentence() {
+ String text = "Hello World";
+
+ ParsePosition pos = new ParsePosition(0);
+ CharSequence s = scanNextSentence(text, pos);
+ assertEquals("Hello World", s.toString());
+
+ // ----------------------
+ text = "He's John Doe, I'm Jane. Alex is not here.";
+
+ pos = new ParsePosition(0);
+ s = scanNextSentence(text, pos);
+ assertEquals("He's John Doe, I'm Jane. ", s.toString());
+
+ s = scanNextSentence(text, pos);
+ assertEquals("Alex is not here.", s.toString());
+
+ //TODO: all the nasty stuff...
+ }
+
public void testExtractWords() {
List<String> words = extractWords("");
assertEquals(theList(), words);
@@ -72,6 +111,36 @@
assertEquals(theList( "23", "foo", "42" ), words);
}
+ public void testExtractNames() {
+ PhraseOccuranceSet names = extractNames("", 2);
+ assertEquals(0, names.size());
+ assertEquals(theList(), getWordList(names));
+
+ names = extractNames("foo", 2);
+ assertEquals(theList(), getWordList(names));
+
+ names = extractNames("Foo", 2);
+ assertEquals(theList("Foo"), getWordList(names));
+
+ names = extractNames("The", 2);
+ assertEquals(theList(), getWordList(names));
+
+ names = extractNames("The Foo", 2);
+ assertEquals(theList("Foo"), getWordList(names));
+
+ names = extractNames("The Foo Bar Bear", 2);
+ assertEquals(theList("Foo Bar", "Bear"),
getWordList(names));
+
+ names = extractNames("meet the Foo of Bar on tuesday",
2);
+ assertEquals(theList("Foo of Bar"), getWordList(names));
+
+ names = extractNames("He's John Doe, I'm Jane. Alex is
not here.", 2);
+ assertEquals(theList("John Doe", "Jane", "Alex"),
getWordList(names));
+
+ names = extractNames("Anne-Catrin Drinkwater, Joe;
Jane.", 3);
+ assertEquals(theList("Anne-Catrin Drinkwater", "Joe",
"Jane"), getWordList(names));
+ }
+
public void testExtractPhrases() {
PhraseOccuranceSet phrases = extractPhrases("", 3, 3);
assertEquals(0, phrases.size());
@@ -171,12 +240,9 @@
protected TestPlainTextAnalyzer testAnalyzer;
@Override
- public void setUp() throws URISyntaxException, IOException {
- LanguageConfiguration config = new LanguageConfiguration();
-
- //corpus = new Corpus("TEST", "en", "en", "en", "en", "en",
"en", null);
+ public void setUp() throws URISyntaxException, IOException,
InstantiationException {
testAnalyzer = new TestPlainTextAnalyzer(corpus);
- testAnalyzer.configure(config, tweaks);
+ testAnalyzer.configure(corpus, tweaks);
testAnalyzer.initialize();
analyzer = testAnalyzer;
@@ -186,10 +252,18 @@
testAnalyzer.testExtractFirstSentence();
}
+ public void testScanNextSentence() {
+ testAnalyzer.testScanNextSentence();
+ }
+
public void testExtractWords() {
testAnalyzer.testExtractWords();
}
+ public void testExtractNames() {
+ testAnalyzer.testExtractNames();
+ }
+
public void testExtractPhrases() {
testAnalyzer.testExtractPhrases();
}
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs