http://www.mediawiki.org/wiki/Special:Code/MediaWiki/73299

Revision: 73299
Author:   daniel
Date:     2010-09-18 20:03:48 +0000 (Sat, 18 Sep 2010)

Log Message:
-----------
detection of proper nouns

Modified Paths:
--------------
    
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java
    
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
    
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/LanguageConfiguration_en.java
    
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java

Added Paths:
-----------
    
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/TitleGlue_en.properties

Modified: 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java
===================================================================
--- 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java
     2010-09-18 19:17:39 UTC (rev 73298)
+++ 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/LanguageConfiguration.java
     2010-09-18 20:03:48 UTC (rev 73299)
@@ -72,6 +72,18 @@
         */
        public Pattern wordPartPattern;
 
+       /**
+        * A pattern matching parts of names, for detecting proper nouns. This 
is usually
+        * set to match any sequence of letters that starts with an upper case 
letter. 
+        */
+       public Pattern namePartPattern;
+
+       /**
+        * A pattern matching words that may occur inside a proper noun 
(name), but do not match
+        * namePartPattern. A common example for English would be "of", as in 
"Marquess of Dorset".
+        */
+       public Pattern nameGluePattern;
+
        protected String languageName;
 
        /**
@@ -108,6 +120,8 @@
        public void defaults() throws IOException {
                if (this.wordPattern==null) this.wordPattern = 
Pattern.compile("[\\p{L}']+(?:[\\p{Pc}\\p{Pd}][\\p{L}']+)*|\\p{Nd}+(?:.\\p{Nd}+)?");
 
                if (this.wordPartPattern==null) this.wordPartPattern = 
Pattern.compile("[\\p{L}]+|\\p{Nd}+"); 
+               if (this.namePartPattern==null) this.namePartPattern = 
Pattern.compile("\\p{Lu}[-\\p{L}]+"); 
+               //this.nameGluePattern is null per default! 
 
                this.sentenceManglers.add( new 
RegularExpressionMangler("\\s+\\(.*?\\)", "", 0) ); //strip parentacized blocks 
                this.sentenceManglers.add( new 
RegularExpressionMangler("^([^\\p{L}]*(\\r\\n|\\r|\\n))+[^\\p{L}0-9]*\\s*", "", 
0) ); //strip leading cruft (lines without any characters)
@@ -136,6 +150,8 @@
 
                if (with.wordPattern!=null) wordPattern = with.wordPattern;
                if (with.wordPartPattern!=null) wordPartPattern = 
with.wordPartPattern;
+               if (with.namePartPattern!=null) namePartPattern = 
with.namePartPattern;
+               if (with.nameGluePattern!=null) nameGluePattern = 
with.nameGluePattern;
                if (with.phraseBreakerPattern!=null) phraseBreakerPattern = 
with.phraseBreakerPattern;
                
                if (with.stopwords!=null) stopwords.addAll(with.stopwords);

Modified: 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
===================================================================
--- 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
 2010-09-18 19:17:39 UTC (rev 73298)
+++ 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
 2010-09-18 20:03:48 UTC (rev 73299)
@@ -32,6 +32,8 @@
        private Matcher sentenceFollowGlueMatcher;
        private Matcher wordMatcher;
        private Matcher wordPartMatcher;
+       private Matcher namePartMatcher;
+       private Matcher nameGlueMatcher;
 
        protected Filter<String> stopwordFilter;
        protected Matcher phraseBreakeMatcher;
@@ -48,28 +50,46 @@
        
        public static PlainTextAnalyzer getPlainTextAnalyzer(Corpus corpus, 
TweakSet tweaks) throws InstantiationException {
                Class[] acc = getSpecializedClasses(corpus, 
PlainTextAnalyzer.class, "PlainTextAnalyzer");
+               
+               try {
+                       Constructor<PlainTextAnalyzer> ctor = 
acc[0].getConstructor(new Class[] { Corpus.class });
+                       PlainTextAnalyzer analyzer = ctor.newInstance(new 
Object[] { corpus } );
+       
+                       analyzer.configure(corpus, tweaks);
+                       return analyzer;
+               } catch (SecurityException e) {
+                       throw (InstantiationException)new 
InstantiationException().initCause(e);
+               } catch (IllegalArgumentException e) {
+                       throw (InstantiationException)new 
InstantiationException().initCause(e);
+               } catch (NoSuchMethodException e) {
+                       throw (InstantiationException)new 
InstantiationException().initCause(e);
+               } catch (InvocationTargetException e) {
+                       throw (InstantiationException)new 
InstantiationException().initCause(e);
+               } catch (IllegalAccessException e) {
+                       throw (InstantiationException)new 
InstantiationException().initCause(e);
+               }
+       }
+       
+       protected void  configure(Corpus corpus, TweakSet tweaks) throws 
InstantiationException {
                Class[] ccc = getSpecializedClasses(corpus, 
LanguageConfiguration.class, "LanguageConfiguration", 
corpus.getConfigPackages());
                
                try {
-                       Constructor ctor = acc[0].getConstructor(new Class[] { 
Corpus.class });
-                       PlainTextAnalyzer analyzer = 
(PlainTextAnalyzer)ctor.newInstance(new Object[] { corpus } );
-                       
                        for (int i = ccc.length-1; i >= 0; i--) { //NOTE: most 
specific last, because last write wins.
                                LanguageConfiguration conf ;
-                       
+                               Constructor<LanguageConfiguration> ctor;
+                               
                                try {
                                        ctor = ccc[i].getConstructor(new 
Class[] { });
-                                       conf = 
(LanguageConfiguration)ctor.newInstance(new Object[] { } );
+                                       conf = ctor.newInstance(new Object[] { 
} );
                                } 
                                catch (NoSuchMethodException ex) {
                                        ctor = ccc[i].getConstructor(new 
Class[] { String.class });
-                                       conf = 
(LanguageConfiguration)ctor.newInstance(new Object[] { corpus.getLanguage() } );
+                                       conf = ctor.newInstance(new Object[] { 
corpus.getLanguage() } );
                                }
                                
-                               analyzer.configure(conf, tweaks);
+                               this.configure(conf, tweaks);
                        }
                        
-                       return analyzer;
                } catch (SecurityException e) {
                        throw (InstantiationException)new 
InstantiationException().initCause(e);
                } catch (IllegalArgumentException e) {
@@ -96,6 +116,8 @@
                sentenceFollowGlueMatcher = 
config.sentenceFollowGluePattern.matcher("");
                wordMatcher = config.wordPattern.matcher("");
                wordPartMatcher = config.wordPartPattern.matcher("");
+               namePartMatcher = config.namePartPattern.matcher("");
+               nameGlueMatcher = config.nameGluePattern == null ? null : 
config.nameGluePattern.matcher("");
                
                phraseBreakeMatcher = config.phraseBreakerPattern.matcher("");
                stopwordFilter = new FixedSetFilter<String>(config.stopwords);
@@ -109,15 +131,20 @@
         * @return
         */
        public CharSequence extractFirstSentence(CharSequence text) {
-               return extractNextSentence(text, null, true);
+               if (text==null || text.length()==0) return "";
+               
+               text = applyManglers(config.sentenceManglers, text);
+               if (text.length()==0) return "";
+
+               CharSequence s = scanNextSentence(text, null);
+               s = AnalyzerUtils.trim(s);
+
+               return s;
        }
        
-       public CharSequence extractNextSentence(CharSequence text, 
ParsePosition position, boolean mangle) {
+       public CharSequence scanNextSentence(CharSequence text, ParsePosition 
position) {
                if (text==null || text.length()==0) return "";
                
-               if (mangle) text = applyManglers(config.sentenceManglers, text);
-               if (text.length()==0) return "";
-
                sentenceMatcher.reset(text);
                sentenceTailGlueMatcher.reset(text);
                sentenceFollowGlueMatcher.reset(text);
@@ -132,13 +159,14 @@
                        sentenceFollowGlueMatcher.region(ofs, text.length());
                }
                
-               StringBuilder s = new StringBuilder();
+               StringBuilder s = null;
                boolean add = false;
                while (sentenceMatcher.find()) {
                        int start = ofs;
                        ofs = sentenceMatcher.end();
                        if (position!=null) position.setIndex(ofs);
                        
+                       if (s==null) s = new StringBuilder();
                        s.append(text, start, sentenceMatcher.end());
                        
                        if (sentenceMatcher.group(1)!=null) {
@@ -168,10 +196,16 @@
                        if (position!=null) position.setIndex(ofs);
                }
                
-               if (ofs!=0) text = AnalyzerUtils.trim(s);
-               else {
-                       if (position!=null) position.setIndex(text.length());
-                       AnalyzerUtils.trim(text);
+               if (s!=null) {
+                       text = s;
+               } else {
+                       int end= text.length();
+                       
+                       if (position!=null && position.getIndex()>0) {
+                               text = text.subSequence(position.getIndex(), 
end);  
+                       } 
+
+                       if (position!=null) position.setIndex(end);
                }
                
                return text;
@@ -189,6 +223,22 @@
                return words;
        }
 
+       /**
+        * Finds every match of the configured word pattern in the given text and
+        * returns one PhraseOccurance per match, in order of appearance.
+        * If the word pattern declares capturing groups, group 1 is used as the word;
+        * otherwise the entire match (group 0) is used. Each occurrence is created with
+        * the matched string, the constant 1 (presumably the weight, as in
+        * extractNamesFromSentence), the start index of the match within s, and its length.
+        *
+        * @param s the text to scan for words
+        * @return the list of word occurrences found; empty if nothing matched
+        */
+       public List<PhraseOccurance> extractWordOccurrance(CharSequence s) {
+               ArrayList<PhraseOccurance> words = new 
ArrayList<PhraseOccurance>();
+               
+               wordMatcher.reset(s); 
+               while (wordMatcher.find()) {
+                       // pick the capturing group if the pattern has one, else the whole match
+                       int g;
+                       if (wordMatcher.groupCount()>0) g = 1;
+                       else g = 0;
+                       
+                       PhraseOccurance w = new 
PhraseOccurance(wordMatcher.group(g), 1, wordMatcher.start(g), 
wordMatcher.end(g) - wordMatcher.start(g));
+                       words.add(w);
+               }
+               
+               return words;
+       }
+
        public Corpus getCorpus() {
                return corpus;
        }
@@ -271,15 +321,128 @@
                }
        } */
        
+       /**
+        * Extracts proper nouns (names) from the given text.
+        * The text is first run through the configured sentence manglers, then scanned
+        * sentence by sentence using scanNextSentence; each sentence is passed to
+        * extractNamesFromSentence together with its start offset in the mangled text.
+        * Offsets are therefore relative to the mangled text, which is also the text
+        * the returned set is constructed with.
+        *
+        * @param text the raw text to analyze
+        * @param maxWeight upper limit for the number of name parts in a single name
+        * @return the set of names found, after calling prune(1) on it
+        *         (NOTE(review): exact pruning semantics are defined by PhraseOccuranceSet; confirm there)
+        */
+       public PhraseOccuranceSet extractNames(CharSequence text, int 
maxWeight) {
+               text = applyManglers(config.sentenceManglers, text);
+               PhraseOccuranceSet names= new 
PhraseOccuranceSet(text.toString(), new ArrayList<PhraseOccurance>());
+               
+               ParsePosition pos = new ParsePosition(0);
+               while (pos.getIndex() < text.length()) {
+                       // remember where this sentence starts before the scanner advances pos
+                       int ofs = pos.getIndex();
+                       CharSequence s = scanNextSentence(text, pos);
+                       if (s==null || s.length()==0) break;
+                       
+                       extractNamesFromSentence(s, ofs, names, maxWeight);
+               }
+
+               names.prune(1);
+               return names;
+       }
+       
+       /**
+        * Scans a single sentence for proper nouns and adds each one found to
+        * <code>names</code>. A name is a run of words matching namePartMatcher,
+        * optionally joined by words matching nameGlueMatcher (if one is configured).
+        * A name ends when a word is neither a name part nor glue, when the text between
+        * two words matches phraseBreakeMatcher, or when the name has already
+        * accumulated maxWeight name parts. Trailing glue that is not followed by
+        * another name part is discarded.
+        *
+        * @param s the sentence to scan
+        * @param ofs offset of the sentence within the full text; added to the in-sentence
+        *        start index so that recorded occurrences are relative to the full text
+        * @param names the set that receives the detected names
+        * @param maxWeight maximum number of name parts per name (glue words are not counted)
+        */
+       private void extractNamesFromSentence(CharSequence s, int ofs, PhraseOccuranceSet names, int maxWeight) {
+               StringBuilder n = new StringBuilder();    // name collected so far
+               StringBuilder glue = new StringBuilder(); // pending glue words, kept only if another name part follows
+
+               int start = -1; // start index of the current name within s, or -1 if not in a name
+               int weight = 0; // number of name parts in the current name
+               int pos = 0;    // end index of the previous word within s
+
+               wordMatcher.reset(s);
+               while (wordMatcher.find()) {
+                       // pick the capturing group if the pattern has one, else the whole match
+                       int g;
+                       if (wordMatcher.groupCount()>0) g = 1;
+                       else g = 0;
+
+                       String w = wordMatcher.group(g);
+                       int i = wordMatcher.start(g);
+                       CharSequence space = s.subSequence(pos, i); // text between the previous word and this one
+
+                       if (start>=0) { //in name
+                               boolean doEndName = false;
+
+                               if ( weight >= maxWeight ) { // name already has the maximum number of parts
+                                       doEndName = true;
+                               } else {
+                                       phraseBreakeMatcher.reset(space);
+
+                                       if (phraseBreakeMatcher.matches()) { // phrase break encountered, end name
+                                               doEndName = true;
+                                       }
+                               }
+
+                               if ( doEndName ) {
+                                       // record the position relative to the full text (ofs + start),
+                                       // consistent with the other places in this method that finish a name
+                                       names.add( new PhraseOccurance(n.toString(), weight, ofs + start, n.length()));
+
+                                       n.setLength(0);
+                                       start = -1;
+                                       weight = 0;
+
+                                       glue.setLength(0);
+                               }
+                       }
+
+                       namePartMatcher.reset(w);
+
+                       if (namePartMatcher.matches()) { // looks like a name part (capitalized word)
+                               boolean doConcat;
+                               if (start>=0) { //already in a name
+                                       doConcat = true;
+                               } else { //new name starts
+                                       doConcat = true;
+
+                                       if ( i == 0 && stopwordFilter!=null ) { //start of sentence, other stuff also gets capitalized. filter at least stop words.
+                                               String t = w.toLowerCase();
+                                               if ( stopwordFilter.matches(t) ) {
+                                                       doConcat = false;
+                                               }
+                                       }
+                               }
+
+                               if ( doConcat ) {
+                                       if (start<0) { // start new name
+                                               start = i;
+                                       } else { // in name, process glue
+                                               n.append(glue);
+                                               n.append(space);
+                                       }
+
+                                       n.append(w);
+                                       weight += 1;
+
+                                       glue.setLength(0);
+                               }
+                       } else if (start>=0) { //not a name part, but already in a name
+                               if (nameGlueMatcher!=null) nameGlueMatcher.reset(w);
+
+                               if (nameGlueMatcher!=null && nameGlueMatcher.matches()) { // is glue word
+                                       glue.append(space);
+                                       glue.append(w);
+                               } else { //name finished
+                                       names.add( new PhraseOccurance(n.toString(), weight, ofs + start, n.length()));
+
+                                       n.setLength(0);
+                                       start = -1;
+                                       weight = 0;
+
+                                       glue.setLength(0);
+                               }
+                       }
+
+                       pos = i + w.length();
+               }
+
+               if (start>=0) { //name finished
+                       names.add( new PhraseOccurance(n.toString(), weight, ofs + start, n.length()));
+               }
+       }
+
        public PhraseOccuranceSet extractPhrases(CharSequence text, int 
maxWeight, int maxDepth) {
+               text = applyManglers(config.sentenceManglers, text);
                PhraseOccuranceSet phrases = new 
PhraseOccuranceSet(text.toString(), new ArrayList<PhraseOccurance>());
                
-               text = applyManglers(config.sentenceManglers, text);
-               
                ParsePosition pos = new ParsePosition(0);
                while (pos.getIndex() < text.length()) {
                        int ofs = pos.getIndex();
-                       CharSequence s = extractNextSentence(text, pos, false);
+                       CharSequence s = scanNextSentence(text, pos);
                        if (s==null || s.length()==0) break;
                        
                        buildPhrases(s, ofs, phrases, maxWeight);

Modified: 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/LanguageConfiguration_en.java
===================================================================
--- 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/LanguageConfiguration_en.java
     2010-09-18 19:17:39 UTC (rev 73298)
+++ 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/LanguageConfiguration_en.java
     2010-09-18 20:03:48 UTC (rev 73299)
@@ -25,6 +25,8 @@
                 "\\p{L}(?:\\.\\p{L})+|\\p{L}*\\p{Lu}"+ //XXX: gives a lot of 
false positives!
                 ")$"
                );
+               
+               this.nameGluePattern = Pattern.compile("of|on|in|the"); // 
common non-capitalized components of proper nouns
        }
 
 }

Added: 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/TitleGlue_en.properties
===================================================================
--- 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/TitleGlue_en.properties
                           (rev 0)
+++ 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/wikis/TitleGlue_en.properties
   2010-09-18 20:03:48 UTC (rev 73299)
@@ -0,0 +1,216 @@
+all
+another
+any
+anybody
+anyone
+anything
+both
+each
+each
+other
+either
+everybody
+everyone
+everything
+few
+he
+her
+hers
+herself
+him
+himself
+his
+it
+its
+itself
+little
+many
+me
+mine
+more
+most
+much
+myself
+neither
+no
+one
+nobody
+none
+nothing
+one
+one
+other
+others
+ours
+ourselves
+several
+she
+some
+somebody
+someone
+something
+that
+theirs
+them
+themselves
+these
+they
+this
+those
+us
+we
+what
+whatever
+which
+whichever
+who
+whoever
+whom
+whomever
+whose
+you
+yours
+yourself
+yourselves
+
+aboard
+about
+above
+absent
+across
+after
+against
+along
+alongside
+amid
+amidst
+among
+amongst
+around
+as
+aside
+astride
+at
+athwart
+atop
+barring
+before
+behind
+below
+beneath
+beside
+besides
+between
+betwixt
+beyond
+but
+by
+circa
+concerning
+despite
+down
+during
+except
+excluding
+failing
+following
+for
+from
+given
+in
+including
+inside
+into
+like
+mid
+midst
+minus
+near
+next
+notwithstanding
+of
+off
+on
+onto
+opposite
+out
+outside
+over
+pace
+past
+per
+plus
+pro
+qua
+regarding
+round
+save
+since
+than
+through
+throughout
+till
+times
+to
+toward
+towards
+under
+underneath
+unlike
+until
+up
+upon
+versus
+vs.
+via
+vice
+with
+within
+without
+worth
+
+for
+and
+no
+nor
+not
+but
+or
+yet
+either
+neither
+whether
+both
+only
+
+and
+so
+be
+am
+are
+is
+been
+has
+have
+had
+will
+won't
+aren't
+was
+were
+shall
+would
+
+ago
+apart
+aside
+away
+hence
+notwithstanding
+on
+through
+withal
+
+the
+a
+an
\ No newline at end of file

Modified: 
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
===================================================================
--- 
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
     2010-09-18 19:17:39 UTC (rev 73298)
+++ 
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
     2010-09-18 20:03:48 UTC (rev 73299)
@@ -2,6 +2,7 @@
 
 import java.io.IOException;
 import java.net.URISyntaxException;
+import java.text.ParsePosition;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -32,14 +33,52 @@
                }
                
                public void testExtractFirstSentence() {
-                       String text = "Foo (also abc. cde.) is the 2. Quux in 
Xyzzy. Its not a Barf.\n";
+                       String text = "Hello World";
                        
                        CharSequence s = extractFirstSentence(text);
+                       assertEquals("simple sentence", "Hello World", 
s.toString());
+
+                       //-------------------
+                       text = "Hello World.";
+                       
+                       s = extractFirstSentence(text);
+                       assertEquals("simple sentence", "Hello World.", 
s.toString());
+
+                       //-------------------
+                       text = "Hello World.\n";
+                       
+                       s = extractFirstSentence(text);
+                       assertEquals("simple sentence", "Hello World.", 
s.toString());
+
+                       //-------------------
+                       text = "Foo (also abc. cde.) is the 2. Quux in Xyzzy. 
Its not a Barf.\n";
+                       
+                       s = extractFirstSentence(text);
                        assertEquals("simple sentence", "Foo is the 2. Quux in 
Xyzzy.", s.toString());
                        
                        //TODO: all the nasty stuff...
                }
 
+               /**
+                * Checks scanNextSentence with an explicit ParsePosition: a text without
+                * a sentence terminator is returned whole, and repeated calls with the
+                * same position object return consecutive sentences of a two-sentence text.
+                * Note that the first sentence is expected to include its trailing space,
+                * since scanNextSentence itself does not trim.
+                */
+               public void testScanNextSentence() {
+                       String text = "Hello World";
+                       
+                       ParsePosition pos = new ParsePosition(0);
+                       CharSequence s = scanNextSentence(text, pos);
+                       assertEquals("Hello World", s.toString());
+
+                       // ----------------------
+                       text = "He's John Doe, I'm Jane. Alex is not here.";
+                       
+                       pos = new ParsePosition(0);
+                       s = scanNextSentence(text, pos);
+                       assertEquals("He's John Doe, I'm Jane. ", s.toString());
+                       
+                       s = scanNextSentence(text, pos);
+                       assertEquals("Alex is not here.", s.toString());
+                       
+                       //TODO: all the nasty stuff...
+               }
+
                public void testExtractWords() {
                        List<String> words = extractWords("");
                        assertEquals(theList(), words);
@@ -72,6 +111,36 @@
                        assertEquals(theList( "23", "foo", "42" ), words);
                }
 
+               /**
+                * Checks extractNames against a series of inputs: empty text and
+                * lower-case words yield no names; a capitalized stop word at the start
+                * of a sentence ("The") is expected to be skipped; names are expected to be
+                * split once they reach the given maximum weight; glue words ("of") are
+                * expected to join name parts; and punctuation between words is expected
+                * to separate names.
+                */
+               public void testExtractNames() {
+                       PhraseOccuranceSet names = extractNames("", 2);
+                       assertEquals(0, names.size());
+                       assertEquals(theList(), getWordList(names));
+
+                       names = extractNames("foo", 2);
+                       assertEquals(theList(), getWordList(names));
+
+                       names = extractNames("Foo", 2);
+                       assertEquals(theList("Foo"), getWordList(names));
+
+                       names = extractNames("The", 2);
+                       assertEquals(theList(), getWordList(names));
+
+                       names = extractNames("The Foo", 2);
+                       assertEquals(theList("Foo"), getWordList(names));
+
+                       // maximum weight 2: the third name part starts a new name
+                       names = extractNames("The Foo Bar Bear", 2);
+                       assertEquals(theList("Foo Bar", "Bear"), 
getWordList(names));
+
+                       names = extractNames("meet the Foo of Bar on tuesday", 
2);
+                       assertEquals(theList("Foo of Bar"), getWordList(names));
+
+                       names = extractNames("He's John Doe, I'm Jane. Alex is 
not here.", 2);
+                       assertEquals(theList("John Doe", "Jane", "Alex"), 
getWordList(names));
+
+                       names = extractNames("Anne-Catrin Drinkwater, Joe; 
Jane.", 3);
+                       assertEquals(theList("Anne-Catrin Drinkwater", "Joe", 
"Jane"), getWordList(names));
+               }
+
                public void testExtractPhrases() {
                        PhraseOccuranceSet phrases = extractPhrases("", 3, 3);
                        assertEquals(0, phrases.size());
@@ -171,12 +240,9 @@
        protected TestPlainTextAnalyzer testAnalyzer;
        
        @Override
-       public void setUp() throws URISyntaxException, IOException {
-               LanguageConfiguration config = new LanguageConfiguration();
-               
-               //corpus = new Corpus("TEST", "en", "en", "en", "en", "en", 
"en", null);
+       public void setUp() throws URISyntaxException, IOException, 
InstantiationException {
                testAnalyzer = new TestPlainTextAnalyzer(corpus);
-               testAnalyzer.configure(config, tweaks);
+               testAnalyzer.configure(corpus, tweaks);
                testAnalyzer.initialize();
                
                analyzer = testAnalyzer;
@@ -186,10 +252,18 @@
                testAnalyzer.testExtractFirstSentence();
        }
 
+       public void testScanNextSentence() {
+               testAnalyzer.testScanNextSentence();
+       }
+
        public void testExtractWords() {
                testAnalyzer.testExtractWords();
        }
 
+       public void testExtractNames() {
+               testAnalyzer.testExtractNames();
+       }
+
        public void testExtractPhrases() {
                testAnalyzer.testExtractPhrases();
        }



_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to