http://www.mediawiki.org/wiki/Special:Code/MediaWiki/72990

Revision: 72990
Author:   daniel
Date:     2010-09-14 16:41:16 +0000 (Tue, 14 Sep 2010)

Log Message:
-----------
Allow meanings to be mangled on load; limit recursion depth in the phrase detector.

Modified Paths:
--------------
    
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
    
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
    
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PhraseExtractor.java
    
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java
    
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java
    
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java
    
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/WikiWordConcept.java
    
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/AnalyzerConsole.java
    
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
    
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java

Modified: 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
===================================================================
--- 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
    2010-09-14 16:32:44 UTC (rev 72989)
+++ 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
    2010-09-14 16:41:16 UTC (rev 72990)
@@ -21,6 +21,8 @@
 
        private Map<String, C> meaningOverrides;
        
+       private int phraseSearchDepth = 8; //FIXME: magic...
+       
        public AbstractDisambiguator(MeaningFetcher<? extends C> 
meaningFetcher, int cacheCapacity) {
                if (meaningFetcher==null) throw new NullPointerException();
                
@@ -28,6 +30,14 @@
                this.meaningFetcher = meaningFetcher;
        }
        
+       public int getPhraseSearchDepth() {
+               return phraseSearchDepth;
+       }
+
+       public void setPhraseSearchDepth(int phraseSearchDepth) {
+               this.phraseSearchDepth = phraseSearchDepth;
+       }
+
        public MeaningFetcher<? extends C> getMeaningFetcher() {
                return meaningFetcher;
        }
@@ -86,7 +96,7 @@
        
        
        protected <X extends T>Map<X, List<? extends C>> 
getMeanings(PhraseNode<X> root) throws PersistenceException {
-               Collection<X> terms = getTerms(root, Integer.MAX_VALUE);
+               Collection<X> terms = getTerms(root, phraseSearchDepth);
                return getMeanings(terms);
        }
        
@@ -117,7 +127,7 @@
        }
        
        public <X extends T>Disambiguation<X, C> disambiguate(PhraseNode<X> 
root, Collection<? extends C> context) throws PersistenceException {
-               Collection<X> terms = getTerms(root, Integer.MAX_VALUE);
+               Collection<X> terms = getTerms(root, phraseSearchDepth);
                Map<X, List<? extends C>> meanings = getMeanings(terms);
                return disambiguate(root, meanings, context);
        }

Modified: 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
===================================================================
--- 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
   2010-09-14 16:32:44 UTC (rev 72989)
+++ 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
   2010-09-14 16:41:16 UTC (rev 72990)
@@ -248,7 +248,7 @@
                        return getScore(r.getInterpretation(), context, 
similarities, features); 
                }
                
-               Collection<List<X>> sequences = getSequences(root, 
Integer.MAX_VALUE);
+               Collection<List<X>> sequences = getSequences(root, 
getPhraseSearchDepth());
                return disambiguate(sequences, root, meanings, context);
        }
        

Modified: 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PhraseExtractor.java
===================================================================
--- 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PhraseExtractor.java
  2010-09-14 16:32:44 UTC (rev 72989)
+++ 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PhraseExtractor.java
  2010-09-14 16:41:16 UTC (rev 72990)
@@ -4,6 +4,6 @@
 
 public interface PhraseExtractor {
 
-       public PhraseOccuranceSet extractPhrases(CharSequence s, int maxWeight);
+       public PhraseOccuranceSet extractPhrases(CharSequence s, int maxWeight, 
int maxDepth);
 
 }

Modified: 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java
===================================================================
--- 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java
  2010-09-14 16:32:44 UTC (rev 72989)
+++ 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java
  2010-09-14 16:41:16 UTC (rev 72990)
@@ -62,7 +62,7 @@
        }
 
        public <X extends T>Disambiguation<X, C> disambiguate(PhraseNode<X> 
root, Map<X, List<? extends C>> meanings, Collection<? extends C> context) {
-               Collection<List<X>> sequences = getSequences(root, 
Integer.MAX_VALUE);
+               Collection<List<X>> sequences = getSequences(root, 
getPhraseSearchDepth());
                return disambiguate(sequences, root, meanings, context);
        }
        

Modified: 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java
===================================================================
--- 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java
     2010-09-14 16:32:44 UTC (rev 72989)
+++ 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java
     2010-09-14 16:41:16 UTC (rev 72990)
@@ -3,13 +3,15 @@
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
+import java.util.ListIterator;
 import java.util.Map;
 
+import de.brightbyte.data.Functor2;
 import de.brightbyte.data.cursor.DataSet;
 import de.brightbyte.io.Output;
 import de.brightbyte.util.PersistenceException;
+import de.brightbyte.wikiword.model.TermReference;
 import de.brightbyte.wikiword.model.WikiWordConcept;
-import de.brightbyte.wikiword.model.TermReference;
 import de.brightbyte.wikiword.store.WikiWordConceptStore;
 import de.brightbyte.wikiword.store.WikiWordConceptStore.ConceptQuerySpec;
 
@@ -17,6 +19,7 @@
        protected WikiWordConceptStore  store; 
        protected ConceptQuerySpec spec;
        protected Output trace;
+       protected Functor2<WikiWordConcept, WikiWordConcept, String> 
meaningMangler;
        
        public StoredMeaningFetcher(WikiWordConceptStore  store) {
                this(store, null);
@@ -29,10 +32,29 @@
                this.spec = type;
        }
 
+       public Functor2<WikiWordConcept, WikiWordConcept, String> 
getMeaningMangler() {
+               return meaningMangler;
+       }
+
+       public void setMeaningMangler(Functor2<WikiWordConcept, 
WikiWordConcept, String> meaningMangler) {
+               this.meaningMangler = meaningMangler;
+       }
+
        public List<WikiWordConcept> getMeanings(String term) throws 
PersistenceException {
-               DataSet<WikiWordConcept> m = store.getMeanings(term, spec); 
//FIXME: filter/cut-off rules, sort order! //XXX: relevance value?
+               DataSet<WikiWordConcept> m = store.getMeanings(term, spec); 
                List<WikiWordConcept> meanigns = m.load();
                
+               if ( meaningMangler != null ) {
+                       ListIterator<WikiWordConcept> it = 
meanigns.listIterator();
+                       while (it.hasNext()) {
+                               WikiWordConcept c = it.next();
+                               WikiWordConcept c2 = meaningMangler.apply(c, 
term);
+                               
+                               if ( c2 == null ) it.remove();
+                               else if ( c != c2 ) it.set(c2);
+                       }
+               }
+               
                trace("fetched "+meanigns.size()+" meanings for \""+term+"\""); 
                return meanigns;
        }

Modified: 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java
===================================================================
--- 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java
  2010-09-14 16:32:44 UTC (rev 72989)
+++ 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java
  2010-09-14 16:41:16 UTC (rev 72990)
@@ -354,7 +354,7 @@
                }
        }
 
-       public void buildAggregatePhrases( int start, double minWeight, double 
maxWeight, Matcher phraseBreak ) {
+       public void buildAggregatePhrases( int start, double minWeight, double 
maxWeight, int maxDepth, Matcher phraseBreak ) {
                AggregatePhraseBuilder builder = new AggregatePhraseBuilder( 
minWeight, maxWeight, phraseBreak );
                
                if (isEmpty()) return;
@@ -363,7 +363,7 @@
                
                for (int i=start; i<end; i++) {
                                if (hasPhrasesAt(i)) {
-                                       builder.walk(getRootNodeAt(i), 0, null, 
Integer.MAX_VALUE, maxWeight);
+                                       builder.walk(getRootNodeAt(i), 0, null, 
maxDepth, maxWeight);
                                }
                }
  

Modified: 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/WikiWordConcept.java
===================================================================
--- 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/WikiWordConcept.java
     2010-09-14 16:32:44 UTC (rev 72989)
+++ 
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/WikiWordConcept.java
     2010-09-14 16:41:16 UTC (rev 72990)
@@ -66,7 +66,6 @@
        }
 
        public void setName(String name) {
-               if (this.name!=null) throw new IllegalStateException("property 
already initialized");
                this.name = name;
        }
 
@@ -91,7 +90,6 @@
        }
 
        public void setFeatures(ConceptFeatures<? extends WikiWordConcept, 
Integer> features) {
-               if (this.features!=null) throw new 
IllegalStateException("property already initialized");
                if (features.getConcept()!=null && 
!this.equals(features.getConcept())) throw new 
IllegalArgumentException("ConceptFeatures bound to a different concept: 
"+features.getConcept());
                this.features = features;
        }
@@ -101,7 +99,6 @@
        }
 
        public void setProperties(ConceptProperties<? extends WikiWordConcept> 
properties) {
-               if (this.properties!=null) throw new 
IllegalStateException("property already initialized");
                if (properties.getConcept()!=null && 
!this.equals(properties.getConcept())) throw new 
IllegalArgumentException("ConceptFeatures bound to a different concept: 
"+features.getConcept());
                this.properties = properties;
        }
@@ -111,7 +108,6 @@
        }
 
        public void setResources(ConceptResources<? extends WikiWordConcept> 
resources) {
-               if (this.resources!=null) throw new 
IllegalStateException("property already initialized");
                this.resources = resources;
        }
        
@@ -120,7 +116,6 @@
        }
 
        public void setRelations(ConceptRelations<? extends WikiWordConcept> 
relations) {
-               if (this.relations!=null) throw new 
IllegalStateException("property already initialized");
                this.relations = relations;
        }
 
@@ -137,7 +132,6 @@
        }
 
        public void setTerms(TermReference[] terms) {
-               if (this.terms!=null) throw new IllegalStateException("property 
already initialized");
                this.terms = terms;
        }
 
@@ -146,7 +140,6 @@
        }
 
        public void setType(ConceptType type) {
-               if (this.type!=null && !this.type.equals(ConceptType.UNKNOWN)) 
throw new IllegalStateException("property already initialized");
                this.type = type;
        }
 

Modified: 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/AnalyzerConsole.java
===================================================================
--- 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/AnalyzerConsole.java
   2010-09-14 16:32:44 UTC (rev 72989)
+++ 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/AnalyzerConsole.java
   2010-09-14 16:41:16 UTC (rev 72990)
@@ -28,7 +28,7 @@
        public void runCommand(String cmd, List<Object> params, ConsoleOutput 
out) throws Exception {
                        if (cmd.equals("phrases") || cmd.equals("p")) {
                                Object s = params.get(1);
-                               PhraseOccuranceSet occurances = 
plainTextAnalyzer.extractPhrases(s.toString(), 5);
+                               PhraseOccuranceSet occurances = 
plainTextAnalyzer.extractPhrases(s.toString(), 5, 5);
                                out.writeList(occurances);
                                out.dumpPhraseTree(occurances.getRootNode());
                        } else {
@@ -40,7 +40,7 @@
                if (s.indexOf('|')>0 || s.indexOf(';')>0 ) {
                        return super.getPhrases(s);
                } else {
-                       PhraseOccuranceSet occurances = 
plainTextAnalyzer.extractPhrases(s, 5);
+                       PhraseOccuranceSet occurances = 
plainTextAnalyzer.extractPhrases(s, 5, 5);
                        return occurances.getRootNode();
                }
        }

Modified: 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
===================================================================
--- 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
 2010-09-14 16:32:44 UTC (rev 72989)
+++ 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
 2010-09-14 16:41:16 UTC (rev 72990)
@@ -271,7 +271,7 @@
                }
        } */
        
-       public PhraseOccuranceSet extractPhrases(CharSequence text, int 
maxWeight) {
+       public PhraseOccuranceSet extractPhrases(CharSequence text, int 
maxWeight, int maxDepth) {
                PhraseOccuranceSet phrases = new 
PhraseOccuranceSet(text.toString(), new ArrayList<PhraseOccurance>());
                
                text = applyManglers(config.sentenceManglers, text);
@@ -285,7 +285,7 @@
                        buildPhrases(s, ofs, phrases, maxWeight);
                        if (phrases.isEmpty()) continue;
                        
-                       phrases.buildAggregatePhrases(ofs, 0, maxWeight, 
phraseBreakeMatcher);
+                       phrases.buildAggregatePhrases(ofs, 0, maxWeight, 
maxDepth, phraseBreakeMatcher);
                }
 
                if (phrases.isEmpty()) return phrases; 
@@ -373,7 +373,7 @@
                BufferedReader in = new BufferedReader(new 
InputStreamReader(System.in));
                String s ;
                 while ( (s = in.readLine()) != null ) {
-                        PhraseOccuranceSet phrases = 
analyzer.extractPhrases(s, 6);
+                        PhraseOccuranceSet phrases = 
analyzer.extractPhrases(s, 6, 6);
                         DebugUtil.dump("", phrases, ConsoleIO.output);
                }
        }

Modified: 
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
===================================================================
--- 
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
     2010-09-14 16:32:44 UTC (rev 72989)
+++ 
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
     2010-09-14 16:41:16 UTC (rev 72990)
@@ -73,29 +73,29 @@
                }
 
                public void testExtractPhrases() {
-                       PhraseOccuranceSet phrases = extractPhrases("", 3);
+                       PhraseOccuranceSet phrases = extractPhrases("", 3, 3);
                        assertEquals(0, phrases.size());
                        assertEquals(theList(), 
getWordList(phrases.getPhrasesAt(0)));
                        
-                       phrases = extractPhrases("foo", 3);
+                       phrases = extractPhrases("foo", 3, 3);
                        assertEquals(theList( "foo" ), 
getWordList(phrases.getPhrasesAt(0)));
 
-                       phrases = extractPhrases(" foo ", 3);
+                       phrases = extractPhrases(" foo ", 3, 3);
                        assertEquals(theList(), 
getWordList(phrases.getPhrasesAt(0)));
                        assertEquals(theList( "foo" ), 
getWordList(phrases.getPhrasesAt(1)));
                        assertEquals(theList( "foo" ), 
getWordList(phrases.getPhrasesFrom(0)));
                }
                
                public void testExtractPhrases2() {
-                       PhraseOccuranceSet phrases = extractPhrases("red green 
blue yellow black", 3);
+                       PhraseOccuranceSet phrases = extractPhrases("red green 
blue yellow black", 3, 6);
                        assertEquals(theList( "red green blue", "red green", 
"red" ), getWordList(phrases.getPhrasesAt(0)));
                        assertEquals(theList( "green blue yellow", "green 
blue", "green" ), getWordList(phrases.getPhrasesAt(4)));
 
-                       phrases = extractPhrases("red green blue yellow black", 
5);
+                       phrases = extractPhrases("red green blue yellow black", 
5, 10);
                        assertEquals(theList( "red green blue yellow black", 
"red green blue yellow", "red green blue", "red green", "red" ), 
getWordList(phrases.getPhrasesAt(0)));
                        assertEquals(theList( "green blue yellow black", "green 
blue yellow", "green blue", "green" ), getWordList(phrases.getPhrasesAt(4)));
 
-                       phrases = extractPhrases("and red and green and blue 
and yellow", 3);
+                       phrases = extractPhrases("and red and green and blue 
and yellow", 3, 12);
                        assertEquals(theList( "and red and green and blue",
                                                                                
                                "and red and green and",
                                                                                
                                "and red and green",
@@ -111,14 +111,14 @@
                                                                                
                                ), 
                                                                                
        getWordList(phrases.getPhrasesAt(4)));
 
-                       phrases = extractPhrases("red green blue. yellow 
black", 5);
+                       phrases = extractPhrases("red green blue. yellow 
black", 5, 10);
                        assertEquals(theList( "red green blue", "red green", 
"red" ), getWordList(phrases.getPhrasesAt(0)));
                        assertEquals(theList( "blue" ), 
getWordList(phrases.getPhrasesAt(10)));
                        assertEquals(theList( "yellow black", "yellow" ), 
getWordList(phrases.getPhrasesAt(16)));
                }
                
                public void testExtractPhrases3() {
-                       PhraseOccuranceSet phrases = extractPhrases("Krababbel: 
l'Foo-Bar", 3);
+                       PhraseOccuranceSet phrases = extractPhrases("Krababbel: 
l'Foo-Bar", 3, 6);
                        assertEquals(theList( "Krababbel"), 
getWordList(phrases.getPhrasesAt(0)));
 
                        assertEquals(theList( "l'Foo-Bar", 
@@ -134,7 +134,7 @@
                        assertEquals(theList( "Bar"), 
                                                                                
getWordList(phrases.getPhrasesAt(17)));
 
-                       phrases = extractPhrases("harald's 'schlaaand", 3);
+                       phrases = extractPhrases("harald's 'schlaaand", 3, 3);
                        assertEquals(theList( "harald's 'schlaaand", 
                                                                                
                                "harald's", 
                                                                                
                                "harald" 



_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to