http://www.mediawiki.org/wiki/Special:Code/MediaWiki/72990
Revision: 72990
Author: daniel
Date: 2010-09-14 16:41:16 +0000 (Tue, 14 Sep 2010)
Log Message:
-----------
allow meanings to be mangled on load; limit recursion depth on phrase detector
Modified Paths:
--------------
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PhraseExtractor.java
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/WikiWordConcept.java
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/AnalyzerConsole.java
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
Modified:
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java
===================================================================
--- trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java 2010-09-14 16:32:44 UTC (rev 72989)
+++ trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/AbstractDisambiguator.java 2010-09-14 16:41:16 UTC (rev 72990)
@@ -21,6 +21,8 @@
private Map<String, C> meaningOverrides;
+ private int phraseSearchDepth = 8; //FIXME: magic...
+
public AbstractDisambiguator(MeaningFetcher<? extends C>
meaningFetcher, int cacheCapacity) {
if (meaningFetcher==null) throw new NullPointerException();
@@ -28,6 +30,14 @@
this.meaningFetcher = meaningFetcher;
}
+ public int getPhraseSearchDepth() {
+ return phraseSearchDepth;
+ }
+
+ public void setPhraseSearchDepth(int phraseSearchDepth) {
+ this.phraseSearchDepth = phraseSearchDepth;
+ }
+
public MeaningFetcher<? extends C> getMeaningFetcher() {
return meaningFetcher;
}
@@ -86,7 +96,7 @@
protected <X extends T>Map<X, List<? extends C>>
getMeanings(PhraseNode<X> root) throws PersistenceException {
- Collection<X> terms = getTerms(root, Integer.MAX_VALUE);
+ Collection<X> terms = getTerms(root, phraseSearchDepth);
return getMeanings(terms);
}
@@ -117,7 +127,7 @@
}
public <X extends T>Disambiguation<X, C> disambiguate(PhraseNode<X>
root, Collection<? extends C> context) throws PersistenceException {
- Collection<X> terms = getTerms(root, Integer.MAX_VALUE);
+ Collection<X> terms = getTerms(root, phraseSearchDepth);
Map<X, List<? extends C>> meanings = getMeanings(terms);
return disambiguate(root, meanings, context);
}
Modified:
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
===================================================================
--- trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java 2010-09-14 16:32:44 UTC (rev 72989)
+++ trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java 2010-09-14 16:41:16 UTC (rev 72990)
@@ -248,7 +248,7 @@
return getScore(r.getInterpretation(), context,
similarities, features);
}
- Collection<List<X>> sequences = getSequences(root,
Integer.MAX_VALUE);
+ Collection<List<X>> sequences = getSequences(root,
getPhraseSearchDepth());
return disambiguate(sequences, root, meanings, context);
}
Modified:
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PhraseExtractor.java
===================================================================
--- trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PhraseExtractor.java 2010-09-14 16:32:44 UTC (rev 72989)
+++ trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PhraseExtractor.java 2010-09-14 16:41:16 UTC (rev 72990)
@@ -4,6 +4,6 @@
public interface PhraseExtractor {
- public PhraseOccuranceSet extractPhrases(CharSequence s, int maxWeight);
+ public PhraseOccuranceSet extractPhrases(CharSequence s, int maxWeight,
int maxDepth);
}
Modified:
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java
===================================================================
--- trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java 2010-09-14 16:32:44 UTC (rev 72989)
+++ trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/PopularityDisambiguator.java 2010-09-14 16:41:16 UTC (rev 72990)
@@ -62,7 +62,7 @@
}
public <X extends T>Disambiguation<X, C> disambiguate(PhraseNode<X>
root, Map<X, List<? extends C>> meanings, Collection<? extends C> context) {
- Collection<List<X>> sequences = getSequences(root,
Integer.MAX_VALUE);
+ Collection<List<X>> sequences = getSequences(root,
getPhraseSearchDepth());
return disambiguate(sequences, root, meanings, context);
}
Modified:
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java
===================================================================
--- trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java 2010-09-14 16:32:44 UTC (rev 72989)
+++ trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/StoredMeaningFetcher.java 2010-09-14 16:41:16 UTC (rev 72990)
@@ -3,13 +3,15 @@
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
+import java.util.ListIterator;
import java.util.Map;
+import de.brightbyte.data.Functor2;
import de.brightbyte.data.cursor.DataSet;
import de.brightbyte.io.Output;
import de.brightbyte.util.PersistenceException;
+import de.brightbyte.wikiword.model.TermReference;
import de.brightbyte.wikiword.model.WikiWordConcept;
-import de.brightbyte.wikiword.model.TermReference;
import de.brightbyte.wikiword.store.WikiWordConceptStore;
import de.brightbyte.wikiword.store.WikiWordConceptStore.ConceptQuerySpec;
@@ -17,6 +19,7 @@
protected WikiWordConceptStore store;
protected ConceptQuerySpec spec;
protected Output trace;
+ protected Functor2<WikiWordConcept, WikiWordConcept, String>
meaningMangler;
public StoredMeaningFetcher(WikiWordConceptStore store) {
this(store, null);
@@ -29,10 +32,29 @@
this.spec = type;
}
+ public Functor2<WikiWordConcept, WikiWordConcept, String>
getMeaningMangler() {
+ return meaningMangler;
+ }
+
+ public void setMeaningMangler(Functor2<WikiWordConcept,
WikiWordConcept, String> meaningMangler) {
+ this.meaningMangler = meaningMangler;
+ }
+
public List<WikiWordConcept> getMeanings(String term) throws
PersistenceException {
- DataSet<WikiWordConcept> m = store.getMeanings(term, spec);
//FIXME: filter/cut-off rules, sort order! //XXX: relevance value?
+ DataSet<WikiWordConcept> m = store.getMeanings(term, spec);
List<WikiWordConcept> meanigns = m.load();
+ if ( meaningMangler != null ) {
+ ListIterator<WikiWordConcept> it =
meanigns.listIterator();
+ while (it.hasNext()) {
+ WikiWordConcept c = it.next();
+ WikiWordConcept c2 = meaningMangler.apply(c,
term);
+
+ if ( c2 == null ) it.remove();
+ else if ( c != c2 ) it.set(c2);
+ }
+ }
+
trace("fetched "+meanigns.size()+" meanings for \""+term+"\"");
return meanigns;
}
Modified:
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java
===================================================================
--- trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java 2010-09-14 16:32:44 UTC (rev 72989)
+++ trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/PhraseOccuranceSet.java 2010-09-14 16:41:16 UTC (rev 72990)
@@ -354,7 +354,7 @@
}
}
- public void buildAggregatePhrases( int start, double minWeight, double
maxWeight, Matcher phraseBreak ) {
+ public void buildAggregatePhrases( int start, double minWeight, double
maxWeight, int maxDepth, Matcher phraseBreak ) {
AggregatePhraseBuilder builder = new AggregatePhraseBuilder(
minWeight, maxWeight, phraseBreak );
if (isEmpty()) return;
@@ -363,7 +363,7 @@
for (int i=start; i<end; i++) {
if (hasPhrasesAt(i)) {
- builder.walk(getRootNodeAt(i), 0, null,
Integer.MAX_VALUE, maxWeight);
+ builder.walk(getRootNodeAt(i), 0, null,
maxDepth, maxWeight);
}
}
Modified:
trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/WikiWordConcept.java
===================================================================
--- trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/WikiWordConcept.java 2010-09-14 16:32:44 UTC (rev 72989)
+++ trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/model/WikiWordConcept.java 2010-09-14 16:41:16 UTC (rev 72990)
@@ -66,7 +66,6 @@
}
public void setName(String name) {
- if (this.name!=null) throw new IllegalStateException("property
already initialized");
this.name = name;
}
@@ -91,7 +90,6 @@
}
public void setFeatures(ConceptFeatures<? extends WikiWordConcept,
Integer> features) {
- if (this.features!=null) throw new
IllegalStateException("property already initialized");
if (features.getConcept()!=null &&
!this.equals(features.getConcept())) throw new
IllegalArgumentException("ConceptFeatures bound to a different concept:
"+features.getConcept());
this.features = features;
}
@@ -101,7 +99,6 @@
}
public void setProperties(ConceptProperties<? extends WikiWordConcept>
properties) {
- if (this.properties!=null) throw new
IllegalStateException("property already initialized");
if (properties.getConcept()!=null &&
!this.equals(properties.getConcept())) throw new
IllegalArgumentException("ConceptFeatures bound to a different concept:
"+features.getConcept());
this.properties = properties;
}
@@ -111,7 +108,6 @@
}
public void setResources(ConceptResources<? extends WikiWordConcept>
resources) {
- if (this.resources!=null) throw new
IllegalStateException("property already initialized");
this.resources = resources;
}
@@ -120,7 +116,6 @@
}
public void setRelations(ConceptRelations<? extends WikiWordConcept>
relations) {
- if (this.relations!=null) throw new
IllegalStateException("property already initialized");
this.relations = relations;
}
@@ -137,7 +132,6 @@
}
public void setTerms(TermReference[] terms) {
- if (this.terms!=null) throw new IllegalStateException("property
already initialized");
this.terms = terms;
}
@@ -146,7 +140,6 @@
}
public void setType(ConceptType type) {
- if (this.type!=null && !this.type.equals(ConceptType.UNKNOWN))
throw new IllegalStateException("property already initialized");
this.type = type;
}
Modified:
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/AnalyzerConsole.java
===================================================================
--- trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/AnalyzerConsole.java 2010-09-14 16:32:44 UTC (rev 72989)
+++ trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/AnalyzerConsole.java 2010-09-14 16:41:16 UTC (rev 72990)
@@ -28,7 +28,7 @@
public void runCommand(String cmd, List<Object> params, ConsoleOutput
out) throws Exception {
if (cmd.equals("phrases") || cmd.equals("p")) {
Object s = params.get(1);
- PhraseOccuranceSet occurances =
plainTextAnalyzer.extractPhrases(s.toString(), 5);
+ PhraseOccuranceSet occurances =
plainTextAnalyzer.extractPhrases(s.toString(), 5, 5);
out.writeList(occurances);
out.dumpPhraseTree(occurances.getRootNode());
} else {
@@ -40,7 +40,7 @@
if (s.indexOf('|')>0 || s.indexOf(';')>0 ) {
return super.getPhrases(s);
} else {
- PhraseOccuranceSet occurances =
plainTextAnalyzer.extractPhrases(s, 5);
+ PhraseOccuranceSet occurances =
plainTextAnalyzer.extractPhrases(s, 5, 5);
return occurances.getRootNode();
}
}
Modified:
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java
===================================================================
--- trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java 2010-09-14 16:32:44 UTC (rev 72989)
+++ trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzer.java 2010-09-14 16:41:16 UTC (rev 72990)
@@ -271,7 +271,7 @@
}
} */
- public PhraseOccuranceSet extractPhrases(CharSequence text, int
maxWeight) {
+ public PhraseOccuranceSet extractPhrases(CharSequence text, int
maxWeight, int maxDepth) {
PhraseOccuranceSet phrases = new
PhraseOccuranceSet(text.toString(), new ArrayList<PhraseOccurance>());
text = applyManglers(config.sentenceManglers, text);
@@ -285,7 +285,7 @@
buildPhrases(s, ofs, phrases, maxWeight);
if (phrases.isEmpty()) continue;
- phrases.buildAggregatePhrases(ofs, 0, maxWeight,
phraseBreakeMatcher);
+ phrases.buildAggregatePhrases(ofs, 0, maxWeight,
maxDepth, phraseBreakeMatcher);
}
if (phrases.isEmpty()) return phrases;
@@ -373,7 +373,7 @@
BufferedReader in = new BufferedReader(new
InputStreamReader(System.in));
String s ;
while ( (s = in.readLine()) != null ) {
- PhraseOccuranceSet phrases =
analyzer.extractPhrases(s, 6);
+ PhraseOccuranceSet phrases =
analyzer.extractPhrases(s, 6, 6);
DebugUtil.dump("", phrases, ConsoleIO.output);
}
}
Modified:
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java
===================================================================
--- trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java 2010-09-14 16:32:44 UTC (rev 72989)
+++ trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/analyzer/PlainTextAnalyzerTest.java 2010-09-14 16:41:16 UTC (rev 72990)
@@ -73,29 +73,29 @@
}
public void testExtractPhrases() {
- PhraseOccuranceSet phrases = extractPhrases("", 3);
+ PhraseOccuranceSet phrases = extractPhrases("", 3, 3);
assertEquals(0, phrases.size());
assertEquals(theList(),
getWordList(phrases.getPhrasesAt(0)));
- phrases = extractPhrases("foo", 3);
+ phrases = extractPhrases("foo", 3, 3);
assertEquals(theList( "foo" ),
getWordList(phrases.getPhrasesAt(0)));
- phrases = extractPhrases(" foo ", 3);
+ phrases = extractPhrases(" foo ", 3, 3);
assertEquals(theList(),
getWordList(phrases.getPhrasesAt(0)));
assertEquals(theList( "foo" ),
getWordList(phrases.getPhrasesAt(1)));
assertEquals(theList( "foo" ),
getWordList(phrases.getPhrasesFrom(0)));
}
public void testExtractPhrases2() {
- PhraseOccuranceSet phrases = extractPhrases("red green
blue yellow black", 3);
+ PhraseOccuranceSet phrases = extractPhrases("red green
blue yellow black", 3, 6);
assertEquals(theList( "red green blue", "red green",
"red" ), getWordList(phrases.getPhrasesAt(0)));
assertEquals(theList( "green blue yellow", "green
blue", "green" ), getWordList(phrases.getPhrasesAt(4)));
- phrases = extractPhrases("red green blue yellow black",
5);
+ phrases = extractPhrases("red green blue yellow black",
5, 10);
assertEquals(theList( "red green blue yellow black",
"red green blue yellow", "red green blue", "red green", "red" ),
getWordList(phrases.getPhrasesAt(0)));
assertEquals(theList( "green blue yellow black", "green
blue yellow", "green blue", "green" ), getWordList(phrases.getPhrasesAt(4)));
- phrases = extractPhrases("and red and green and blue
and yellow", 3);
+ phrases = extractPhrases("and red and green and blue
and yellow", 3, 12);
assertEquals(theList( "and red and green and blue",
"and red and green and",
"and red and green",
@@ -111,14 +111,14 @@
),
getWordList(phrases.getPhrasesAt(4)));
- phrases = extractPhrases("red green blue. yellow
black", 5);
+ phrases = extractPhrases("red green blue. yellow
black", 5, 10);
assertEquals(theList( "red green blue", "red green",
"red" ), getWordList(phrases.getPhrasesAt(0)));
assertEquals(theList( "blue" ),
getWordList(phrases.getPhrasesAt(10)));
assertEquals(theList( "yellow black", "yellow" ),
getWordList(phrases.getPhrasesAt(16)));
}
public void testExtractPhrases3() {
- PhraseOccuranceSet phrases = extractPhrases("Krababbel:
l'Foo-Bar", 3);
+ PhraseOccuranceSet phrases = extractPhrases("Krababbel:
l'Foo-Bar", 3, 6);
assertEquals(theList( "Krababbel"),
getWordList(phrases.getPhrasesAt(0)));
assertEquals(theList( "l'Foo-Bar",
@@ -134,7 +134,7 @@
assertEquals(theList( "Bar"),
getWordList(phrases.getPhrasesAt(17)));
- phrases = extractPhrases("harald's 'schlaaand", 3);
+ phrases = extractPhrases("harald's 'schlaaand", 3, 3);
assertEquals(theList( "harald's 'schlaaand",
"harald's",
"harald"
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs