Author: tommaso
Date: Fri Sep 18 08:02:12 2015
New Revision: 1703761
URL: http://svn.apache.org/viewvc?rev=1703761&view=rev
Log:
OPENNLP-817 - switched to Java 7, added missing Apache License header, added runner test,
tweaked parse rules method to adjust rule probabilities
Added:
opennlp/sandbox/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGRunnerTest.java
Modified:
opennlp/sandbox/nlp-utils/pom.xml
opennlp/sandbox/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java
opennlp/sandbox/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java
opennlp/sandbox/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammarTest.java
Modified: opennlp/sandbox/nlp-utils/pom.xml
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/nlp-utils/pom.xml?rev=1703761&r1=1703760&r2=1703761&view=diff
==============================================================================
--- opennlp/sandbox/nlp-utils/pom.xml (original)
+++ opennlp/sandbox/nlp-utils/pom.xml Fri Sep 18 08:02:12 2015
@@ -43,9 +43,8 @@
<artifactId>maven-compiler-plugin</artifactId>
<version>2.0.2</version>
<configuration>
- <compilerVersion>1.6</compilerVersion>
- <source>1.6</source>
- <target>1.6</target>
+ <source>1.7</source>
+ <target>1.7</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
Modified:
opennlp/sandbox/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java?rev=1703761&r1=1703760&r2=1703761&view=diff
==============================================================================
---
opennlp/sandbox/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java
(original)
+++
opennlp/sandbox/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/CFGRunner.java
Fri Sep 18 08:02:12 2015
@@ -1,3 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
package org.apache.opennlp.utils.cfg;
import java.io.BufferedReader;
Modified:
opennlp/sandbox/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java?rev=1703761&r1=1703760&r2=1703761&view=diff
==============================================================================
---
opennlp/sandbox/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java
(original)
+++
opennlp/sandbox/nlp-utils/src/main/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammar.java
Fri Sep 18 08:02:12 2015
@@ -41,15 +41,15 @@ public class ProbabilisticContextFreeGra
private final String startSymbol;
private boolean randomExpansion;
- private static final Rule emptyRule = new Rule("E", "");
+ private static final Rule emptyRule = new Rule("EMPTY~", "");
private static final String nonTerminalMatcher =
"[\\w\\~\\*\\-\\.\\,\\'\\:\\_\\\"]";
- private static final String terminalMatcher = "[òà
ùìèé\\|\\w\\'\\.\\,\\:\\_Ã\\?Ã\\%\\;Ã\\-\\\"]";
+ private static final String terminalMatcher = "[\\*òà
ùìèé\\|\\w\\'\\.\\,\\:\\_Ã\\?Ã\\%\\;Ã\\-\\\"]";
private static final Pattern terminalPattern =
Pattern.compile("\\(("+nonTerminalMatcher+"+)\\s("+terminalMatcher+"+)\\)");
private static final Pattern nonTerminalPattern = Pattern.compile(
"\\(("+nonTerminalMatcher+"+)" + // source NT
-
"\\s("+nonTerminalMatcher+"+)(\\s("+nonTerminalMatcher+"+))*\\)" // expansion
NTs
+
"\\s("+nonTerminalMatcher+"+)((\\s"+nonTerminalMatcher+"+)*)\\)" // expansion
NTs
);
public ProbabilisticContextFreeGrammar(Collection<String>
nonTerminalSymbols, Collection<String> terminalSymbols,
@@ -94,7 +94,6 @@ public class ProbabilisticContextFreeGra
expansion.addAll(getTerminals(word));
}
return expansion.toArray(new String[expansion.size()]);
-
}
private Collection<String> getTerminals(String word) {
@@ -258,10 +257,10 @@ public class ProbabilisticContextFreeGra
public String toString() {
if (getRule() != emptyRule) {
return "(" +
- rule.getEntry() + " " +
+ (rule != null ? rule.getEntry() : null) + " " +
(leftTree != null && rightTree != null ?
leftTree.toString() + " " + rightTree.toString() :
- rule.getExpansion()[0]
+ (rule != null ? rule.getExpansion()[0] : null)
) +
')';
} else {
@@ -296,6 +295,11 @@ public class ProbabilisticContextFreeGra
Collection<String> nonTerminals = new HashSet<>();
Collection<String> terminals = new HashSet<>();
+ rules.put(emptyRule, 1d);
+ rulesMap.put(emptyRule, 1d);
+ nonTerminals.add(emptyRule.getEntry());
+ terminals.add(emptyRule.getExpansion()[0]);
+
for (String parseTreeString : parseStrings) {
if (trim) {
@@ -312,7 +316,6 @@ public class ProbabilisticContextFreeGra
if (!rules.containsKey(key)) {
rules.put(key, 1d);
terminals.add(t);
-// System.err.println(key);
}
toConsume = toConsume.replace(m.group(), nt);
}
@@ -340,16 +343,12 @@ public class ProbabilisticContextFreeGra
if (!rules.containsKey(key)) {
rules.put(key, 1d);
-// startSymbol = key.getEntry();
-// System.err.println(key);
}
toConsume = toConsume.replace(m2.group(), nt);
}
}
}
- // TODO : check/adjust rules to make them respect CNF
- // TODO : adjust probabilities based on term frequencies
for (Map.Entry<Rule, Double> entry : rules.entrySet()) {
normalize(entry.getKey(), nonTerminals, terminals, rulesMap);
}
@@ -357,35 +356,55 @@ public class ProbabilisticContextFreeGra
return new ProbabilisticContextFreeGrammar(nonTerminals, terminals,
rulesMap, startSymbol, true);
}
+ /**
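+ * Normalize (check and, if necessary, adjust) a rule so that it respects Chomsky normal form (CNF)
+ * @param rule the rule to normalize
+ * @param nonTerminals the non terminal symbols; generated symbols are added to this collection
+ * @param terminals the terminal symbols
+ * @param rulesMap the map from rules to probabilities that receives the normalized rules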
+ */
private static void normalize(Rule rule, Collection<String> nonTerminals,
Collection<String> terminals, Map<Rule, Double> rulesMap) {
String[] expansion = rule.getExpansion();
+ String firstExpansion = expansion[0];
if (expansion.length == 1) {
- if (!terminals.contains(expansion[0])) {
- if (nonTerminals.contains(expansion[0])) {
+ if (!terminals.contains(firstExpansion)) {
+ if (nonTerminals.contains(firstExpansion)) {
// nt1 -> nt2 should be expanded in nt1 -> nt2,E
- rulesMap.put(new Rule(rule.getEntry(), expansion[0],
emptyRule.getEntry()), 1d);
- if (rulesMap.containsKey(emptyRule)) {
- rulesMap.put(emptyRule, 1d);
- }
+ Rule newRule = new Rule(rule.getEntry(), firstExpansion,
emptyRule.getEntry());
+ addRule(newRule, rulesMap);
} else {
throw new RuntimeException("rule "+rule+" expands to neither a
terminal or non terminal");
}
} else {
- rulesMap.put(rule, 1d);
+ addRule(rule, rulesMap);
}
} else if (expansion.length > 2){
// nt1 -> nt2,nt3,...,ntn should be collapsed to a hierarchy of ntX ->
ntY,ntZ rules
- String nt2 = expansion[0];
int seed = nonTerminals.size();
String generatedNT = "GEN~" + seed;
nonTerminals.add(generatedNT);
- Rule newRule = new Rule(rule.getEntry(), nt2, generatedNT);
+ Rule newRule = new Rule(rule.getEntry(), firstExpansion, generatedNT);
rulesMap.put(newRule, 1d);
- Rule chainedRule = new Rule(generatedNT, Arrays.copyOfRange(expansion,
1, expansion.length - 1));
+ Rule chainedRule = new Rule(generatedNT, Arrays.copyOfRange(expansion,
1, expansion.length));
rulesMap.put(chainedRule, 1d);
normalize(chainedRule, nonTerminals, terminals, rulesMap);
} else {
- rulesMap.put(rule, 1d);
+ addRule(rule, rulesMap);
+ }
+ }
+
+ private static void addRule(Rule rule, Map<Rule, Double> rulesMap) {
+ Double prob = rulesMap.get(rule);
+ if (prob != null && prob > 0d) {
+ if (prob > 0.9d) {
+ prob += 1d - prob - 0.01d;
+ } else {
+ prob += 0.01;
+ }
+ } else {
+ prob = 0.3d;
}
+
+ rulesMap.put(rule, prob);
}
}
Added:
opennlp/sandbox/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGRunnerTest.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGRunnerTest.java?rev=1703761&view=auto
==============================================================================
---
opennlp/sandbox/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGRunnerTest.java
(added)
+++
opennlp/sandbox/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/CFGRunnerTest.java
Fri Sep 18 08:02:12 2015
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.opennlp.utils.cfg;
+
+import org.junit.Test;
+
+/**
+ * Tests for {@link CFGRunner}
+ */
+public class CFGRunnerTest {
+
+ @Test
+ public void testDefaultMain() throws Exception {
+ CFGRunner.main(new String[0]);
+ }
+
+ @Test
+ public void testMainWithWD() throws Exception {
+ CFGRunner.main(new String[]{"-wn"});
+ }
+
+ @Test
+ public void testMainWithPT() throws Exception {
+ CFGRunner.main(new String[]{"-pt"});
+ }
+
+ @Test
+ public void testMainWithWNAndPT() throws Exception {
+ CFGRunner.main(new String[]{"-wn", "-pt"});
+ }
+}
\ No newline at end of file
Modified:
opennlp/sandbox/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammarTest.java
URL:
http://svn.apache.org/viewvc/opennlp/sandbox/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammarTest.java?rev=1703761&r1=1703760&r2=1703761&view=diff
==============================================================================
---
opennlp/sandbox/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammarTest.java
(original)
+++
opennlp/sandbox/nlp-utils/src/test/java/org/apache/opennlp/utils/cfg/ProbabilisticContextFreeGrammarTest.java
Fri Sep 18 08:02:12 2015
@@ -194,7 +194,7 @@ public class ProbabilisticContextFreeGra
String string = "(S (VP (Adv last) (Vb tidy)) (NP (Adj biogenic) (NN
Gainesville)))";
Map<Rule, Double> rules =
ProbabilisticContextFreeGrammar.parseRules(string);
assertNotNull(rules);
- assertEquals(7, rules.size());
+ assertEquals(8, rules.size());
}
@Test
@@ -231,6 +231,11 @@ public class ProbabilisticContextFreeGra
ProbabilisticContextFreeGrammar.parseRules(rules3, true, newsSample,
newsSample2);
assertNotNull(rules3);
+ ProbabilisticContextFreeGrammar contextFreeGrammar =
ProbabilisticContextFreeGrammar.parseGrammar(newsSample, newsSample2);
+ assertNotNull(contextFreeGrammar);
+ String[] derivation = contextFreeGrammar.leftMostDerivation("S");
+ assertNotNull(derivation);
+ assertTrue(derivation.length > 1);
}
@Ignore
@@ -244,9 +249,14 @@ public class ProbabilisticContextFreeGra
String[] derivation = cfg.leftMostDerivation("S");
assertNotNull(derivation);
System.err.println(Arrays.toString(derivation));
- String sentence = "Il governo di Berisha pare in difficolta'";
- ProbabilisticContextFreeGrammar.ParseTree parseTree =
cfg.cky(Arrays.asList(sentence.split(" ")));
- assertNotNull(parseTree);
+ ProbabilisticContextFreeGrammar.ParseTree parseTree1 =
cfg.cky(Arrays.asList(derivation));
+ assertNotNull(parseTree1);
+ System.err.println(parseTree1);
+
+ String sentence = "Il Governo di Berisha appare in difficolta'";
+ List<String> fixedSentence = Arrays.asList(sentence.split(" "));
+ ProbabilisticContextFreeGrammar.ParseTree parseTree2 =
cfg.cky(fixedSentence);
+ assertNotNull(parseTree2);
}
private Collection<String> parseSentences(BufferedReader bufferedReader)
throws IOException {