http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/NERPhraseGeneralizer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/NERPhraseGeneralizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/NERPhraseGeneralizer.java new file mode 100644 index 0000000..a5e1ee7 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/NERPhraseGeneralizer.java @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.parse_thicket.matching; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; + + + + + + + + + + +import opennlp.tools.parse_thicket.IGeneralizer; +import opennlp.tools.parse_thicket.ParseTreeNode; +import opennlp.tools.stemmer.PStemmer; +import opennlp.tools.textsimilarity.GeneralizationListReducer; +import opennlp.tools.textsimilarity.LemmaFormManager; +import opennlp.tools.textsimilarity.POSManager; +import opennlp.tools.textsimilarity.ParseTreeChunk; + +public class NERPhraseGeneralizer extends PhraseGeneralizer { + + /* alignment is based on NER values, not on POS now + * + */ + + + /** + * key matching function which takes two phrases, aligns them and finds a set + * of maximum common sub-phrase + * + * @param chunk1 + * @param chunk2 + * @return + */ + @Override + public List<ParseTreeChunk> generalize( + Object chunk1o, Object chunk2o) { + + ParseTreeChunk chunk1 = (ParseTreeChunk)chunk1o, chunk2 = (ParseTreeChunk)chunk2o; + List<ParseTreeNode> results = new ArrayList<ParseTreeNode>(); + List<ParseTreeChunk> resultChunks = new ArrayList<ParseTreeChunk>(); + + + List<String> pos1 = chunk1.getPOSs(); + List<String> pos2 = chunk2.getPOSs(); + List<String> lem1 = chunk1.getLemmas(); + List<String> lem2 = chunk2.getLemmas(); + + List<String> ner1 = new ArrayList<String>(); + List<String> ner2 = new ArrayList<String>(); + + + for (ParseTreeNode node: chunk1.getParseTreeNodes()) { + if (node.getNe()!=null && !node.getNe().equals("O")) + ner1.add(node.getNe()); + } + + for (ParseTreeNode node: chunk2.getParseTreeNodes()) { + if (node.getNe()!=null && !node.getNe().equals("O")) + ner2.add(node.getNe()); + } + + + List<String> overlap = new ArrayList<String>(ner1); + overlap.retainAll(ner2); + overlap = new ArrayList<String>(new HashSet<String>(overlap)); + + + if (overlap == null || overlap.size() < 1) + return null; + + List<Integer> occur1 = new ArrayList<Integer>(), occur2 = new ArrayList<Integer>(); + for (String word : overlap) { + Integer i1 = ner1.indexOf(word); + Integer i2 = ner2.indexOf(word); + occur1.add(i1); + 
occur2.add(i2); + } + + + // for verbs find alignment even if no same verb lemmas, just any pair of verbs. Usually should be 0,0 + if (chunk1.getMainPOS().startsWith("VP") && chunk2.getMainPOS().startsWith("VP")) { + Integer i1 = null, i2 = null; + for(int i=0; i< pos1.size(); i++){ + if (pos1.get(i).startsWith("VB")){ + i1 = i; + break; + } + } + + for(int i=0; i< pos2.size(); i++){ + if (pos2.get(i).startsWith("VB")){ + i2 = i; + break; + } + } + occur1.add(i1); + occur2.add(i2); + } + // now we search for plausible sublists of overlaps + // if at some position correspondence is inverse (one of two position + // decreases instead of increases) + // then we terminate current alignment accum and start a new one + List<List<int[]>> overlapsPlaus = new ArrayList<List<int[]>>(); + // starts from 1, not 0 + List<int[]> accum = new ArrayList<int[]>(); + accum.add(new int[] { occur1.get(0), occur2.get(0) }); + for (int i = 1; i < occur1.size(); i++) { + + if (occur1.get(i) > occur1.get(i - 1) + && occur2.get(i) > occur2.get(i - 1)) + accum.add(new int[] { occur1.get(i), occur2.get(i) }); + else { + overlapsPlaus.add(accum); + if (occur1!=null && occur2!=null && i<occur1.size() && i<occur2.size() ){ + accum = new ArrayList<int[]>(); + accum.add(new int[] { occur1.get(i), occur2.get(i) }); + } + } + } + if (accum.size() > 0) { + overlapsPlaus.add(accum); + } + + + for (List<int[]> occur : overlapsPlaus) { + List<Integer> occr1 = new ArrayList<Integer>(), occr2 = new ArrayList<Integer>(); + for (int[] column : occur) { + occr1.add(column[0]); + occr2.add(column[1]); + } + + int ov1 = 0, ov2 = 0; // iterators over common words; + List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>(); + // we start two words before first word + int k1 = occr1.get(ov1) - 2, k2 = occr2.get(ov2) - 2; + Boolean bReachedCommonWord = false; + while (k1 < 0 || k2 < 0) { + k1++; + k2++; + } + int k1max = pos1.size() - 1, k2max = pos2.size() - 1; + while (k1 <= k1max && k2 <= k2max) { + /* // first check if the same POS + String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2)); + String lemmaMatch = lemmaFormManager.matchLemmas(ps, lem1.get(k1), + lem2.get(k2), sim); + */ + String sim = null; + List<String> sims = posManager.//similarPOS(pos1.get(k1), pos2.get(k2)); + generalize(pos1.get(k1), pos2.get(k2)); + if (!sims.isEmpty()) + sim = sims.get(0); + + String lemmaMatch = null; + List<String> lemmaMatchs = lemmaFormManager.//matchLemmas(ps, + generalize(lem1.get(k1), + lem2.get(k2)); + if (!lemmaMatchs.isEmpty()) + lemmaMatch = lemmaMatchs.get(0); + + + + if ((sim != null) + && (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch + .equals("fail")))) { + commonPOS.add(pos1.get(k1)); + + + // doing parse tree node generalization + List<ParseTreeNode> genRes = nodeGen.generalize(chunk1.getParseTreeNodes().get(k1), chunk2.getParseTreeNodes().get(k2)); + if (genRes.size()==1) + results.add(genRes.get(0)); + + if (lemmaMatch != null) { + commonLemmas.add(lemmaMatch); + // System.out.println("Added "+lemmaMatch); + if (k1 == occr1.get(ov1) && k2 == occr2.get(ov2)) + bReachedCommonWord = true; // now we can have different increment + // opera + else { + if (occr1.size() > ov1 + 1 && occr2.size() > ov2 + 1 + && k1 == occr1.get(ov1 + 1) && k2 == occr2.get(ov2 + 1)) { + ov1++; + ov2++; + bReachedCommonWord = true; + } + // else + // System.err.println("Next match reached '"+lemmaMatch+ + // "' | k1 - k2: "+k1 + " "+k2 + + // "| occur index ov1-ov2 "+ + // ov1+" "+ov2+ + // "| identified 
positions of match: occr1.get(ov1) - occr2.get(ov1) " + // + + // occr1.get(ov1) + " "+ occr2.get(ov1)); + } + } else { + commonLemmas.add("*"); + } // the same parts of speech, proceed to the next word in both + // expressions + k1++; + k2++; + + } else if (!bReachedCommonWord) { + k1++; + k2++; + } // still searching + else { + // different parts of speech, jump to the next identified common word + ov1++; + ov2++; + if (ov1 > occr1.size() - 1 || ov2 > occr2.size() - 1) + break; + // now trying to find + int kk1 = occr1.get(ov1) - 2, // new positions of iterators + kk2 = occr2.get(ov2) - 2; + int countMove = 0; + while ((kk1 < k1 + 1 || kk2 < k2 + 1) && countMove < 2) { // if it is + // behind + // current + // position, + // synchroneously + // move + // towards + // right + kk1++; + kk2++; + countMove++; + } + k1 = kk1; + k2 = kk2; + + if (k1 > k1max) + k1 = k1max; + if (k2 > k2max) + k2 = k2max; + bReachedCommonWord = false; + } + } + ParseTreeChunk currResult = new ParseTreeChunk(results), + currResultOld = new ParseTreeChunk(commonLemmas, commonPOS, 0, 0); + + + resultChunks.add(currResult); + } + + return resultChunks; + } + +}
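The alignment step above collects, for each shared NER tag (or verb anchor), its position in both phrases, and then splits those paired positions into maximal runs in which both indices strictly increase; each run is one plausible alignment, later expanded word by word into a common sub-phrase. A minimal standalone sketch of that grouping logic (class and method names are illustrative, not part of the committed code):

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    class AlignmentRuns {
        /** Splits paired positions into maximal runs where both sides strictly increase. */
        static List<List<int[]>> plausibleRuns(List<Integer> occur1, List<Integer> occur2) {
            List<List<int[]>> runs = new ArrayList<List<int[]>>();
            List<int[]> accum = new ArrayList<int[]>();
            accum.add(new int[] { occur1.get(0), occur2.get(0) });
            for (int i = 1; i < Math.min(occur1.size(), occur2.size()); i++) {
                boolean monotone = occur1.get(i) > occur1.get(i - 1)
                        && occur2.get(i) > occur2.get(i - 1);
                if (!monotone) { // order inverted: close the current run, start a new one
                    runs.add(accum);
                    accum = new ArrayList<int[]>();
                }
                accum.add(new int[] { occur1.get(i), occur2.get(i) });
            }
            runs.add(accum);
            return runs;
        }

        public static void main(String[] args) {
            List<List<int[]>> runs = plausibleRuns(Arrays.asList(0, 2, 1), Arrays.asList(1, 3, 0));
            System.out.println(runs.size()); // prints 2: runs (0,1)(2,3) and (1,0)
        }
    }

For example, occur1 = [0, 2, 1] and occur2 = [1, 3, 0] yield two runs, [(0,1), (2,3)] and [(1,0)], because the third pair breaks the monotone order.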
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreeNodeGeneralizer.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreeNodeGeneralizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreeNodeGeneralizer.java
new file mode 100644
index 0000000..8001a7b
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/ParseTreeNodeGeneralizer.java
@@ -0,0 +1,73 @@
+package opennlp.tools.parse_thicket.matching;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.parse_thicket.IGeneralizer;
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.parse_thicket.VerbNetProcessor;
+
+public class ParseTreeNodeGeneralizer implements IGeneralizer<ParseTreeNode> {
+  private LemmaGeneralizer lGen = new LemmaGeneralizer();
+  private PartOfSpeechGeneralizer posGen = new PartOfSpeechGeneralizer();
+  private VerbNetProcessor vnProc = VerbNetProcessor.getInstance(null);
+
+  @Override
+  public List<ParseTreeNode> generalize(Object o1, Object o2) {
+    List<ParseTreeNode> results = new ArrayList<ParseTreeNode>();
+    ParseTreeNode ch1 = (ParseTreeNode) o1, ch2 = (ParseTreeNode) o2;
+
+    // generalize parts of speech first; if they are incompatible, there is no
+    // node-level generalization, so return the empty list (callers check the size)
+    List<String> posGenStrList = posGen.generalize(ch1.getPos(), ch2.getPos());
+    if (posGenStrList.isEmpty())
+      return results;
+    String posGenStr = posGenStrList.get(0);
+
+    // generalize lemmas; "*" stands for "any word" when the lemmas do not match
+    ParseTreeNode newNode;
+    List<String> lemmaGen = lGen.generalize(ch1.getWord(), ch2.getWord());
+    if (!lemmaGen.isEmpty())
+      newNode = new ParseTreeNode(lemmaGen.get(0), posGenStr, "O", -1);
+    else
+      newNode = new ParseTreeNode("*", posGenStr, "O", -1);
+
+    newNode.setPhraseType(ch1.getPhraseType());
+    // TODO separate NER generalizer
+    // TODO multiword generalizer
+    if (posGenStr.startsWith("NN")) {
+      // keep the named-entity tag only if both nodes agree on it
+      if (ch1.getNe() != null && ch2.getNe() != null && ch1.getNe().equals(ch2.getNe()))
+        newNode.setNe(ch1.getNe());
+    }
+    if (posGenStr.startsWith("VB")) {
+      // for verbs, attach the VerbNet generalization as node attributes
+      List<Map<String, List<String>>> verbNetGenList = vnProc.generalize(ch1.getWord(), ch2.getWord());
+      if (verbNetGenList.size() > 0) {
+        Map<String, List<String>> verbNetGen = verbNetGenList.get(0);
+        Map<String, Object> attr = newNode.getAttributes();
+        if (attr == null)
+          attr = new HashMap<String, Object>();
+        try {
+          List<String> phrDscr = (List<String>) attr.get("phrDescr");
+          if (phrDscr != null) // de-duplicate the phrase descriptions and store them back
+            attr.put("phrDescr", new ArrayList<String>(new HashSet<String>(phrDscr)));
+        } catch (Exception e) {
+          System.err.println("Problem de-duplicating verbnet expr " + attr);
+        }
+        if (verbNetGen != null) {
+          attr.putAll(verbNetGen);
+          newNode.setAttributes(attr);
+        }
+      }
+    } else if (posGenStr.startsWith("NN")) {
+      // TODO noun-specific attribute generalization
+    }
+
+    results.add(newNode);
+    return results;
+  }
+}
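A hedged usage sketch of the node-level generalizer above; the ParseTreeNode(word, pos, ne, id) constructor and the getWord()/getPos() accessors are taken from this commit, while the demo class itself is illustrative and assumes VerbNet resources have already been registered through VerbNetProcessor.getInstance (as done in PersonalInformationExtractor.main further down):

    import java.util.List;
    import opennlp.tools.parse_thicket.ParseTreeNode;
    import opennlp.tools.parse_thicket.matching.ParseTreeNodeGeneralizer;

    class NodeGenDemo {
        public static void main(String[] args) {
            ParseTreeNodeGeneralizer nodeGen = new ParseTreeNodeGeneralizer();
            // two noun nodes with different words and no named-entity tag
            ParseTreeNode n1 = new ParseTreeNode("camera", "NN", "O", 1);
            ParseTreeNode n2 = new ParseTreeNode("printer", "NN", "O", 3);
            List<ParseTreeNode> gen = nodeGen.generalize(n1, n2);
            // NN vs NN keeps the POS; with no common lemma the word degrades to "*"
            if (!gen.isEmpty())
                System.out.println(gen.get(0).getWord() + "/" + gen.get(0).getPos()); // */NN
        }
    }

Incompatible tags (say NN vs IN) produce an empty list instead, which is why PhraseGeneralizer only consumes the result when exactly one node comes back.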
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PartOfSpeechGeneralizer.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PartOfSpeechGeneralizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PartOfSpeechGeneralizer.java
new file mode 100644
index 0000000..66efe23
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PartOfSpeechGeneralizer.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parse_thicket.matching;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.parse_thicket.IGeneralizer;
+
+public class PartOfSpeechGeneralizer implements IGeneralizer<String> {
+
+  @Override
+  public List<String> generalize(Object o1, Object o2) {
+    String pos1 = (String) o1, pos2 = (String) o2;
+    List<String> results = new ArrayList<String>();
+    String res = computeSimilarity(pos1, pos2);
+    if (res != null)
+      results.add(res);
+    return results;
+  }
+
+  private String computeSimilarity(String pos1, String pos2) {
+    // a noun and a noun phrase generalize into a noun, e.g. NN vs NP -> "NN"
+    if (pos1.startsWith("NN") && pos2.equals("NP") || pos2.startsWith("NN") && pos1.equals("NP")) {
+      return "NN";
+    }
+    // a gerund can act as a noun, e.g. NN vs VBG -> "NN"
+    if (pos1.startsWith("NN") && pos2.equals("VBG") || pos2.startsWith("NN") && pos1.equals("VBG")) {
+      return "NN";
+    }
+    // a noun and an adjectival phrase generalize into a noun
+    if (pos1.startsWith("NN") && pos2.equals("ADJP") || pos2.startsWith("NN") && pos1.equals("ADJP")) {
+      return "NN";
+    }
+    if (pos1.equals("IN") && pos2.equals("TO") || pos1.equals("TO") && pos2.equals("IN")) {
+      return "IN";
+    }
+    // VBx vs VBy = VB (the particular verb form does not matter)
+    if (pos1.startsWith("VB") && pos2.startsWith("VB")) {
+      return "VB";
+    }
+    // identical tags generalize into themselves
+    if (pos1.equalsIgnoreCase(pos2)) {
+      return pos1;
+    }
+    // otherwise ABx vs ABy gives "AB*": truncate both tags to their
+    // two-letter family and compare, e.g. JJR vs JJS -> "JJ*"
+    if (pos1.length() > 2) {
+      pos1 = pos1.substring(0, 2);
+    }
+    if (pos2.length() > 2) {
+      pos2 = pos2.substring(0, 2);
+    }
+    if (pos1.equalsIgnoreCase(pos2)) {
+      return pos1 + "*";
+    }
+    return null;
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PersonalInformationExtractor.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PersonalInformationExtractor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PersonalInformationExtractor.java new file mode 100644 index 0000000..5df0dee --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PersonalInformationExtractor.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.parse_thicket.matching; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import opennlp.tools.jsmlearning.ProfileReaderWriter; +import opennlp.tools.parse_thicket.VerbNetProcessor; + +import org.apache.commons.io.FileUtils; +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; + +public class PersonalInformationExtractor { + FrameQueryBasedIExtractor extractor = new FrameQueryBasedIExtractor(); + private ArrayList<File> queue = new ArrayList<File>(); + private Tika tika = new Tika(); + + public void runExtractor(String filename){ + String content = null; + try { + content = FileUtils.readFileToString(new File(filename)); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + + extractor.buildTemplates(new String[] { "John Doe send his California driver license 1234567 . " + + "Jill Jones received her Ohio license 4567456. ", + " Mary Poppins got her identification 8765. Jorge Malony sold his identification 9876. ", + //" President Jorge Smith of Microsoft used his id 4567. Manager John Smith of Google used his id 8765. " + " Johh Doe 123. Don Joe 1323. " + + }); + + List<GeneralizationResult> res = extractor.doIE( content); + + } + + + private void addFiles(File file) { + + try { + if (!file.exists()) { + System.out.println(file + " does not exist."); + } + if (file.isDirectory()) { + for (File f : file.listFiles()) { + try { + addFiles(f); + } catch (Exception e) { + } + } + } else { + queue.add(file); + } + } catch (Exception e) { + + } + } + + public void processDirectory(String filename, String template) throws IOException { + List<String[]> report = new ArrayList<String[]>(); + report.add(new String[]{"filename", "text", "generalization", "fired?" 
}); + String templateStr = null; + try { + + templateStr = FileUtils.readFileToString(new File(template)); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + String[] samples = templateStr.split("&"); + + extractor.buildTemplates(samples); + + addFiles(new File(filename)); + + + for (File f : queue) { + String content=null; + try { + content = tika.parseToString(f); + List<GeneralizationResult> res = extractor.doIE( content); + + for(GeneralizationResult gr: res){ + report.add(new String[]{filename, gr.getText(), gr.getGen().toString(), gr.getbFire().toString() }); + } + + } catch (TikaException e) { + System.out.println("Tika problem with file" + f.getAbsolutePath()); + } catch (Exception ee){ + ee.printStackTrace(); + } + ProfileReaderWriter.writeReport(report, "PII_report.csv"); + } + + queue.clear(); + } + + + public void runExtractor(String filename, String template){ + String content = null, templateStr = null; + try { + content = FileUtils.readFileToString(new File(filename)); + templateStr = FileUtils.readFileToString(new File(template)); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + String[] samples = templateStr.split("&"); + + extractor.buildTemplates(samples); + + List<GeneralizationResult> res = extractor.doIE( content); + List<String[]> report = new ArrayList<String[]>(); + + for(GeneralizationResult gr: res){ + report.add(new String[]{filename, gr.getText(), gr.getGen().toString(), gr.getbFire().toString() }); + } + + + } + + public static void main(String[] args){ + //String filename = "/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources/pii/agreement.txt"; + + if (args ==null || args.length!=3) + System.err.println("Usage: java -Xmx10g -jar *.jar path-to-resources path-to-file-to-analyze path-to-file-with_samples\n"); + try { + VerbNetProcessor.getInstance(args[0]); + new PersonalInformationExtractor().processDirectory( args[1], args[2]); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGeneralizer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGeneralizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGeneralizer.java new file mode 100644 index 0000000..61d5f0f --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGeneralizer.java @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.parse_thicket.matching; + +import java.util.ArrayList; +import java.util.List; + + + + + + +import opennlp.tools.parse_thicket.IGeneralizer; +import opennlp.tools.parse_thicket.ParseTreeNode; +import opennlp.tools.stemmer.PStemmer; +import opennlp.tools.textsimilarity.GeneralizationListReducer; +import opennlp.tools.textsimilarity.LemmaFormManager; +import opennlp.tools.textsimilarity.POSManager; +import opennlp.tools.textsimilarity.ParseTreeChunk; + +public class PhraseGeneralizer implements IGeneralizer<ParseTreeChunk> { + + private GeneralizationListReducer generalizationListReducer = new GeneralizationListReducer(); + protected LemmaGeneralizer lemmaFormManager = new LemmaGeneralizer(); + //protected LemmaFormManager lemmaFormManager = new LemmaFormManager(); + + + protected PartOfSpeechGeneralizer posManager = new PartOfSpeechGeneralizer(); + + protected PStemmer ps = new PStemmer(); + protected ParseTreeNodeGeneralizer nodeGen = new ParseTreeNodeGeneralizer(); + + /** + * key matching function which takes two phrases, aligns them and finds a set + * of maximum common sub-phrase + * + * @param chunk1 + * @param chunk2 + * @return + */ + @Override + public List<ParseTreeChunk> generalize( + Object chunk1o, Object chunk2o) { + + ParseTreeChunk chunk1 = (ParseTreeChunk)chunk1o, chunk2 = (ParseTreeChunk)chunk2o; + + List<ParseTreeChunk> resultChunks = new ArrayList<ParseTreeChunk>(); + + + List<String> pos1 = chunk1.getPOSs(); + List<String> pos2 = chunk2.getPOSs(); + List<String> lem1 = chunk1.getLemmas(); + List<String> lem2 = chunk2.getLemmas(); + + List<String> lem1stem = new ArrayList<String>(); + List<String> lem2stem = new ArrayList<String>(); + + + for (String word : lem1) { + try { + lem1stem.add(ps.stem(word.toLowerCase()).toString()); + } catch (Exception e) { + // e.printStackTrace(); + + if (word.length() > 2) + System.err.println("Unable to stem: " + word); + } + } + try { + for (String word : lem2) { + lem2stem.add(ps.stem(word.toLowerCase()).toString()); + } + } catch (Exception e) { + System.err.println("problem processing word " + lem2.toString()); + } + + List<String> overlap = new ArrayList<String>(lem1stem); + overlap.retainAll(lem2stem); + + if (overlap == null || overlap.size() < 1) + return null; + + // to accumulate starts of alignments + List<Integer> occur1 = new ArrayList<Integer>(), occur2 = new ArrayList<Integer>(); + + // for verbs find alignment even if no same verb lemmas, just any pair of verbs. 
Usually should be 0,0 + if (chunk1.getMainPOS().startsWith("VP") && chunk2.getMainPOS().startsWith("VP")) { + Integer i1 = null, i2 = null; + for(int i=0; i< pos1.size(); i++){ + if (pos1.get(i).startsWith("VB")){ + i1 = i; + break; + } + } + + for(int i=0; i< pos2.size(); i++){ + if (pos2.get(i).startsWith("VB")){ + i2 = i; + break; + } + } + if (i1!=null) + occur1.add(i1); + if (i2!=null) + occur2.add(i2); + } + + + for (String word : overlap) { + Integer i1 = lem1stem.indexOf(word); + Integer i2 = lem2stem.indexOf(word); + occur1.add(i1); + occur2.add(i2); + } + + + // now we search for plausible sublists of overlaps + // if at some position correspondence is inverse (one of two position + // decreases instead of increases) + // then we terminate current alignment accum and start a new one + List<List<int[]>> overlapsPlaus = new ArrayList<List<int[]>>(); + // starts from 1, not 0 + List<int[]> accum = new ArrayList<int[]>(); + accum.add(new int[] { occur1.get(0), occur2.get(0) }); + for (int i = 1; i < occur1.size() && i< occur2.size(); i++) { + + if (occur1.get(i) > occur1.get(i - 1) + && occur2.get(i) > occur2.get(i - 1)) + accum.add(new int[] { occur1.get(i), occur2.get(i) }); + else { + overlapsPlaus.add(accum); + accum = new ArrayList<int[]>(); + accum.add(new int[] { occur1.get(i), occur2.get(i) }); + } + } + if (accum.size() > 0) { + overlapsPlaus.add(accum); + } + + + for (List<int[]> occur : overlapsPlaus) { + List<ParseTreeNode> results = new ArrayList<ParseTreeNode>(); + List<Integer> occr1 = new ArrayList<Integer>(), occr2 = new ArrayList<Integer>(); + for (int[] column : occur) { + occr1.add(column[0]); + occr2.add(column[1]); + } + + int ov1 = 0, ov2 = 0; // iterators over common words; + List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>(); + // we start two words before first word + int k1 = occr1.get(ov1) - 2, k2 = occr2.get(ov2) - 2; + // if (k1<0) k1=0; if (k2<0) k2=0; + Boolean bReachedCommonWord = false; + while (k1 < 0 || k2 < 0) { + k1++; + k2++; + } + int k1max = pos1.size() - 1, k2max = pos2.size() - 1; + while (k1 <= k1max && k2 <= k2max) { + // first check if the same POS + String sim = null; + List<String> sims = posManager.//similarPOS(pos1.get(k1), pos2.get(k2)); + generalize(pos1.get(k1), pos2.get(k2)); + if (!sims.isEmpty()) + sim = sims.get(0); + + String lemmaMatch = null; + List<String> lemmaMatchs = lemmaFormManager.//matchLemmas(ps, + generalize(lem1.get(k1), + lem2.get(k2)); + if (!lemmaMatchs.isEmpty()) + lemmaMatch = lemmaMatchs.get(0); + + + if ((sim != null) + && (lemmaMatch == null || (lemmaMatch != null ))) { + commonPOS.add(pos1.get(k1)); + + + // doing parse tree node generalization + List<ParseTreeNode> genRes = nodeGen.generalize(chunk1.getParseTreeNodes().get(k1), chunk2.getParseTreeNodes().get(k2)); + if (genRes.size()==1) + results.add(genRes.get(0)); + + if (lemmaMatch != null) { + commonLemmas.add(lemmaMatch); + // System.out.println("Added "+lemmaMatch); + if (k1 == occr1.get(ov1) && k2 == occr2.get(ov2)) + bReachedCommonWord = true; // now we can have different increment + // opera + else { + if (occr1.size() > ov1 + 1 && occr2.size() > ov2 + 1 + && k1 == occr1.get(ov1 + 1) && k2 == occr2.get(ov2 + 1)) { + ov1++; + ov2++; + bReachedCommonWord = true; + } + } + } else { + commonLemmas.add("*"); + } // the same parts of speech, proceed to the next word in both + // expressions + k1++; + k2++; + + } else if (!bReachedCommonWord) { + k1++; + k2++; + } // still searching + else { + // different parts 
of speech, jump to the next identified common word + ov1++; + ov2++; + if (ov1 > occr1.size() - 1 || ov2 > occr2.size() - 1) + break; + // now trying to find + int kk1 = occr1.get(ov1) - 2, // new positions of iterators + kk2 = occr2.get(ov2) - 2; + int countMove = 0; + while ((kk1 < k1 + 1 || kk2 < k2 + 1) && countMove < 2) { // if it is + // behind + // current + // position, + // synchroneously + // move + // towards + // right + kk1++; + kk2++; + countMove++; + } + k1 = kk1; + k2 = kk2; + + if (k1 > k1max) + k1 = k1max; + if (k2 > k2max) + k2 = k2max; + bReachedCommonWord = false; + } + } + ParseTreeChunk currResult = new ParseTreeChunk(results); + //currResultOld = new ParseTreeChunk(commonLemmas, commonPOS, 0, 0); + + + resultChunks.add(currResult); + } + + return resultChunks; + } + +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGroupGeneralizer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGroupGeneralizer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGroupGeneralizer.java new file mode 100644 index 0000000..094f093 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/PhraseGroupGeneralizer.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.parse_thicket.matching; + +import java.util.ArrayList; +import java.util.List; + +import opennlp.tools.parse_thicket.IGeneralizer; +import opennlp.tools.parse_thicket.ParseTreeNode; +import opennlp.tools.stemmer.PStemmer; +import opennlp.tools.textsimilarity.GeneralizationListReducer; +import opennlp.tools.textsimilarity.LemmaFormManager; +import opennlp.tools.textsimilarity.POSManager; +import opennlp.tools.textsimilarity.ParseTreeChunk; + +public class PhraseGroupGeneralizer implements IGeneralizer<List<ParseTreeChunk>>{ + + private GeneralizationListReducer generalizationListReducer = new GeneralizationListReducer(); + + private LemmaFormManager lemmaFormManager = new LemmaFormManager(); + + private POSManager posManager = new POSManager(); + + private PhraseGeneralizer pGen = new PhraseGeneralizer(); + private NERPhraseGeneralizer pGenNER = new NERPhraseGeneralizer(); + + /** + * main function to generalize two expressions grouped by phrase types returns + * a list of generalizations for each phrase type with filtered + * sub-expressions + * + * @param sent1 + * @param sent2 + * @return List<List<ParseTreeChunk>> list of list of POS-words pairs for each + * resultant matched / overlapped phrase + */ + @Override + public List<List<ParseTreeChunk>> generalize(Object o1, Object o2) { + + + List<List<ParseTreeChunk>> sent1 = (List<List<ParseTreeChunk>>)o1, + sent2 = (List<List<ParseTreeChunk>>) o2 ; + List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>(); + // first iterate through component + for (int comp = 0; comp < 2 && // just np & vp + comp < sent1.size() && comp < sent2.size(); comp++) { + List<ParseTreeChunk> resultComps = new ArrayList<ParseTreeChunk>(); + // then iterate through each phrase in each component + // first try lemma-based alignment + for (ParseTreeChunk ch1 : sent1.get(comp)) { + for (ParseTreeChunk ch2 : sent2.get(comp)) { // simpler version + List<ParseTreeChunk> chunkToAdd=null; + try { + chunkToAdd = pGen.generalize(ch1, ch2); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if (chunkToAdd == null){ + chunkToAdd = new ArrayList<ParseTreeChunk>(); + } + Boolean alreadyThere = false; + for (ParseTreeChunk chunk : resultComps) { + if (chunkToAdd.contains(chunk)) { + alreadyThere = true; + break; + } + } + + if (!alreadyThere && chunkToAdd != null && chunkToAdd.size() > 0) { + resultComps.addAll(chunkToAdd); + } + + } + } // then try NER-based alignment + if (comp==0 || resultComps.size()<1){ + for (ParseTreeChunk ch1 : sent1.get(comp)) { + for (ParseTreeChunk ch2 : sent2.get(comp)) { // simpler version + List<ParseTreeChunk> chunkToAdd = pGenNER.generalize( + ch1, ch2); + + if (chunkToAdd == null){ + chunkToAdd = new ArrayList<ParseTreeChunk>(); + } + + Boolean alreadyThere = false; + for (ParseTreeChunk chunk : resultComps) { + if (chunkToAdd.contains(chunk)) { + alreadyThere = true; + break; + } + } + + if (!alreadyThere && chunkToAdd != null && chunkToAdd.size() > 0) { + resultComps.addAll(chunkToAdd); + } + + } + } + } + + List<ParseTreeChunk> resultCompsRed = generalizationListReducer.applyFilteringBySubsumption(resultComps); + + resultComps = resultCompsRed; + results.add(resultComps); + } + + return results; + } + + + +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/AbstractEngineRunner.java ---------------------------------------------------------------------- diff 
--git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/AbstractEngineRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/AbstractEngineRunner.java new file mode 100644 index 0000000..ba8a140 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/AbstractEngineRunner.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.parse_thicket.opinion_processor; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.StringUtils; + +import opennlp.tools.jsmlearning.ProfileReaderWriter; +import opennlp.tools.parse_thicket.ParseTreeNode; +import opennlp.tools.textsimilarity.ParseTreeChunk; + +public class AbstractEngineRunner { + private List<File> queue; + private final static String reviewSource = "/Users/bgalitsky/Documents/relevance-based-on-parse-trees/src/test/resources/opinions/macbook_pro.txt"; + NamedEntityExtractor neExtractor = new NamedEntityExtractor(); + + public void processJSONfileWithReviews(){ + List<String[]> report = new ArrayList<String[]>(); + report.add(new String[] { "text", "phrases of potential interest list" , }); + + + String content=null; + try { + content = FileUtils.readFileToString(new File(reviewSource)); + } catch (IOException e) { + e.printStackTrace(); + } + String[] texts = StringUtils.substringsBetween(content, "reviewText\": \"", "\", \"overall"); + for(String text: texts){ + EntityExtractionResult result = neExtractor.extractEntities(text); + report.add(new String[]{text}); + //report.add((String[])result.extractedNERWords.toArray(new String[0])); + //report.add((String[])result.extractedSentimentPhrases.toArray(new String[0])); + List<String> stringPhrases = new ArrayList<String>(), + nodePhrases = new ArrayList<String>(); + for(List<ParseTreeNode> chList: result.extractedSentimentPhrases){ + String buf = "", nodeBuf=""; + for(ParseTreeNode ch: chList){ + buf+=ch.getWord()+ " "; + nodeBuf+=ch.toString()+ " "; + } + stringPhrases.add(buf.trim()); + nodePhrases.add(nodeBuf.trim()); + } + report.add((String[])stringPhrases.toArray(new String[0])); + report.add((String[])nodePhrases.toArray(new String[0])); + report.add(new String[]{"-----------------------------"}); + ProfileReaderWriter.writeReport(report, "nameEntitiesTopicsOfInterestExtracted.csv"); + } + } + + // this func collects files + private void addFiles(File file) { + + if (!file.exists()) { + System.out.println(file + " does not exist."); + } + if (file.isDirectory()) { + for (File f : file.listFiles()) { + if (f.getName().startsWith(".")) + 
continue; + addFiles(f); + System.out.println(f.getName()); + } + } else { + queue.add(file); + + } + } + + public static void main(String[] args){ + AbstractEngineRunner runner = new AbstractEngineRunner(); + runner.processJSONfileWithReviews(); + + } +} + +/* + public void processDirectory(String path){ + List<String[]> report = new ArrayList<String[]>(); + report.add(new String[] { "filename", "named entity list", "phrases of potential interest list" }); + + List<String> allNamedEntities = new ArrayList<String>(); + + addFiles(new File(path)); + for(File f: queue){ + List<String> entities = (List<String>) extractEntities(f.getAbsolutePath()).getFirst(); + List<String> opinions = (List<String>) extractEntities(f.getAbsolutePath()).getSecond(); + report.add(new String[]{ f.getName(), entities.toString(), opinions.toString()}); + ProfileReaderWriter.writeReport(report, "nameEntitiesExtracted.csv"); + + allNamedEntities.addAll(entities); + + allNamedEntities = new ArrayList<String>(new HashSet<String> (allNamedEntities )); + + + } + ProfileReaderWriter.writeReport(report, "nameEntitiesTopicsOfInterestExtracted.csv"); + } +} */ http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/DefaultSentimentProcessor.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/DefaultSentimentProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/DefaultSentimentProcessor.java new file mode 100644 index 0000000..44a3640 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/DefaultSentimentProcessor.java @@ -0,0 +1,523 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.parse_thicket.opinion_processor; + +import java.io.IOException; +import java.util.List; + +import edu.stanford.nlp.util.logging.Redwood; + +import java.util.Iterator; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Properties; +import java.util.logging.Logger; + +import org.ejml.simple.SimpleMatrix; + +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.Label; +import edu.stanford.nlp.ling.LabeledWord; +import edu.stanford.nlp.ling.TaggedWord; +import edu.stanford.nlp.ling.WordLemmaTag; +import edu.stanford.nlp.ling.WordTag; +import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.sentiment.SentimentCoreAnnotations.SentimentAnnotatedTree; +import edu.stanford.nlp.sentiment.SentimentUtils; +import edu.stanford.nlp.trees.MemoryTreebank; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.TreeCoreAnnotations; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.sentiment.SentimentCoreAnnotations; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.util.CoreMap; + +public class DefaultSentimentProcessor { + /** A logger for this class */ + private static final Logger log = Logger + .getLogger("opennlp.tools.parse_thicket.opinion_processor.DefaultSentimentProcessor"); + + private static final NumberFormat NF = new DecimalFormat("0.0000"); + + enum Output { + PENNTREES, VECTORS, ROOT, PROBABILITIES + } + + enum Input { + TEXT, TREES + } + + /** + * Sets the labels on the tree (except the leaves) to be the integer + * value of the sentiment prediction. Makes it easy to print out + * with Tree.toString() + */ + static void setSentimentLabels(Tree tree) { + if (tree.isLeaf()) { + return; + } + + for (Tree child : tree.children()) { + setSentimentLabels(child); + } + + Label label = tree.label(); + if (!(label instanceof CoreLabel)) { + throw new IllegalArgumentException("Required a tree with CoreLabels"); + } + CoreLabel cl = (CoreLabel) label; + cl.setValue(Integer.toString(RNNCoreAnnotations.getPredictedClass(tree))); + } + + /** + * Sets the labels on the tree to be the indices of the nodes. + * Starts counting at the root and does a postorder traversal. + */ + static int setIndexLabels(Tree tree, int index) { + if (tree.isLeaf()) { + return index; + } + + tree.label().setValue(Integer.toString(index)); + index++; + for (Tree child : tree.children()) { + index = setIndexLabels(child, index); + } + return index; + } + + /** + * Outputs the vectors from the tree. Counts the tree nodes the + * same as setIndexLabels. 
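+ * Each output line has the form " nodeIndex: v0 v1 ...": the node index
+ * assigned by setIndexLabels, followed by that node's vector values
+ * formatted to four decimal places (see {@code NF}).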
+ */ + static int outputTreeVectors(PrintStream out, Tree tree, int index) { + if (tree.isLeaf()) { + return index; + } + + out.print(" " + index + ":"); + SimpleMatrix vector = RNNCoreAnnotations.getNodeVector(tree); + for (int i = 0; i < vector.getNumElements(); ++i) { + out.print(" " + NF.format(vector.get(i))); + } + out.println(); + index++; + for (Tree child : tree.children()) { + index = outputTreeVectors(out, child, index); + } + return index; + } + + /** + * Outputs the scores from the tree. Counts the tree nodes the + * same as setIndexLabels. + */ + static int outputTreeScores(PrintStream out, Tree tree, int index) { + if (tree.isLeaf()) { + return index; + } + + out.print(" " + index + ":"); + SimpleMatrix vector = RNNCoreAnnotations.getPredictions(tree); + for (int i = 0; i < vector.getNumElements(); ++i) { + out.print(" " + NF.format(vector.get(i))); + } + out.println(); + index++; + for (Tree child : tree.children()) { + index = outputTreeScores(out, child, index); + } + return index; + } + + public static <T> String wordToString(T o, final boolean justValue) { + return wordToString(o, justValue, null); + } + + public static <T> String wordToString(T o, final boolean justValue, + final String separator) { + if (justValue && o instanceof Label) { + if (o instanceof CoreLabel) { + CoreLabel l = (CoreLabel) o; + String w = l.value(); + if (w == null) + w = l.word(); + return w; + } else { + return (((Label) o).value()); + } + } else if (o instanceof CoreLabel) { + CoreLabel l = ((CoreLabel) o); + String w = l.value(); + if (w == null) + w = l.word(); + if (l.tag() != null) { + if (separator == null) { + return w + CoreLabel.TAG_SEPARATOR + l.tag(); + } else { + return w + separator + l.tag(); + } + } + return w; + // an interface that covered these next four cases would be + // nice, but we're moving away from these data types anyway + } else if (separator != null && o instanceof TaggedWord) { + return ((TaggedWord) o).toString(separator); + } else if (separator != null && o instanceof LabeledWord) { + return ((LabeledWord) o).toString(); + } else if (separator != null && o instanceof WordLemmaTag) { + return ((WordLemmaTag) o).toString(separator); + } else if (separator != null && o instanceof WordTag) { + return ((WordTag) o).toString(separator); + } else { + return (o.toString()); + } + } + + + /** + * Returns the sentence as a string with a space between words. + * It prints out the {@code value()} of each item - + * this will give the expected answer for a short form representation + * of the "sentence" over a range of cases. It is equivalent to + * calling {@code toString(true)}. + * + * TODO: Sentence used to be a subclass of ArrayList, with this + * method as the toString. Therefore, there may be instances of + * ArrayList being printed that expect this method to be used. + * + * @param list The tokenized sentence to print out + * @return The tokenized sentence as a String + */ + public static <T> String listToString(List<T> list) { + return listToString(list, true); + } + /** + * Returns the sentence as a string with a space between words. + * Designed to work robustly, even if the elements stored in the + * 'Sentence' are not of type Label. + * + * This one uses the default separators for any word type that uses + * separators, such as TaggedWord. 
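+ * For example (hypothetical tokens), a two-element list of TaggedWord items
+ * [the/DT, dog/NN] prints as "the/DT dog/NN" when justValue is false, and as
+ * "the dog" when justValue is true.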
+ * + * @param list The tokenized sentence to print out + * @param justValue If {@code true} and the elements are of type + * {@code Label}, return just the + * {@code value()} of the {@code Label} of each word; + * otherwise, + * call the {@code toString()} method on each item. + * @return The sentence in String form + */ + public static <T> String listToString(List<T> list, final boolean justValue) { + return listToString(list, justValue, null); + } + + /** + * As already described, but if separator is not null, then objects + * such as TaggedWord + * + * @param separator The string used to separate Word and Tag + * in TaggedWord, etc + */ + public static <T> String listToString(List<T> list, final boolean justValue, + final String separator) { + StringBuilder s = new StringBuilder(); + for (Iterator<T> wordIterator = list.iterator(); wordIterator.hasNext();) { + T o = wordIterator.next(); + s.append(wordToString(o, justValue, separator)); + if (wordIterator.hasNext()) { + s.append(' '); + } + } + return s.toString(); + } + + /** + * Outputs a tree using the output style requested + */ + static void outputTree(PrintStream out, CoreMap sentence, List<Output> outputFormats) { + Tree tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class); + for (Output output : outputFormats) { + switch (output) { + case PENNTREES: { + Tree copy = tree.deepCopy(); + setSentimentLabels(copy); + out.println(copy); + break; + } + case VECTORS: { + Tree copy = tree.deepCopy(); + setIndexLabels(copy, 0); + out.println(copy); + outputTreeVectors(out, tree, 0); + break; + } + case ROOT: { + out.println(" " + sentence.get(SentimentCoreAnnotations.SentimentClass.class)); + break; + } + case PROBABILITIES: { + Tree copy = tree.deepCopy(); + setIndexLabels(copy, 0); + out.println(copy); + outputTreeScores(out, tree, 0); + break; + } + default: + throw new IllegalArgumentException("Unknown output format " + output); + } + } + } + + /** + * Reads an annotation from the given filename using the requested input. 
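+ * For TEXT input, the tokenizer splits the file into sentences and wraps each
+ * sentence in its own single-sentence Annotation; for TREES input, each tree
+ * read from the file is wrapped the same way, so either path yields one
+ * Annotation per sentence.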
+ */ + public static List<Annotation> getAnnotations(StanfordCoreNLP tokenizer, Input inputFormat, String filename, boolean filterUnknown) { + switch (inputFormat) { + case TEXT: { + String text = IOUtils.slurpFileNoExceptions(filename); + Annotation annotation = new Annotation(text); + tokenizer.annotate(annotation); + List<Annotation> annotations = Generics.newArrayList(); + for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + Annotation nextAnnotation = new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class)); + nextAnnotation.set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence)); + annotations.add(nextAnnotation); + } + return annotations; + } + case TREES: { + List<Tree> trees; + if (filterUnknown) { + trees = SentimentUtils.readTreesWithGoldLabels(filename); + trees = SentimentUtils.filterUnknownRoots(trees); + } else { + trees = Generics.newArrayList(); + MemoryTreebank treebank = new MemoryTreebank("utf-8"); + treebank.loadPath(filename, null); + for (Tree tree : treebank) { + trees.add(tree); + } + } + + List<Annotation> annotations = Generics.newArrayList(); + for (Tree tree : trees) { + CoreMap sentence = new Annotation(listToString(tree.yield())); + sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree); + List<CoreMap> sentences = Collections.singletonList(sentence); + Annotation annotation = new Annotation(""); + annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences); + annotations.add(annotation); + } + return annotations; + } + default: + throw new IllegalArgumentException("Unknown format " + inputFormat); + } + } + + /** Runs the tree-based sentiment model on some text. */ + public void processTextWithArgs(String[] args) throws IOException { + String parserModel = null; + String sentimentModel = null; + + String filename = null; + String fileList = null; + boolean stdin = false; + + boolean filterUnknown = false; + + List<Output> outputFormats = Collections.singletonList(Output.ROOT); + Input inputFormat = Input.TEXT; + + String tlppClass = "DEFAULT_TLPP_CLASS"; + + for (int argIndex = 0; argIndex < args.length; ) { + if (args[argIndex].equalsIgnoreCase("-sentimentModel")) { + sentimentModel = args[argIndex + 1]; + argIndex += 2; + } else if (args[argIndex].equalsIgnoreCase("-parserModel")) { + parserModel = args[argIndex + 1]; + argIndex += 2; + } else if (args[argIndex].equalsIgnoreCase("-file")) { + filename = args[argIndex + 1]; + argIndex += 2; + } else if (args[argIndex].equalsIgnoreCase("-fileList")) { + fileList = args[argIndex + 1]; + argIndex += 2; + } else if (args[argIndex].equalsIgnoreCase("-stdin")) { + stdin = true; + argIndex++; + } else if (args[argIndex].equalsIgnoreCase("-input")) { + inputFormat = Input.valueOf(args[argIndex + 1].toUpperCase()); + argIndex += 2; + } else if (args[argIndex].equalsIgnoreCase("-output")) { + String[] formats = args[argIndex + 1].split(","); + outputFormats = new ArrayList<>(); + for (String format : formats) { + outputFormats.add(Output.valueOf(format.toUpperCase())); + } + argIndex += 2; + } else if (args[argIndex].equalsIgnoreCase("-filterUnknown")) { + filterUnknown = true; + argIndex++; + } else if (args[argIndex].equalsIgnoreCase("-tlppClass")) { + tlppClass = args[argIndex + 1]; + argIndex += 2; + } else if (args[argIndex].equalsIgnoreCase("-help")) { + System.exit(0); + } else { + log.info("Unknown argument " + args[argIndex + 1]); + throw new IllegalArgumentException("Unknown argument " + args[argIndex + 1]); + } + } + + 
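+    // Example invocation (hypothetical paths; flags as parsed above):
+    //   -file reviews.txt -output ROOT,PENNTREES -sentimentModel sentiment.ser.gz
+    //   -input TREES -file trees.mrg -filterUnknown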
// We construct two pipelines. One handles tokenization, if + // necessary. The other takes tokenized sentences and converts + // them to sentiment trees. + Properties pipelineProps = new Properties(); + Properties tokenizerProps = null; + if (sentimentModel != null) { + pipelineProps.setProperty("sentiment.model", sentimentModel); + } + if (parserModel != null) { + pipelineProps.setProperty("parse.model", parserModel); + } + if (inputFormat == Input.TREES) { + pipelineProps.setProperty("annotators", "binarizer, sentiment"); + pipelineProps.setProperty("customAnnotatorClass.binarizer", "edu.stanford.nlp.pipeline.BinarizerAnnotator"); + pipelineProps.setProperty("binarizer.tlppClass", tlppClass); + pipelineProps.setProperty("enforceRequirements", "false"); + } else { + pipelineProps.setProperty("annotators", "parse, sentiment"); + pipelineProps.setProperty("enforceRequirements", "false"); + tokenizerProps = new Properties(); + tokenizerProps.setProperty("annotators", "tokenize, ssplit"); + } + + if (stdin && tokenizerProps != null) { + tokenizerProps.setProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "true"); + } + + int count = 0; + if (filename != null) count++; + if (fileList != null) count++; + if (stdin) count++; + if (count > 1) { + throw new IllegalArgumentException("Please only specify one of -file, -fileList or -stdin"); + } + if (count == 0) { + throw new IllegalArgumentException("Please specify either -file, -fileList or -stdin"); + } + + StanfordCoreNLP tokenizer = (tokenizerProps == null) ? null : new StanfordCoreNLP(tokenizerProps); + StanfordCoreNLP pipeline = new StanfordCoreNLP(pipelineProps); + + if (filename != null) { + // Process a file. The pipeline will do tokenization, which + // means it will split it into sentences as best as possible + // with the tokenizer. + List<Annotation> annotations = getAnnotations(tokenizer, inputFormat, filename, filterUnknown); + for (Annotation annotation : annotations) { + pipeline.annotate(annotation); + + for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + System.out.println(sentence); + outputTree(System.out, sentence, outputFormats); + } + } + } else if (fileList != null) { + // Process multiple files. The pipeline will do tokenization, + // which means it will split it into sentences as best as + // possible with the tokenizer. Output will go to filename.out + // for each file. + for (String file : fileList.split(",")) { + List<Annotation> annotations = getAnnotations(tokenizer, inputFormat, file, filterUnknown); + FileOutputStream fout = new FileOutputStream(file + ".out"); + PrintStream pout = new PrintStream(fout); + for (Annotation annotation : annotations) { + pipeline.annotate(annotation); + + for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + pout.println(sentence); + outputTree(pout, sentence, outputFormats); + } + } + pout.flush(); + fout.close(); + } + } else { + // Process stdin. Each line will be treated as a single sentence. + log.info("Reading in text from stdin."); + log.info("Please enter one sentence per line."); + log.info("Processing will end when EOF is reached."); + BufferedReader reader = IOUtils.readerFromStdin("utf-8"); + + for (String line; (line = reader.readLine()) != null; ) { + line = line.trim(); + if ( ! 
line.isEmpty()) { + Annotation annotation = tokenizer.process(line); + pipeline.annotate(annotation); + for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + outputTree(System.out, sentence, outputFormats); + } + } else { + // Output blank lines for blank lines so the tool can be + // used for line-by-line text processing + System.out.println(); + } + } + + } + } + + public float getNumericSentimentValue(String expression) { + Properties props = new Properties(); + props.setProperty("annotators", "tokenize, ssplit, parse, sentiment"); + StanfordCoreNLP pipeline = new StanfordCoreNLP(props); + int mainSentiment = 0; + if (expression != null && expression.length() > 0) { + int longest = 0; + Annotation annotation = pipeline.process(expression); + for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + Tree tree = sentence.get(SentimentAnnotatedTree.class); + int sentiment = RNNCoreAnnotations.getPredictedClass(tree); + String partText = sentence.toString(); + if (partText.length() > longest) { + mainSentiment = sentiment; + longest = partText.length(); + } + } + } + return mainSentiment; + } +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/EntityExtractionResult.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/EntityExtractionResult.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/EntityExtractionResult.java new file mode 100644 index 0000000..69eae1d --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/EntityExtractionResult.java @@ -0,0 +1,158 @@ +package opennlp.tools.parse_thicket.opinion_processor; + +import java.util.List; + +import opennlp.tools.parse_thicket.ParseTreeNode; +import opennlp.tools.similarity.apps.HitBase; + +public class EntityExtractionResult { + List<List<ParseTreeNode>> extractedNERs; + public List<String> extractedNERWords; + // phrases w/sentiments + public List<List<ParseTreeNode>> extractedSentimentPhrases; + public List<String> extractedSentimentPhrasesStr; + // phrases w/o sentiments + public List<List<ParseTreeNode>> extractedNONSentimentPhrases; + public List<String> extractedNONSentimentPhrasesStr; + public List<Float> sentimentProfile; + + + public List<String> getExtractedSentimentPhrasesStr() { + return extractedSentimentPhrasesStr; + } + + public void setExtractedSentimentPhrasesStr(List<String> extractedSentimentPhrasesStr) { + this.extractedSentimentPhrasesStr = extractedSentimentPhrasesStr; + } + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/EntityExtractionResult.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/EntityExtractionResult.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/EntityExtractionResult.java
new file mode 100644
index 0000000..69eae1d
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/EntityExtractionResult.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.util.List;
+
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.similarity.apps.HitBase;
+
+// plain holder for the output of entity/phrase extraction
+public class EntityExtractionResult {
+  List<List<ParseTreeNode>> extractedNERs;
+  public List<String> extractedNERWords;
+  // phrases with sentiments
+  public List<List<ParseTreeNode>> extractedSentimentPhrases;
+  public List<String> extractedSentimentPhrasesStr;
+  // phrases without sentiments
+  public List<List<ParseTreeNode>> extractedNONSentimentPhrases;
+  public List<String> extractedNONSentimentPhrasesStr;
+  public List<Float> sentimentProfile;
+  public List<HitBase> hits;
+  private List<List<ParseTreeNode>> extractedNerPhrases;
+  private List<String> extractedNerPhrasesStr;
+  private List<String> extractedNerPhraseTags;
+  private List<List<ParseTreeNode>> extractedNerExactPhrases;
+  private List<String> extractedNerExactStr;
+
+  public List<String> getExtractedSentimentPhrasesStr() {
+    return extractedSentimentPhrasesStr;
+  }
+
+  public void setExtractedSentimentPhrasesStr(List<String> extractedSentimentPhrasesStr) {
+    this.extractedSentimentPhrasesStr = extractedSentimentPhrasesStr;
+  }
+
+  public List<List<ParseTreeNode>> getExtractedNONSentimentPhrases() {
+    return extractedNONSentimentPhrases;
+  }
+
+  public void setExtractedNONSentimentPhrases(List<List<ParseTreeNode>> extractedNONSentimentPhrases) {
+    this.extractedNONSentimentPhrases = extractedNONSentimentPhrases;
+  }
+
+  public List<String> getExtractedNONSentimentPhrasesStr() {
+    return extractedNONSentimentPhrasesStr;
+  }
+
+  public void setExtractedNONSentimentPhrasesStr(List<String> extractedNONSentimentPhrasesStr) {
+    this.extractedNONSentimentPhrasesStr = extractedNONSentimentPhrasesStr;
+  }
+
+  public void setExtractedNERWords(List<String> extractedNERWords) {
+    this.extractedNERWords = extractedNERWords;
+  }
+
+  public void setExtractedSentimentPhrases(List<List<ParseTreeNode>> extractedSentimentPhrases) {
+    this.extractedSentimentPhrases = extractedSentimentPhrases;
+  }
+
+  public void setExtractedNER(List<List<ParseTreeNode>> extractedNERs) {
+    this.extractedNERs = extractedNERs;
+  }
+
+  public void setGossipHits(List<HitBase> hitsForAnEntity) {
+    hits = hitsForAnEntity;
+  }
+
+  public List<List<ParseTreeNode>> getExtractedNERs() {
+    return extractedNERs;
+  }
+
+  public void setExtractedNERs(List<List<ParseTreeNode>> extractedNERs) {
+    this.extractedNERs = extractedNERs;
+  }
+
+  public List<HitBase> getHits() {
+    return hits;
+  }
+
+  public void setHits(List<HitBase> hits) {
+    this.hits = hits;
+  }
+
+  public List<String> getExtractedNERWords() {
+    return extractedNERWords;
+  }
+
+  public List<List<ParseTreeNode>> getExtractedSentimentPhrases() {
+    return extractedSentimentPhrases;
+  }
+
+  public void setSentimentProfile(List<Float> sentimentProfile) {
+    this.sentimentProfile = sentimentProfile;
+  }
+
+  public List<Float> getSentimentProfile() {
+    return sentimentProfile;
+  }
+
+  public void setExtractedNerPhrases(List<List<ParseTreeNode>> extractedNerPhrases) {
+    this.extractedNerPhrases = extractedNerPhrases;
+  }
+
+  public void setExtractedNerPhrasesStr(List<String> extractedNerPhrasesStr) {
+    this.extractedNerPhrasesStr = extractedNerPhrasesStr;
+  }
+
+  public List<List<ParseTreeNode>> getExtractedNerPhrases() {
+    return extractedNerPhrases;
+  }
+
+  public List<String> getExtractedNerPhrasesStr() {
+    return extractedNerPhrasesStr;
+  }
+
+  public void setExtractedNerPhraseTags(List<String> extractedNerPhraseTags) {
+    this.extractedNerPhraseTags = extractedNerPhraseTags;
+  }
+
+  public List<String> getExtractedNerPhraseTags() {
+    return this.extractedNerPhraseTags;
+  }
+
+  public void setExtractedNerExactPhrases(List<List<ParseTreeNode>> extractedNerExactPhrases) {
+    this.extractedNerExactPhrases = extractedNerExactPhrases;
+  }
+
+  public void setExtractedNerExactStr(List<String> extractedNerExactStr) {
+    this.extractedNerExactStr = extractedNerExactStr;
+  }
+
+  public List<List<ParseTreeNode>> getExtractedNerExactPhrases() {
+    return extractedNerExactPhrases;
+  }
+
+  public List<String> getExtractedNerExactStr() {
+    return extractedNerExactStr;
+  }
+}
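EntityExtractionResult above is a plain data holder with no behavior of its own. A minimal usage sketch follows; the demo class and the sample values are hypothetical, not part of this patch.

package opennlp.tools.parse_thicket.opinion_processor;

import java.util.Arrays;

// Hypothetical demo: fill and read the holder as an extractor would.
public class EntityExtractionResultDemo {
  public static void main(String[] args) {
    EntityExtractionResult result = new EntityExtractionResult();
    // sample values only; a real extractor would set these from parse thickets
    result.setExtractedNERWords(Arrays.asList("Mexico"));
    result.setExtractedSentimentPhrasesStr(Arrays.asList("poor president"));
    result.setSentimentProfile(Arrays.asList(1.0f)); // one value per sentence, on the 0..4 scale
    for (String phrase : result.getExtractedSentimentPhrasesStr()) {
      System.out.println("sentiment phrase: " + phrase);
    }
  }
}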
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/ExpressionSentimentAnalyzer.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/ExpressionSentimentAnalyzer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/ExpressionSentimentAnalyzer.java
new file mode 100644
index 0000000..dc89d8b
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/ExpressionSentimentAnalyzer.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.util.Properties;
+
+import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations;
+import edu.stanford.nlp.pipeline.Annotation;
+import edu.stanford.nlp.pipeline.StanfordCoreNLP;
+import edu.stanford.nlp.sentiment.SentimentCoreAnnotations.SentimentAnnotatedTree;
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.util.CoreMap;
+
+public class ExpressionSentimentAnalyzer {
+  // returns the predicted sentiment class (0..4) of the longest sentence in the line;
+  // note that a new pipeline is built on every call
+  float findSentiment(String line) {
+    Properties props = new Properties();
+    props.setProperty("annotators", "tokenize, ssplit, parse, sentiment");
+    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
+    int mainSentiment = 0;
+    if (line != null && line.length() > 0) {
+      int longest = 0;
+      Annotation annotation = pipeline.process(line);
+      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
+        Tree tree = sentence.get(SentimentAnnotatedTree.class);
+        int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
+        String partText = sentence.toString();
+        if (partText.length() > longest) {
+          mainSentiment = sentiment;
+          longest = partText.length();
+        }
+      }
+    }
+    return mainSentiment;
+  }
+
+  public static void main(String[] args) {
+    float sent = new ExpressionSentimentAnalyzer().findSentiment("poor presidential nominee Hillary Clinton visited Mexico");
+    System.out.println(sent);
+  }
+}
\ No newline at end of file
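findSentiment() above returns the raw class index predicted by the RNN. Assuming the standard five-class scale of the Stanford sentiment model (0 = very negative through 4 = very positive), a small hypothetical helper can turn that index into a readable label:

// Hypothetical helper, not part of this patch.
public class SentimentLabels {
  // standard five-class scale of the Stanford sentiment model
  static final String[] LABELS =
      { "Very negative", "Negative", "Neutral", "Positive", "Very positive" };

  public static String label(int predictedClass) {
    return (predictedClass >= 0 && predictedClass < LABELS.length)
        ? LABELS[predictedClass] : "Unknown";
  }

  public static void main(String[] args) {
    System.out.println(label(1)); // prints "Negative"
  }
}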
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/LinguisticPhraseManager.java
----------------------------------------------------------------------
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/LinguisticPhraseManager.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/LinguisticPhraseManager.java
new file mode 100644
index 0000000..0f53ec5
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/LinguisticPhraseManager.java
@@ -0,0 +1,591 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.opinion_processor;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.commons.lang3.StringUtils;
+
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+import opennlp.tools.similarity.apps.BingQueryRunner;
+import opennlp.tools.similarity.apps.HitBase;
+import opennlp.tools.similarity.apps.utils.ValueSortMap;
+import opennlp.tools.stemmer.PStemmer;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.TextProcessor;
+
+public class LinguisticPhraseManager {
+  private Map<String, Integer> freq = new ConcurrentHashMap<String, Integer>();
+
+  // this static object is initialized here so that the path to resources is set up front
+  private static StopList stop = StopList.getInstance(new File(".").getAbsolutePath().replace(".", "") + "src/test/resources/");
+
+  // this list will be overwritten by the external synonyms.csv
+  private static String[][] synonymPairs = new String[][] {};
+  private PStemmer stemmer = new PStemmer();
+
+  private List<ParseTreeChunk> lingPhrases = new ArrayList<ParseTreeChunk>();
+  private List<String> standardizedTopics = new ArrayList<String>();
+  // maps each ling phrase to the list of ling phrases with the same head noun (its group)
+  private Map<ParseTreeChunk, List<ParseTreeChunk>> entry_group = new ConcurrentHashMap<ParseTreeChunk, List<ParseTreeChunk>>();
+
+  // maps each standardized string phrase to the list of ling phrases with the same head noun
+  private Map<String, List<ParseTreeChunk>> std_group = new ConcurrentHashMap<String, List<ParseTreeChunk>>();
+
+  private BingQueryRunner runner = new BingQueryRunner();
+  private static final int MIN_NUMBER_OF_PHRASES_TO_CONSIDER = 3; //2; 5
+  private static final int MIN_LENGTH_OF_WORD_TO_CONSIDER = 3;
+
+  private String resourceDir;
+
+  public LinguisticPhraseManager() {
+    try {
+      resourceDir = new File(".").getCanonicalPath() + "/src/main/resources/";
+      List<String[]> vocabs = ProfileReaderWriter.readProfiles(resourceDir + "/synonyms.csv");
+      synonymPairs = new String[vocabs.size()][2];
+      int count = 0;
+      for (String[] line : vocabs) {
+        synonymPairs[count] = line;
+        count++;
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+  // takes a logged chain of parse tree nodes and builds a ParseTreeChunk instance;
+  // the phrase should be VP or NP, otherwise parsing is expected to fail
+  private ParseTreeChunk parseLingPhraseIntoParseTreeChunk(String phrStr) {
+    ParseTreeChunk ch = new ParseTreeChunk();
+    List<String> POSs = new ArrayList<String>(), lemmas = new ArrayList<String>();
+
+    String[] parts = phrStr.replace("]", "").split(", <");
+
+    ch.setMainPOS(StringUtils.substringBetween(phrStr, ">", "'"));
+    try {
+      for (String part : parts) {
+        String lemma = StringUtils.substringBetween(part, "P'", "':").toLowerCase();
+        String pos = part.substring(part.indexOf(":") + 1, part.length());
+
+        if (pos == null || lemma == null) {
+          continue;
+        }
+        POSs.add(pos.trim());
+        lemmas.add(lemma.trim());
+        // (re)attach the lists each iteration so a partially parsed phrase
+        // still carries whatever was read before a failure
+        ch.setPOSs(POSs);
+        ch.setLemmas(lemmas);
+      }
+    } catch (Exception e) {
+      // we expect exceptions when an extracted phrase is NEITHER NP nor VP;
+      // the resulting (empty) chunk will not create a new topic
+      e.printStackTrace();
+    }
+
+    return ch;
+  }
+
+  // constructor taking an array of extraction files, optimized for performance;
+  // only topics occurring at least MIN_NUMBER_OF_PHRASES_TO_CONSIDER times are kept
+  public LinguisticPhraseManager(String[] loadPaths) {
+    List<String[]> columns = new ArrayList<String[]>();
+    for (String file : loadPaths) {
+      columns.addAll(ProfileReaderWriter.readProfiles(file));
+    }
+
+    for (String[] l : columns) {
+      if (l.length < 3 || l[1] == null || l[2] == null)
+        continue;
+      String word = l[1].toLowerCase().trim();
+      if (word.indexOf("=>") > -1)
+        continue;
+
+      word = isAcceptableStringPhrase(word);
+      if (word == null)
+        continue;
+
+      if (!freq.containsKey(word)) {
+        freq.put(word, 1);
+      } else {
+        freq.put(word, freq.get(word) + 1);
+        // once the count for a topic reaches the threshold, create it
+        if (freq.get(word) == MIN_NUMBER_OF_PHRASES_TO_CONSIDER) {
+          ParseTreeChunk ch = parseLingPhraseIntoParseTreeChunk(l[2]);
+          ch = isAcceptableLingPhrase(ch);
+          if (ch == null)
+            continue;
+          lingPhrases.add(ch);
+        }
+      }
+    }
+    // we don't need the frequency data any more
+    freq.clear();
+  }
+
+  // default constructor with a single topic extraction file, not optimized for performance
+  public LinguisticPhraseManager(String loadPath) {
+    List<String[]> columns = ProfileReaderWriter.readProfiles(loadPath);
+    for (String[] l : columns) {
+      if (l.length < 3 || l[1] == null || l[2] == null)
+        continue;
+      String word = l[1].toLowerCase().trim();
+      if (word.indexOf("=>") > -1)
+        continue;
+
+      word = isAcceptableStringPhrase(word);
+      if (word == null)
+        continue;
+
+      if (!freq.containsKey(word)) {
+        ParseTreeChunk ch = parseLingPhraseIntoParseTreeChunk(l[2]);
+        ch = isAcceptableLingPhrase(ch);
+        if (ch == null)
+          continue;
+        freq.put(word, 1);
+        lingPhrases.add(ch);
+      } else {
+        freq.put(word, freq.get(word) + 1);
+      }
+    }
+    freq = ValueSortMap.sortMapByValue(freq, false);
+  }
+
+  // removes prepositions and articles in case this has not happened at the phrase-forming stage
+  private String isAcceptableStringPhrase(String word) {
+    if (word.startsWith("to "))
+      return null;
+    if (word.startsWith("a "))
+      return word.substring(2, word.length());
+
+    if (word.endsWith(" !") || word.endsWith(" ."))
+      return word.substring(0, word.length() - 2).trim();
+
+    return word;
+  }
+
+  // we only accept NP phrases
+  private ParseTreeChunk isAcceptableLingPhrase(ParseTreeChunk ch) {
+    if (!ch.getMainPOS().equals("NP"))
+      return null;
+
+    return ch;
+  }
+
+  // groups are sets of phrases with the same head noun;
+  // put all phrases in a group and map each phrase to its group (the list of members)
+  public void doLingGrouping() {
+    for (int i = 0; i < lingPhrases.size(); i++) {
+      for (int j = i + 1; j < lingPhrases.size(); j++) {
+        ParseTreeChunk chI = lingPhrases.get(i);
+        ParseTreeChunk chJ = lingPhrases.get(j);
+        if (chI.getLemmas().get(chI.getLemmas().size() - 1).equals(chJ.getLemmas().get(chJ.getLemmas().size() - 1))
+            && chI.getPOSs().get(chI.getLemmas().size() - 1).startsWith("NN")) {
+          List<ParseTreeChunk> values = null;
+          if (chI.getLemmas().size() < chJ.getLemmas().size()) {
+            // fetch the existing group for chJ so earlier members are kept
+            values = entry_group.get(chJ);
+            if (values == null)
+              values = new ArrayList<ParseTreeChunk>();
+            values.add(chI);
+            entry_group.put(chJ, values);
+          } else {
+            values = entry_group.get(chI);
+            if (values == null)
+              values = new ArrayList<ParseTreeChunk>();
+            values.add(chJ);
+            entry_group.put(chI, values);
+          }
+        }
+      }
+    }
+  }
+
+  public List<String> formStandardizedTopic() {
+    Set<ParseTreeChunk> keys = entry_group.keySet();
+    for (ParseTreeChunk k : keys) {
+      List<ParseTreeChunk> lingPhrases = entry_group.get(k);
+      for (int i = 0; i < lingPhrases.size(); i++)
+        for (int j = i + 1; j < lingPhrases.size(); j++) {
+          ParseTreeChunk chI = lingPhrases.get(i);
+          ParseTreeChunk chJ = lingPhrases.get(j);
+          List<String> lemmas = new ArrayList<String>(chI.getLemmas());
+          lemmas.retainAll(chJ.getLemmas());
+          if (lemmas.size() < 2)
+            continue;
+          String buf = "";
+          List<String> candTopicLst = new ArrayList<String>();
+          for (String w : lemmas) {
+            if (w.length() < MIN_LENGTH_OF_WORD_TO_CONSIDER)
+              continue;
+            if (!StringUtils.isAlpha(w))
+              continue;
+            // find the POS of w
+            boolean bAccept = false;
+            for (int iw = 0; iw < chI.getLemmas().size(); iw++) {
+              if (w.equals(chI.getLemmas().get(iw))) {
+                if (chI.getPOSs().get(iw).startsWith("NN") || chI.getPOSs().get(iw).startsWith("JJ")
+                    || chI.getPOSs().get(iw).startsWith("VB"))
+                  bAccept = true;
+              }
+            }
+            if (bAccept) {
+              String ws = substituteSynonym(w);
+              candTopicLst.add(ws);
+            }
+          }
+          // remove duplicates like 'new new house'
+          //candTopicLst = new ArrayList<String>(new HashSet<String>(candTopicLst));
+          for (String w : candTopicLst) {
+            buf += w + " ";
+          }
+
+          buf = buf.trim();
+          if (buf.indexOf(' ') < 0)
+            continue;
+
+          if (!standardizedTopics.contains(buf)) {
+            standardizedTopics.add(buf);
+            std_group.put(buf, lingPhrases);
+          }
+        }
+    }
+    cleanUpStandardizedTopics();
+
+    return standardizedTopics;
+  }
+
+  public void cleanUpStandardizedTopics() {
+    List<String> toDelete = new ArrayList<String>();
+    for (int i = 0; i < standardizedTopics.size(); i++)
+      for (int j = i + 1; j < standardizedTopics.size(); j++) {
+        List<String> t1 = TextProcessor.fastTokenize(standardizedTopics.get(i), false);
+        List<String> t2 = TextProcessor.fastTokenize(standardizedTopics.get(j), false);
+        for (int k = 0; k < t1.size(); k++) {
+          t1.set(k, stemmer.stem(t1.get(k)));
+        }
+        for (int k = 0; k < t2.size(); k++) {
+          t2.set(k, stemmer.stem(t2.get(k)));
+        }
+        // check if the stemmed token lists are equal
+        if (t1.size() != t2.size())
+          continue;
+        // once all keywords of the two phrases are stemmed, one phrase may annihilate the other
+        t1.removeAll(t2);
+        if (t1.isEmpty()) {
+          if (standardizedTopics.get(i).length() > standardizedTopics.get(j).length()) {
+            toDelete.add(standardizedTopics.get(i));
+            // TODO update std_group entry
+            System.out.println("Removing '" + standardizedTopics.get(i) + "' because of '"
+                + standardizedTopics.get(j) + "'");
+            List<ParseTreeChunk> stJ = std_group.get(standardizedTopics.get(j));
+            stJ.addAll(std_group.get(standardizedTopics.get(i)));
+            stJ = new ArrayList<ParseTreeChunk>(new HashSet<ParseTreeChunk>(stJ));
+            std_group.put(standardizedTopics.get(j), stJ);
+          } else {
+            toDelete.add(standardizedTopics.get(j));
+            System.out.println("Removing '" + standardizedTopics.get(j) + "' because of '"
+                + standardizedTopics.get(i) + "'");
+            List<ParseTreeChunk> stI = std_group.get(standardizedTopics.get(i));
+            stI.addAll(std_group.get(standardizedTopics.get(j)));
+            stI = new ArrayList<ParseTreeChunk>(new HashSet<ParseTreeChunk>(stI));
+            std_group.put(standardizedTopics.get(i), stI);
+          }
+        }
+      }
+    for (String d : toDelete) {
+      //System.out.println("Removed '" + d + "'");
+      standardizedTopics.remove(d);
+    }
+  }
+
+  // substitute synonyms according to the internal vocabulary
+  private String substituteSynonym(String w) {
+    try {
+      for (String[] pair : synonymPairs) {
+        if (w.equals(pair[0]))
+          return pair[1];
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+    return w;
+  }
+
+  public void generateGroupingReport(String reportName) {
+    List<String[]> report = new ArrayList<String[]>();
+    Set<ParseTreeChunk> chs = entry_group.keySet();
+    report.add(new String[] { "string phrase", "class", "linguistic phrase", "list of ling phrases class representatives" });
+
+    for (ParseTreeChunk ch : chs) {
+      String head = ch.getLemmas().get(ch.getLemmas().size() - 1);
+      List<ParseTreeChunk> values = entry_group.get(ch);
+      if (values.size() < 6)
+        head = "";
+      report.add(new String[] { ch.toWordOnlyString(), head, ch.toString(), values.toString() });
+    }
+    ProfileReaderWriter.writeReport(report, reportName);
+  }
+
+  // final merge of <floor - floors - flooring> as head noun, with phrase update
+  public void applyLastRoundOfAggregation() {
+    /*
+    List<ParseTreeChunk> entries = new ArrayList<ParseTreeChunk>(entry_group.keySet());
+    for(int i=0; i< entries.size(); i++){
+      for(int j=i+1; j< entries.size(); j++){
+        ParseTreeChunk chI = entries.get(i);
+        ParseTreeChunk chJ = entries.get(j);
+        String headI = getLastElement(chI.getLemmas());
+        String headJ = getLastElement(chJ.getLemmas());
+        if (headI==null || headI.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER ||
+            headJ==null || headJ.length()<MIN_LENGTH_OF_WORD_TO_CONSIDER )
+          continue;
+
+        if (headI.indexOf(headJ)>-1){
+          //leave headJ
+          List<ParseTreeChunk> valuesToAddTo = entry_group.get(chJ);
+          List<ParseTreeChunk> valuesBeingAdded = entry_group.get(chI);
+          if (valuesToAddTo==null || valuesBeingAdded == null)
+            continue;
+          valuesToAddTo.addAll(valuesBeingAdded);
+          entry_group.put(chJ, valuesToAddTo);
+          entry_group.remove(chI);
+          System.out.println("Deleting entry '"+ headI +"' and moving group to entry '"+ headJ +"'");
+        } else if (headJ.indexOf(headI)>-1){
+          //leave headI
+          List<ParseTreeChunk> valuesToAddTo = entry_group.get(chI);
+          List<ParseTreeChunk> valuesBeingAdded = entry_group.get(chJ);
+          if (valuesToAddTo==null || valuesBeingAdded == null)
+            continue;
+          valuesToAddTo.addAll(valuesBeingAdded);
+          entry_group.put(chI, valuesToAddTo);
+          entry_group.remove(chJ);
+          System.out.println("Deleting entry '"+ headJ +"' and moving group to entry '"+ headI +"'");
+        }
+      }
+    }
+    */
+    for (int i = 0; i < standardizedTopics.size(); i++)
+      for (int j = i + 1; j < standardizedTopics.size(); j++) {
+        String headI = extractHeadNounFromPhrase(standardizedTopics.get(i));
+        String headJ = extractHeadNounFromPhrase(standardizedTopics.get(j));
+        // if the heads are the same word, do nothing
+ if (headI.equals(headJ)) + continue; + + //only if one is sub-word of another + if (headI.indexOf(headJ)>-1){ + + if (!properSubWordForm(headI, headJ)) + continue; + //entry 'I' will be updated + String newKey = standardizedTopics.get(i).replace(headI, headJ); + + List<ParseTreeChunk> stI = std_group.get(standardizedTopics.get(i)); + List<ParseTreeChunk> stInew = std_group.get(newKey); + //if (stInew!=null && !stInew.isEmpty()) + // stI.addAll(stInew); + if(stI==null) + continue; + std_group.put(newKey, stI); + std_group.remove(standardizedTopics.get(i)); + System.out.println("Deleted entry for key '"+ standardizedTopics.get(i) +"' and created '"+ newKey +"'"); + standardizedTopics.set(i, newKey); + + } else if (headJ.indexOf(headI)>-1){ + if (!properSubWordForm(headJ, headI)) + continue; + //entry 'J' will be updated + String newKey = standardizedTopics.get(j).replace(headJ, headI); + + List<ParseTreeChunk> stJ = std_group.get(standardizedTopics.get(j)); + List<ParseTreeChunk> stJnew = std_group.get(newKey); + //if (stJnew!=null && !stJnew.isEmpty()) + // stJ.addAll(stJnew); + if(stJ==null) + continue; + std_group.put(newKey, stJ); + std_group.remove(standardizedTopics.get(j)); + System.out.println("Deleted entry for key '"+ standardizedTopics.get(j) +"' and created '"+ newKey +"'"); + standardizedTopics.set(j, newKey); + } + } + + + + } + + private boolean properSubWordForm(String headI, String headJ) { + String suffix = headI.replace(headJ, ""); + if (suffix.equals("s") || suffix.equals("ing") //|| suffix.equals("er") + || suffix.equals("rooms") || + suffix.equals("") || suffix.equals("counter") || + suffix.equals("room") || suffix.equals("back")) + return true; + + //System.out.println("Wrong word '"+ headI + "'reduction into '" + headJ +"'"); + return false; + } + + //generates report + public void generateStdTopicReport(String reportName){ + List<String[]> report = new ArrayList<String[]>(); + report.add(new String[]{"category", "topic", "sub-topics", "phrase instances" }); + + for(String t: standardizedTopics){ + + String bufCover = ""; + int count = 0; + List<ParseTreeChunk> ptcList = std_group.get(t); + if (ptcList == null) + continue; + for(ParseTreeChunk ch: ptcList){ + List<String> candidate = TextProcessor.fastTokenize(ch.toWordOnlyString(), false); + List<String> tList = TextProcessor.fastTokenize(t, false); + List<String> tListChk = new ArrayList<String>(tList); + + tListChk.removeAll(candidate); + // fully covered by phrase instance + if (!tListChk.isEmpty() || ch.toWordOnlyString().equals(t)){ + continue; + } + + boolean bCovered = true; + + for(String ts: tList){ + boolean bCandWordsIsCovered = false; + for(String s: candidate){ + if ((s.indexOf(ts)>-1) )// && properSubWordForm(s, ts)) + bCandWordsIsCovered = true; + } + if (!bCandWordsIsCovered){ + bCovered = false; + break; + } + } + if (!bCovered) + continue; + bufCover+=ch.toWordOnlyString()+ " # "; + count++; + if (count > 40) + break; + + } + if (bufCover.endsWith(" # ")) + bufCover = bufCover.substring(0, bufCover.length()-3).trim(); + + String buf = ""; + count = 0; + // only up to 40 instances of phrases per 1-st level topic + for(ParseTreeChunk ch: ptcList){ + buf+=ch.toWordOnlyString()+ "|"; + count++; + if (count > 40) + break; + } + + //TODO uncomment + //t = spell.getSpellCheckResult(t); + report.add(new String[]{extractHeadNounFromPhrase(t), t, bufCover, buf //, std_group.get(t).toString() + }); + } + + + ProfileReaderWriter.writeReport(report, reportName); + } + // get a last word from a phrase (supposed 
to be a head noun) + private String extractHeadNounFromPhrase(String topic){ + String[] tops = topic.split(" "); + int len = tops.length; + if (len>1){ + return tops[len-1]; + } + else return topic; + } + + // get last elem of a list + private String getLastElement(List<String> arrayList ){ + if (arrayList != null && !arrayList.isEmpty()) { + return arrayList.get(arrayList.size()-1); + } + return null; + } + /* + * Using Bing API to check if an extracted phrase can be found on the web, therefore is a meaningful phrase + */ + public List<String> verifyTopic(){ + Set<String> phrases = freq.keySet(); + List<String> approvedPhrases = new ArrayList<String>(); + for(String p: phrases){ + List<HitBase> hits = runner.runSearch("\""+p+"\""); + for(HitBase h: hits){ + String lookup = h.getTitle() + " " + h.getAbstractText(); + if (lookup.indexOf(p)>-1){ + approvedPhrases.add(p); + break; + } + } + } + return approvedPhrases; + } + + public Set<String> getPhraseLookup(){ + return freq.keySet(); + } + + // using phrase frequency to filter phrases + public boolean isAcceptablePhrase(String phrase){ + Integer count = freq.get(phrase.toLowerCase().trim()); + if (count==null) + return false; + + if (count>0 && count < 10000) + return true; + return false; + } + + public static void main(String[] args){ + LinguisticPhraseManager man = new LinguisticPhraseManager( + "/Users/bgalitsky/Documents/workspace/move_com/phrasesOfInterest.csv"); + man.doLingGrouping(); + man.generateGroupingReport("topics_groups7_mergedHeads.csv"); + List<String> stdTopics = man.formStandardizedTopic(); + man.applyLastRoundOfAggregation(); + man.generateStdTopicReport("std_topics7_mergedHeads.csv"); + System.out.println(stdTopics); + + } +}
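doLingGrouping() above compares every pair of phrases, which is quadratic in the number of phrases. The same head-noun bucketing can be done in a single pass with a map keyed by the last lemma. The sketch below is a simplified illustration (it keeps every noun-headed phrase rather than reproducing the size-based entry selection of the original), and the class HeadNounGrouper is hypothetical:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import opennlp.tools.textsimilarity.ParseTreeChunk;

// Hypothetical single-pass variant: bucket phrases by their last lemma (the head noun).
public class HeadNounGrouper {
  public static Map<String, List<ParseTreeChunk>> groupByHeadNoun(List<ParseTreeChunk> phrases) {
    Map<String, List<ParseTreeChunk>> groups = new HashMap<String, List<ParseTreeChunk>>();
    for (ParseTreeChunk ch : phrases) {
      List<String> lemmas = ch.getLemmas();
      int last = lemmas.size() - 1;
      // same acceptance test as doLingGrouping(): the last token must be a noun
      if (last < 0 || !ch.getPOSs().get(last).startsWith("NN"))
        continue;
      String head = lemmas.get(last);
      List<ParseTreeChunk> group = groups.get(head);
      if (group == null) {
        group = new ArrayList<ParseTreeChunk>();
        groups.put(head, group);
      }
      group.add(ch);
    }
    return groups;
  }
}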

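cleanUpStandardizedTopics() above treats two topics as duplicates when their stemmed token sets coincide. That test can be isolated in a small helper, sketched below with the PStemmer and TextProcessor utilities this patch already uses; the class TopicDuplicateCheck is hypothetical. For example, isDuplicate("kitchen floors", "kitchen floor") would return true under Porter-style stemming.

import java.util.HashSet;
import java.util.List;
import java.util.Set;

import opennlp.tools.stemmer.PStemmer;
import opennlp.tools.textsimilarity.TextProcessor;

// Hypothetical helper isolating the duplicate test used by cleanUpStandardizedTopics():
// two topics are duplicates when their stemmed token sets coincide.
public class TopicDuplicateCheck {
  private final PStemmer stemmer = new PStemmer();

  public boolean isDuplicate(String topic1, String topic2) {
    return stemSet(topic1).equals(stemSet(topic2));
  }

  private Set<String> stemSet(String topic) {
    Set<String> stems = new HashSet<String>();
    for (String token : TextProcessor.fastTokenize(topic, false)) {
      stems.add(stemmer.stem(token));
    }
    return stems;
  }
}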