http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GoogleAutoCompleteQueryRunner.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GoogleAutoCompleteQueryRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GoogleAutoCompleteQueryRunner.java new file mode 100644 index 0000000..8cea2d8 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GoogleAutoCompleteQueryRunner.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package opennlp.tools.similarity.apps;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.lang.StringUtils;

import opennlp.tools.similarity.apps.utils.PageFetcher;

/**
 * Queries the legacy Google auto-complete ("toolbar") endpoint and extracts
 * the suggested completion words for a raw, possibly camel-cased, expression.
 */
public class GoogleAutoCompleteQueryRunner {

  protected PageFetcher pageFetcher = new PageFetcher();

  // Pieces of the toolbar-format auto-complete URL.
  private static final String SEARCH_REQUEST = "http://google.com/complete/search?q=";
  private static final String SUFFIX = "&output=toolbar";

  /**
   * Fetches auto-complete suggestions for the given expression.
   * <p>
   * Camel-cased input is first split into words ("breadCrumbs" -&gt; "bread Crumbs").
   * When at least two suggestions come back, the word-level intersection of the
   * first two suggestions is returned as a crude noise filter; otherwise the
   * first suggestion's words are returned.
   *
   * @param rawExpr raw expression, possibly camel-cased
   * @return list of suggestion words, or {@code null} when the service returned
   *         no usable suggestions (null kept for backward compatibility)
   */
  public List<String> getAutoCompleteExpression(String rawExpr) {
    // Insert spaces into camel-cased input so the query reads as words.
    rawExpr = rawExpr.replaceAll("([a-z][a-z])([A-Z][a-z])", "$1 $2");
    String query = rawExpr;
    try {
      // URLEncoder already encodes spaces as '+'. The previous code substituted
      // '+' for spaces BEFORE encoding, which double-encoded them as "%2B".
      query = URLEncoder.encode(rawExpr, "UTF-8");
    } catch (UnsupportedEncodingException e) {
      // UTF-8 is mandated by the JVM spec, so this cannot happen in practice;
      // fall back to the unencoded query rather than aborting.
    }

    String pageOrigHTML = pageFetcher.fetchOrigHTML(SEARCH_REQUEST + query + SUFFIX);
    String[] results = StringUtils.substringsBetween(pageOrigHTML,
        "<CompleteSuggestion>", "</CompleteSuggestion>");
    if (results == null)
      return null;

    List<List<String>> accum = new ArrayList<List<String>>();
    for (String wrapped : results) {
      // The suggestion text is the attribute value between the first pair of
      // double quotes inside the <CompleteSuggestion> element.
      String quoted = StringUtils.substringBetween(wrapped, "\"");
      if (quoted == null)
        continue; // malformed entry; the old code masked the NPE with a bare catch
      String[] words = quoted.split(" ");
      if (words.length < 1)
        continue;
      accum.add(Arrays.asList(words));
    }

    // TODO make a more noise-resistant algorithm
    if (accum.size() > 1) {
      List<String> first = new ArrayList<String>(accum.get(0));
      first.retainAll(accum.get(1));
      // Non-empty intersection of the top two suggestions wins; otherwise
      // fall back to the top suggestion as-is.
      return first.size() > 0 ? first : accum.get(0);
    }
    if (accum.size() == 1)
      return accum.get(0);
    return null;
  }

  /**
   * Splits camel-cased identifiers into space-separated words, e.g.
   * "SharingInviteNotification" -&gt; "Sharing Invite Notification".
   * Text that already contains spaces passes through unchanged.
   */
  public static String handleCamelCases(String input) {
    // Applied twice so overlapping lower/upper boundaries are all split.
    String s = input.replaceAll("([a-z,A-Z][a-z])([A-Z][a-z])", "$1 $2")
        .replaceAll("([a-z,A-Z][a-z])([A-Z][a-z])", "$1 $2");
    // Split an upper-case run from a following capitalized word (WSItem -> WS Item).
    s = s.replaceAll("([A-Z])([A-Z][a-z])", "$1 $2");
    return s;
  }

  /** Demo entry point: prints camel-case splitting and live auto-complete results. */
  public static void main(String[] args) {
    String[] tests = new String[] { "SharingInviteNotification", "SharedByMeSortingOptions",
        "SharedByMeCurrentSortingOption", "GroupedPrivatelySharedByMe",
        "StorageMeter", "RecentActivities", "StorageMeter", "SharingInviteNotification",
        "RecentActivities", "ImporterSuggestionsPrefABC",
        "WSItem",
        "SharingInviteNotification",
        "UserDesktopDevices",
        "RootFoldersPaginated",
        "SharingInviteNotification", "apply security settings" };
    for (String s : tests) {
      System.out.println(handleCamelCases(s));
    }

    GoogleAutoCompleteQueryRunner runner = new GoogleAutoCompleteQueryRunner();
    List<String> res = runner.getAutoCompleteExpression("commentcount");
    System.out.println(res);
    res = runner.getAutoCompleteExpression("clearSess");
    System.out.println(res);
    res = runner.getAutoCompleteExpression("ImporterSuggestionsPref");
    System.out.println(res);
    res = runner.getAutoCompleteExpression("breadCrumbs");
    System.out.println(res);
    res = runner.getAutoCompleteExpression("RootFolder");
    System.out.println(res);
    res = runner.getAutoCompleteExpression("BreadCrumbList");
    System.out.println(res);
  }
}
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SentenceTranslate.java.txt ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SentenceTranslate.java.txt b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SentenceTranslate.java.txt new file mode 100644 index 0000000..414b8d1 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SentenceTranslate.java.txt @@ -0,0 +1,212 @@ +package opennlp.tools.similarity.apps; + + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.logging.Logger; + +import opennlp.tools.parse_thicket.ParseCorefsBuilder; +import opennlp.tools.parse_thicket.ParseThicket; +import opennlp.tools.parse_thicket.apps.SnippetToParagraph; +import opennlp.tools.parse_thicket.matching.Matcher; +import opennlp.tools.similarity.apps.BingQueryRunner; +import opennlp.tools.similarity.apps.Fragment; +import opennlp.tools.similarity.apps.GeneratedSentenceProcessor; +import opennlp.tools.similarity.apps.HitBase; +import opennlp.tools.similarity.apps.RelatedSentenceFinder; +import opennlp.tools.similarity.apps.utils.PageFetcher; +import opennlp.tools.similarity.apps.utils.Utils; +import opennlp.tools.textsimilarity.ParseTreeChunk; +import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; +import opennlp.tools.textsimilarity.SentencePairMatchResult; +import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; + +import com.memetix.mst.language.Language; +import com.memetix.mst.translate.Translate; + +import edu.stanford.nlp.ling.Sentence; +import edu.stanford.nlp.ling.TaggedWord; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import 
edu.stanford.nlp.trees.tregex.TregexPattern; + +/** + * Class for sentence translation with improvement + * based on parse thickets. + * @author Alex Loptev + * + */ +public class SentenceTranslate { + private static String clientID = "ParseThicketsTranslation"; + private static String clientSecret = "M4teDPWKv5xMTOZ/v6nJbwya4ilPE0cUCK4cCPGeRok="; + + private static Logger LOG; + private static ParseTreeChunkListScorer parseTreeChunkListScorer; + private static Matcher matcher; + private static BingQueryRunner searchRunner; + private static SnippetToParagraph sentenceRetriever; + private static final int CONSIDERABLE_SEARCH_RESULTS_COUNT = 1; + private static final double MEANINGLESS_THRESHOLD = 3.0; + private static final int MINIMUM_WORDS_IN_PHRASE_FOR_TESTING = 5; + + /** + * Substitutes translation fragments by text fragments found on the Web with enough + * similarity score with original translation. + * @param translatedSentence sentence translated by some translator (e.g. Microsoft Translator) + * @return improvedTranslation + */ + public static String improveSentenceTranslationBySimilartyAssessment(String translatedSentence) { + List<Tree> phraseNodesForTesting = formPhrasesForMeaningfulnessTesting(translatedSentence); + for (Tree phraseNode: phraseNodesForTesting) { + String phrase = Sentence.listToString(phraseNode.yield()); + String quotedPhrase = "\"" + phrase + "\""; + HitBase mostSimilarResult = null; + double mostSimilarScore = 0.0; + boolean meaningfull = false; + String[] phrases = {quotedPhrase, phrase}; + for (String p: Arrays.asList(phrases)) { + LOG.info(String.format("Meaningfulness testing for phrase: %s", p)); + List<HitBase> searchResults = searchRunner.runSearch(p, CONSIDERABLE_SEARCH_RESULTS_COUNT); + for (HitBase searchResult: searchResults) { + double score = assessSimilarityWithHitBase(phrase, searchResult); + if (score > MEANINGLESS_THRESHOLD ) { + meaningfull = true; + LOG.info(String.format("Phrase %s is meaningful. 
Score is %f", phrase, score)); + break; + } + if (mostSimilarScore < score) { + mostSimilarResult = searchResult; + mostSimilarScore = score; + } + } + if (meaningfull) + break; + } + if (!meaningfull) { + LOG.info(String.format("Phrase %s is meaningless. Maximal score is %f", phrase, mostSimilarScore)); + // TODO: replacing meaningless phrase + } + } + return ""; + } + + public static double assessSimilarityScore(String s1, String s2) { + LOG.info(String.format("Assess similarity between: \"%s\" and \"%s\"", s1, s2)); + List<List<ParseTreeChunk>> match = matcher.assessRelevance(s1, s2); + double sim = parseTreeChunkListScorer.getParseTreeChunkListScore(match); + LOG.info(String.format("Score: %f", sim)); + return sim; + } + + /** + * Assesses similarity score for phrase and search result's: + * title, snippet and appropriate document sentence. + * @param sentence + * @param searchResult + * @return similarity score + */ + private static double assessSimilarityWithHitBase(String phrase, HitBase searchResult) { + String title = searchResult.getTitle().replace("<b>", " ").replace("</b>", " ").replace(" ", " ").replace(" ", " "); + String snippet = searchResult.getAbstractText().replace("<b>", " ").replace("</b>", " ").replace(" ", " ").replace(" ", " "); + double score = Math.max(assessSimilarityScore(phrase, title), assessSimilarityScore(phrase, snippet)); + searchResult = sentenceRetriever.formTextFromOriginalPageGivenSnippet(searchResult); + List<String> sentences = searchResult.getOriginalSentences(); + for (String sentence: sentences) { + score = Math.max(score, assessSimilarityScore(phrase, sentence)); + } + return score; + } + + /** + * Creates list of phrases (L_op) from translated sentence for meaningfulness testing. + * Such list includes all the phrases which contain at least two sub-phrases. 
+ * @param sentence + * @return list of phrases containing at least two sub-phrases + */ + private static List<Tree> formPhrasesForMeaningfulnessTesting(String sentence) { + List<Tree> results = new LinkedList<Tree>(); + ParseCorefsBuilder ptBuilder = ParseCorefsBuilder.getInstance(); + ParseThicket pt = ptBuilder.buildParseThicket(sentence); + Tree t = pt.getSentences().get(0); + // tregex pattern for all nodes with at least two phrasal children + TregexPattern pattern = TregexPattern.compile("__ < (__ [ !<: __ | < (__ < __) ] $ (__ !<: __ | < (__ < __)))"); + TregexMatcher matcher = pattern.matcher(t); + while (matcher.findNextMatchingNode()) { + Tree candidate = matcher.getMatch(); + int wordsCount = 0; + // test if phrase is too short + for (TaggedWord leaf: candidate.taggedYield()) { + // if is not punctuation + if (Character.isLetter(leaf.tag().charAt(0))) { + wordsCount++; + } + } + if (wordsCount >= MINIMUM_WORDS_IN_PHRASE_FOR_TESTING) { + results.add(candidate); + } + } + // reversing phrases because the highest nodes in tree should + // be tested for meaningfulness after the lowest nodes + Collections.reverse(results); + return results; + } + + + /** + * Execute simple sentence translation by Microsoft Translation API. + * + * @param sentence sentence for translation + * @param fromLanguage sentence native language + * @param fromLanguage sentence destination language + * @return translated sentence + * @throws Exception + */ + public static String executeByMicrosoftTranslator(String text, Language fromLanguage, Language toLanguage) throws Exception { + String result = Translate.execute(text, fromLanguage, toLanguage); + LOG.info(text + " -> " + result); + return result; + } + + /** + * Execute simple sentence translation to English by Microsoft Translation API + * with sentence native language auto detection. 
+ * + * @param sentence sentence for translation + * @return translated sentence + * @throws Exception + */ + public static String executeByMicrosoftTranslator(String text) throws Exception { + return executeByMicrosoftTranslator(text, Language.AUTO_DETECT, Language.ENGLISH); + } + + public static void setMicrosoftTranslatorClientId(String clientId) { + Translate.setClientId(clientId); + } + + public static void setMicrosoftTranslatorClientSecret(String clientSecret) { + Translate.setClientSecret(clientSecret); + } + + /** + * Static initialization block. + */ + static { + Translate.setClientId(clientID); + Translate.setClientSecret(clientSecret); + searchRunner = new BingQueryRunner(); + sentenceRetriever = new SnippetToParagraph(); + matcher = new Matcher(); + parseTreeChunkListScorer = new ParseTreeChunkListScorer(); + LOG = Logger.getLogger("opennlp.tools.parse_thicket.translation.SentenceTranslate"); + } + + /** + * Dummy method for testing purposes. + * @param args + */ + public static void main(String[] args) throws Exception { + SentenceTranslate.improveSentenceTranslationBySimilartyAssessment(SentenceTranslate.executeByMicrosoftTranslator("� �����������, 8 �������, ������ ����� ����� �������� � ���������� ����� ���� ���������������� �������, ������� � ���� ���� ����� �������.")); + }} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/Comment.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/Comment.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/Comment.java new file mode 100644 index 0000000..91ba23f --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/Comment.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
package opennlp.tools.similarity.apps.solr;

import java.util.Iterator;
import java.util.List;

import org.docx4j.jaxb.Context;
import org.docx4j.openpackaging.exceptions.InvalidFormatException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.JaxbXmlPart;
import org.docx4j.openpackaging.parts.WordprocessingML.CommentsPart;
import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart;
import org.docx4j.relationships.Relationship;
import org.docx4j.relationships.Relationships;
import org.docx4j.wml.Comments;
import org.docx4j.wml.ObjectFactory;

/**
 * Ensures a docx4j {@link WordprocessingMLPackage} has a comments part
 * attached to its main document part.
 */
public class Comment {

  private final WordprocessingMLPackage wordMlPackage;

  /**
   * Wraps the given package and attaches a comments part if none is present.
   *
   * @param wordMLPack package to decorate with a comments part
   */
  public Comment(WordprocessingMLPackage wordMLPack) {
    this.wordMlPackage = wordMLPack;
    setCommentRel();
  }

  /** Adds an (empty) comments part to the main document part when missing. */
  private void setCommentRel() {
    if (!commentRelSet()) {
      try {
        CommentsPart cp = new CommentsPart();
        // A part must have minimal contents: give it an empty <w:comments>
        // root, as CommentsRel.setCommentRel in this package does. The old
        // code created the ObjectFactory but never used it, leaving the part
        // without content.
        cp.setJaxbElement(new ObjectFactory().createComments());
        wordMlPackage.getMainDocumentPart().addTargetPart(cp);
      } catch (InvalidFormatException e) {
        // TODO: propagate instead of swallowing once callers can handle it
        e.printStackTrace();
      }
    }
  }

  /**
   * Reports whether a comments relationship already exists.
   * <p>
   * The real check (scanning the owning relationship part for the comments
   * relationship identifier) is disabled because the RelationshipName util is
   * unavailable, so this conservatively reports {@code false} — matching the
   * original (commented-out) implementation.
   */
  private boolean commentRelSet() {
    JaxbXmlPart<Relationships> jxpRelShips = wordMlPackage
        .getMainDocumentPart().getOwningRelationshipPart();
    List<Relationship> mc = jxpRelShips.getJaxbElement().getRelationship();
    // TODO: scan 'mc' for RelationshipName.commentIdentifier once available.
    return false;
  }

  /** Demo: builds a minimal package with a comments part and saves it. */
  public static void main(String[] args) throws Exception {

    // Create a package
    WordprocessingMLPackage wmlPack = new WordprocessingMLPackage();

    // Create main document part
    MainDocumentPart wordDocumentPart = new MainDocumentPart();

    // Create main document part content
    org.docx4j.wml.ObjectFactory factory = Context.getWmlObjectFactory();
    org.docx4j.wml.Body body = factory.createBody();
    org.docx4j.wml.Document wmlDocumentEl = factory.createDocument();

    wmlDocumentEl.setBody(body);
    wordDocumentPart.setJaxbElement(wmlDocumentEl);
    wmlPack.addTargetPart(wordDocumentPart);

    // Part must have minimal contents: an empty <w:comments> root.
    CommentsPart cp = new CommentsPart();
    Comments comments = factory.createComments();
    cp.setJaxbElement(comments);
    wordDocumentPart.addTargetPart(cp);

    // Now you can add comments to your comments part,
    // and comment refs in your main document part.

    wmlPack.save(new java.io.File(System.getProperty("user.dir") + "/out-m.docx"));
  }
}
package opennlp.tools.similarity.apps.solr;

import java.io.File;
import java.io.IOException;
import java.math.BigInteger;

import javax.xml.bind.JAXBException;

import org.docx4j.XmlUtils;
import org.docx4j.jaxb.Context;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.exceptions.InvalidFormatException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.WordprocessingML.CommentsPart;
import org.docx4j.openpackaging.parts.WordprocessingML.EndnotesPart;
import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart;
import org.docx4j.wml.CTEndnotes;
import org.docx4j.wml.CTFtnEdn;
import org.docx4j.wml.Comments;

/**
 * Helper around a docx4j {@link WordprocessingMLPackage} that guarantees a
 * comments part exists and allows appending new comments to it.
 */
public class CommentsRel {

  private final WordprocessingMLPackage wordMlPackage;
  private final MainDocumentPart mainPart;
  private final org.docx4j.wml.ObjectFactory wmlObjectFactory;
  private final CommentsPart cmPart;

  /**
   * Wraps the given package, creating its comments part when absent.
   *
   * @param wordMLPack package whose comments part is managed
   */
  public CommentsRel(WordprocessingMLPackage wordMLPack) {
    this.wordMlPackage = wordMLPack;
    wmlObjectFactory = new org.docx4j.wml.ObjectFactory();
    setCommentRel();
    cmPart = wordMlPackage.getMainDocumentPart().getCommentsPart();
    mainPart = wordMLPack.getMainDocumentPart();
  }

  /** Adds a comments part with an empty <w:comments> root when missing. */
  private void setCommentRel() {
    if (!commentRelSet()) {
      try {
        CommentsPart cp = new CommentsPart();
        // Part must have minimal contents
        Comments comments = wmlObjectFactory.createComments();
        cp.setJaxbElement(comments);
        wordMlPackage.getMainDocumentPart().addTargetPart(cp);
      } catch (InvalidFormatException e) {
        // TODO: propagate instead of swallowing once callers can handle it
        e.printStackTrace();
      }
    }
  }

  /** Returns true when the main document part already has a comments part. */
  private boolean commentRelSet() {
    return wordMlPackage.getMainDocumentPart().getCommentsPart() != null;
  }

  /**
   * Appends a new comment authored by {@code author}.
   * <p>
   * NOTE(review): the {@code text} parameter is currently unused and no date
   * is set (requires an XMLGregorianCalendar instance) — only the author is
   * recorded, matching the original behavior. TODO: attach text and date.
   *
   * @param author comment author
   * @param text comment body (currently ignored — see note)
   */
  public void addNewComment(String author, String text) {
    org.docx4j.wml.Comments.Comment c = Context.getWmlObjectFactory().createCommentsComment();
    c.setAuthor(author);
    cmPart.getJaxbElement().getComment().add(c);
  }

  /**
   * Demo: creates a fresh package, adds an endnote plus body text referencing
   * it, and saves the result.
   * NOTE(review): input/output paths are hard-coded to a developer machine.
   */
  public static void main(String args[]) throws IOException {
    try {
      WordprocessingMLPackage mlPackage = WordprocessingMLPackage.createPackage();

      // Setup endnotes part
      EndnotesPart ep = new EndnotesPart();
      CTEndnotes endnotes = Context.getWmlObjectFactory().createCTEndnotes();
      ep.setJaxbElement(endnotes);
      mlPackage.getMainDocumentPart().addTargetPart(ep);

      CTFtnEdn endnote = Context.getWmlObjectFactory().createCTFtnEdn();
      endnotes.getEndnote().add(endnote);

      // Id 2 matches the w:id="2" reference in the body text below.
      endnote.setId(BigInteger.valueOf(2));
      String endnoteBody = "<w:p xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\" ><w:pPr><w:pStyle w:val=\"EndnoteText\"/></w:pPr><w:r><w:rPr><w:rStyle w:val=\"EndnoteReference\"/></w:rPr><w:endnoteRef/></w:r><w:r><w:t xml:space=\"preserve\"> An endnote</w:t></w:r></w:p>";
      try {
        endnote.getEGBlockLevelElts().add(XmlUtils.unmarshalString(endnoteBody));
      } catch (JAXBException e) {
        // TODO: fail fast — a malformed endnote leaves the doc inconsistent
        e.printStackTrace();
      }

      // Add the body text referencing it
      String docBody = "<w:p xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\" ><w:r><w:t>the quick brown</w:t></w:r><w:r><w:rPr><w:rStyle w:val=\"EndnoteReference\"/></w:rPr><w:endnoteReference w:id=\"2\"/></w:r></w:p>";
      try {
        mlPackage.getMainDocumentPart().addParagraph(docBody);
      } catch (JAXBException e) {
        // TODO: fail fast instead of saving a partially-built document
        e.printStackTrace();
      }

      mlPackage.save(new File("C:/workspace/TestSolr/mydoc.docx-OUT.docx"));
    } catch (Docx4JException e) {
      // TODO: surface the failure to the caller
      e.printStackTrace();
    }
  }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.similarity.apps.solr; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import opennlp.tools.similarity.apps.HitBaseComparable; +import opennlp.tools.similarity.apps.utils.Pair; +import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; +import opennlp.tools.textsimilarity.SentencePairMatchResult; +import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.CachingWrapperFilter; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryWrapperFilter; +import org.apache.lucene.search.ScoreDoc; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.CommonParams; +import 
org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.params.ShardParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.handler.RequestHandlerBase; +import org.apache.solr.handler.component.ResponseBuilder; +import org.apache.solr.handler.component.SearchComponent; +import org.apache.solr.handler.component.SearchHandler; +import org.apache.solr.handler.component.ShardHandler; +import org.apache.solr.handler.component.ShardRequest; +import org.apache.solr.handler.component.ShardResponse; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.ResultContext; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.search.DocIterator; +import org.apache.solr.search.DocList; +import org.apache.solr.search.DocSlice; +import org.apache.solr.search.QParser; +import org.apache.solr.search.SolrIndexSearcher; +import org.apache.solr.util.RTimer; +import org.apache.solr.util.SolrPluginUtils; + +public class QueryExpansionRequestHandler extends SearchHandler { + + public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp){ + try { + //System.out.println("request before ="+req); + SolrQueryRequest req1 = substituteField(req); + //System.out.println("request after ="+req1); + super.handleRequestBody(req1, rsp); + } catch (Exception e) { + e.printStackTrace(); + } + + } + + public static SolrQueryRequest substituteField(SolrQueryRequest req){ + SolrParams params = req.getParams(); + String query = params.get("q"); + System.out.println("query before ="+query); + query = query.replace(' ', '_'); + System.out.println("query after ="+query); + NamedList values = params.toNamedList(); + values.remove("q"); + values.add("q", query); + params = SolrParams.toSolrParams(values); + req.setParams(params); + return req; + } +} 
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerStanfRequestHandler.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerStanfRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerStanfRequestHandler.java new file mode 100644 index 0000000..477f022 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerStanfRequestHandler.java @@ -0,0 +1,306 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.similarity.apps.solr; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.logging.Logger; + +import opennlp.tools.parse_thicket.apps.SnippetToParagraph; +import opennlp.tools.parse_thicket.matching.Matcher; +import opennlp.tools.similarity.apps.BingQueryRunner; +import opennlp.tools.similarity.apps.HitBase; +import opennlp.tools.similarity.apps.HitBaseComparable; +import opennlp.tools.similarity.apps.utils.Pair; +import opennlp.tools.textsimilarity.ParseTreeChunk; +import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; +import opennlp.tools.textsimilarity.SentencePairMatchResult; +import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.CachingWrapperFilter; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryWrapperFilter; +import org.apache.lucene.search.ScoreDoc; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.params.ShardParams; +import 
org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.handler.component.SearchHandler; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; + + + +public class SearchResultsReRankerStanfRequestHandler extends SearchHandler { + private static Logger LOG = Logger + .getLogger("com.become.search.requestHandlers.SearchResultsReRankerRequestHandler"); + private final static int MAX_SEARCH_RESULTS = 100; + private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer(); + private int MAX_QUERY_LENGTH_NOT_TO_RERANK=3; + private Matcher matcher = new Matcher(); + private BingQueryRunner bingSearcher = new BingQueryRunner(); + private SnippetToParagraph snp = new SnippetToParagraph(); + + + public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp){ + // get query string + String requestExpression = req.getParamString(); + String[] exprParts = requestExpression.split("&"); + for(String part: exprParts){ + if (part.startsWith("q=")) + requestExpression = part; + } + String query = StringUtils.substringAfter(requestExpression, ":"); + LOG.info(requestExpression); + + + SolrParams ps = req.getOriginalParams(); + Iterator<String> iter = ps.getParameterNamesIterator(); + List<String> keys = new ArrayList<String>(); + while(iter.hasNext()){ + keys.add(iter.next()); + } + + List<HitBase> searchResults = new ArrayList<HitBase>(); + + + + + + for ( Integer i=0; i< MAX_SEARCH_RESULTS; i++){ + String title = req.getParams().get("t"+i.toString()); + String descr = req.getParams().get("d"+i.toString()); + + if(title==null || descr==null) + continue; + + HitBase hit = new HitBase(); + hit.setTitle(title); + hit.setAbstractText(descr); + hit.setSource(i.toString()); + searchResults.add(hit); + } + + /* + * http://173.255.254.250:8983/solr/collection1/reranker/? 
+ * q=search_keywords:design+iphone+cases&fields=spend+a+day+with+a+custom+iPhone+case&fields=Add+style+to+your+every+day+fresh+design+with+a+custom+iPhone+case&fields=Add+style+to+your+every+day+with+mobile+case+for+your+family&fields=Add+style+to+your+iPhone+and+iPad&fields=Add+Apple+fashion+to+your+iPhone+and+iPad + * + */ + + if (searchResults.size()<1) { + int count=0; + for(String val : exprParts){ + if (val.startsWith("fields=")){ + val = StringUtils.mid(val, 7, val.length()); + HitBase hit = new HitBase(); + hit.setTitle(""); + hit.setAbstractText(val); + hit.setSource(new Integer(count).toString()); + searchResults.add(hit); + count++; + } + + } + } + + + List<HitBase> reRankedResults = null; + query = query.replace('+', ' '); + if (tooFewKeywords(query)|| orQuery(query)){ + reRankedResults = searchResults; + LOG.info("No re-ranking for "+query); + } + else + reRankedResults = calculateMatchScoreResortHits(searchResults, query); + /* + * <scores> +<score index="2">3.0005</score> +<score index="1">2.101</score> +<score index="3">2.1003333333333334</score> +<score index="4">2.00025</score> +<score index="5">1.1002</score> +</scores> + * + * + */ + StringBuffer buf = new StringBuffer(); + buf.append("<scores>"); + for(HitBase hit: reRankedResults){ + buf.append("<score index=\""+hit.getSource()+"\">"+hit.getGenerWithQueryScore()+"</score>"); + } + buf.append("</scores>"); + + NamedList<Object> scoreNum = new NamedList<Object>(); + for(HitBase hit: reRankedResults){ + scoreNum.add(hit.getSource(), hit.getGenerWithQueryScore()); + } + + StringBuffer bufNums = new StringBuffer(); + bufNums.append("order>"); + for(HitBase hit: reRankedResults){ + bufNums.append(hit.getSource()+"_"); + } + bufNums.append("/order>"); + + LOG.info("re-ranking results: "+buf.toString()); + NamedList<Object> values = rsp.getValues(); + values.remove("response"); + values.add("response", scoreNum); + values.add("new_order", bufNums.toString().trim()); + rsp.setAllValues(values); + + } 
+ + private boolean orQuery(String query) { + if (query.indexOf('|')>-1) + return true; + + return false; + } + + private boolean tooFewKeywords(String query) { + String[] parts = query.split(" "); + if (parts!=null && parts.length< MAX_QUERY_LENGTH_NOT_TO_RERANK) + return true; + + return false; + } + + protected List<HitBase> calculateMatchScoreResortHits(List<HitBase> hits, + String searchQuery) { + + List<HitBase> newHitList = new ArrayList<HitBase>(); + int count = 0; + for (HitBase hit : hits) { + if (count>10) + break; + count++; + String[] pageSentsAndSnippet = formTextForReRankingFromHit(hit); + + Double score = 0.0; + try { + List<List<ParseTreeChunk>> match = null; + if (pageSentsAndSnippet!=null && pageSentsAndSnippet[0].length()>50){ + match = matcher.assessRelevanceCache(pageSentsAndSnippet[0] , + searchQuery); + score = parseTreeChunkListScorer.getParseTreeChunkListScore(match); + hit.setSource(match.toString()); + } + if (score < 2){ // attempt to match with snippet, if not much luck with original text + match = matcher.assessRelevanceCache(pageSentsAndSnippet[0] , + searchQuery); + score = parseTreeChunkListScorer.getParseTreeChunkListScore(match); + } + LOG.info(score + " | " +pageSentsAndSnippet[1]); + } catch (Exception e) { + LOG.severe("Problem processing snapshot " + pageSentsAndSnippet[1]); + e.printStackTrace(); + } + hit.setGenerWithQueryScore(score); + newHitList.add(hit); + } + + System.out.println("\n\n ============= old ORDER ================= "); + for (HitBase hit : newHitList) { + System.out.println(hit.getOriginalSentences().toString() + " => "+hit.getGenerWithQueryScore()); + System.out.println("match = "+hit.getSource()); + } + Collections.sort(newHitList, new HitBaseComparable()); + + System.out.println("\n\n ============= NEW ORDER ================= "); + for (HitBase hit : newHitList) { + System.out.println(hit.getOriginalSentences().toString() + " => "+hit.getGenerWithQueryScore()); + System.out.println("match = 
"+hit.getSource()); + } + + return newHitList; + } + + protected String[] formTextForReRankingFromHit(HitBase hit) { + HitBase hitWithFullSents = snp.formTextFromOriginalPageGivenSnippet(hit); + String textFromOriginalPage = ""; + try { + List<String> sents = hitWithFullSents.getOriginalSentences(); + for(String s: sents){ + textFromOriginalPage+=s+" "; + } + + if (textFromOriginalPage.startsWith(".")){ + textFromOriginalPage = textFromOriginalPage.substring(2); + } + textFromOriginalPage = textFromOriginalPage.replace(" . .", ". ").replace(". . ", ". "). + replace("..", ". ").trim(); + } catch (Exception e1) { + e1.printStackTrace(); + LOG.info("Problem processing snapshot "+hit.getAbstractText()); + } + hit.setPageContent(textFromOriginalPage); + String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ").replace("<span class='best-phrase'>", " ").replace("<span>", " ").replace("<span>", " ") + .replace("<b>", "").replace("</b>", ""); + snapshot = snapshot.replace("</B>", "").replace("<B>", "") + .replace("<br>", "").replace("</br>", "").replace("...", ". ") + .replace("|", " ").replace(">", " ").replace(". .", ". "); + snapshot += " . " + hit.getTitle(); + + return new String[] { textFromOriginalPage, snapshot }; + } + + + public class HitBaseComparable implements Comparator<HitBase> { + // @Override + public int compare(HitBase o1, HitBase o2) { + return (o1.getGenerWithQueryScore() > o2.getGenerWithQueryScore() ? -1 + : (o1 == o2 ? 
0 : 1)); + } + } + +} + +/* + +http://dev1.exava.us:8086/solr/collection1/reranker/?q=search_keywords:I+want+style+in+my+every+day+fresh+design+iphone+cases +&t1=Personalized+iPhone+4+Cases&d1=spend+a+day+with+a+custom+iPhone+case +&t2=iPhone+Cases+to+spend+a+day&d2=Add+style+to+your+every+day+fresh+design+with+a+custom+iPhone+case +&t3=Plastic+iPhone+Cases&d3=Add+style+to+your+every+day+with+mobile+case+for+your+family +&t4=Personalized+iPhone+and+iPad+Cases&d4=Add+style+to+your+iPhone+and+iPad +&t5=iPhone+accessories+from+Apple&d5=Add+Apple+fashion+to+your+iPhone+and+iPad + +http://dev1.exava.us:8086/solr/collection1/reranker/?q=search_keywords:I+want+style+in+my+every+day+fresh+design+iphone+cases&t1=Personalized+iPhone+4+Cases&d1=spend+a+day+with+a+custom+iPhone+case&t2=iPhone+Cases+to+spend+a+day&d2=Add+style+to+your+every+day+fresh+design+with+a+custom+iPhone+case&t3=Plastic+iPhone+Cases&d3=Add+style+to+your+every+day+with+mobile+case+for+your+family&t4=Personalized+iPhone+and+iPad+Cases&d4=Add+style+to+your+iPhone+and+iPad&t5=iPhone+accessories+from+Apple&d5=Add+Apple+fashion+to+your+iPhone+and+iPad + */ \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilder.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilder.java new file mode 100644 index 0000000..3de5c1b --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilder.java @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.similarity.apps.solr; + + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import net.billylieurance.azuresearch.AzureSearchImageResult; +import net.billylieurance.azuresearch.AzureSearchResultSet; +import net.billylieurance.azuresearch.AzureSearchWebResult; + +import org.docx4j.dml.wordprocessingDrawing.Inline; +import org.docx4j.openpackaging.exceptions.Docx4JException; +import org.docx4j.openpackaging.exceptions.InvalidFormatException; +import org.docx4j.openpackaging.packages.WordprocessingMLPackage; +import org.docx4j.openpackaging.parts.WordprocessingML.BinaryPartAbstractImage; +import org.docx4j.wml.CTFootnotes; +import org.docx4j.wml.CTFtnEdn; +import org.docx4j.wml.Drawing; +import org.docx4j.wml.P; +import org.docx4j.wml.R; +import org.docx4j.wml.STFtnEdn; + +import opennlp.tools.similarity.apps.BingQueryRunner; +import opennlp.tools.similarity.apps.Fragment; +import opennlp.tools.similarity.apps.HitBase; + +public class WordDocBuilder{ + protected static final String IMG_REL_PATH = "images/"; + protected BingQueryRunner imageSearcher = new BingQueryRunner(); + protected 
String absPath = null; + + public WordDocBuilder(){ + absPath = new File(".").getAbsolutePath(); + absPath = absPath.substring(0, absPath.length()-1); + } + + public String buildWordDoc(List<HitBase> content, String title){ + + String outputDocFinename = absPath+"/written/"+ title.replace(' ','_').replace('\"', ' ').trim()+ ".docx"; + + WordprocessingMLPackage wordMLPackage; + try { + wordMLPackage = WordprocessingMLPackage.createPackage(); + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Title", title); + for(HitBase para: content){ + + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", + para.getTitle()); + String paraText = para.getFragments().toString().replace("[", "").replace("]", "").replace(" | ", "") + .replace(".,", ".").replace(".\"", "\"").replace(". .", ".") + .replace(",.", "."); + wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText); + + addImageByImageTitleToPackage(wordMLPackage, para.getTitle()); + } + + //File file = new File("C:/ma/personal/argCamp.png"); + //byte[] bytes = convertImageToByteArray(file); + //addImageToPackage(wordMLPackage, bytes); + + wordMLPackage.save(new File(outputDocFinename)); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + return outputDocFinename; + } + + private void addImageByImageTitleToPackage( + WordprocessingMLPackage wordMLPackage, String title) { + AzureSearchResultSet<AzureSearchImageResult> res = imageSearcher.runImageSearch(title); + for (AzureSearchImageResult anr : res){ + String url = anr.getMediaUrl(); + addImageByURLToPackage( wordMLPackage, url); + return; + } + + } + + private void addImageByURLToPackage(WordprocessingMLPackage wordMLPackage, + String url){ + String destinationFile = url.replace("http://", "").replace("/", "_"); + saveImageFromTheWeb(url, absPath+IMG_REL_PATH+destinationFile); + File file = new File(absPath+destinationFile); + try { + byte[] bytes = convertImageToByteArray(file); + 
addImageToPackage(wordMLPackage, bytes); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + /** + * Docx4j contains a utility method to create an image part from an array of + * bytes and then adds it to the given package. In order to be able to add this + * image to a paragraph, we have to convert it into an inline object. For this + * there is also a method, which takes a filename hint, an alt-text, two ids + * and an indication on whether it should be embedded or linked to. + * One id is for the drawing object non-visual properties of the document, and + * the second id is for the non visual drawing properties of the picture itself. + * Finally we add this inline object to the paragraph and the paragraph to the + * main document of the package. + * + * @param wordMLPackage The package we want to add the image to + * @param bytes The bytes of the image + * @throws Exception Sadly the createImageInline method throws an Exception + * (and not a more specific exception type) + * + * + */ + protected static void addImageToPackage(WordprocessingMLPackage wordMLPackage, + byte[] bytes) throws Exception { + BinaryPartAbstractImage imagePart = + BinaryPartAbstractImage.createImagePart(wordMLPackage, bytes); + + int docPrId = 1; + int cNvPrId = 2; + Inline inline = imagePart.createImageInline("Filename hint", + "Alternative text", docPrId, cNvPrId, false); + + P paragraph = addInlineImageToParagraph(inline); + + wordMLPackage.getMainDocumentPart().addObject(paragraph); + } + + /** + * We create an object factory and use it to create a paragraph and a run. + * Then we add the run to the paragraph. Next we create a drawing and + * add it to the run. Finally we add the inline object to the drawing and + * return the paragraph. 
+ * + * @param inline The inline object containing the image. + * @return the paragraph containing the image + */ + private static P addInlineImageToParagraph(Inline inline) { + // Now add the in-line image to a paragraph + org.docx4j.wml.ObjectFactory factory = new org.docx4j.wml.ObjectFactory(); + P paragraph = factory.createP(); + R run = factory.createR(); + paragraph.getContent().add(run); + Drawing drawing = factory.createDrawing(); + run.getContent().add(drawing); + drawing.getAnchorOrInline().add(inline); + return paragraph; + } + + private static CTFootnotes createFootnote(P paragraph){ + org.docx4j.wml.ObjectFactory factory = new org.docx4j.wml.ObjectFactory(); + CTFootnotes fn = factory.createCTFootnotes(); + fn.setParent(paragraph); + + //STFtnEdn sTFtnEdn = factory.createSTFtnEdn(); + CTFtnEdn fe = factory.createCTFtnEdn(); + fe.setParent(paragraph); + return fn; + } + + /** + * Convert the image from the file into an array of bytes. + * + * @param file the image file to be converted + * @return the byte array containing the bytes from the image + * @throws FileNotFoundException + * @throws IOException + */ + protected static byte[] convertImageToByteArray(File file) + throws FileNotFoundException, IOException { + InputStream is = new FileInputStream(file ); + long length = file.length(); + // You cannot create an array using a long, it needs to be an int. 
+ if (length > Integer.MAX_VALUE) { + System.out.println("File too large!!"); + } + byte[] bytes = new byte[(int)length]; + int offset = 0; + int numRead = 0; + while (offset < bytes.length && (numRead=is.read(bytes, offset, bytes.length-offset)) >= 0) { + offset += numRead; + } + // Ensure all the bytes have been read + if (offset < bytes.length) { + System.out.println("Could not completely read file " + +file.getName()); + } + is.close(); + return bytes; + } + + + + public static void saveImageFromTheWeb(String imageUrl, String destinationFile) { + try { + URL url = new URL(imageUrl); + InputStream is = url.openStream(); + if (!new File(destinationFile).exists()) { + new File(destinationFile).createNewFile(); + } + + OutputStream os = new FileOutputStream(destinationFile); + + + byte[] b = new byte[2048]; + int length; + + while ((length = is.read(b)) != -1) { + os.write(b, 0, length); + } + + is.close(); + os.close(); + } catch (MalformedURLException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + public static void main(String[] args){ + WordDocBuilder b = new WordDocBuilder(); + List<HitBase> content = new ArrayList<HitBase>(); + for(int i = 0; i<10; i++){ + HitBase h = new HitBase(); + h.setTitle("albert einstein "+i); + List<Fragment> frs = new ArrayList<Fragment>(); + frs.add(new Fragment(" content "+i, 0)); + h.setFragments(frs); + content.add(h); + } + + b.buildWordDoc(content, "mytitle"); + } +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderEndNotes.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderEndNotes.java 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderEndNotes.java new file mode 100644 index 0000000..7d46f86 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderEndNotes.java @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.similarity.apps.solr; + + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.math.BigInteger; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import javax.xml.bind.JAXBException; + +import net.billylieurance.azuresearch.AzureSearchImageResult; +import net.billylieurance.azuresearch.AzureSearchResultSet; +import net.billylieurance.azuresearch.AzureSearchWebResult; + +import org.apache.commons.lang.StringUtils; +import org.docx4j.XmlUtils; +import org.docx4j.dml.wordprocessingDrawing.Inline; +import org.docx4j.jaxb.Context; +import org.docx4j.openpackaging.exceptions.Docx4JException; +import org.docx4j.openpackaging.exceptions.InvalidFormatException; +import org.docx4j.openpackaging.packages.WordprocessingMLPackage; +import org.docx4j.openpackaging.parts.WordprocessingML.BinaryPartAbstractImage; +import org.docx4j.openpackaging.parts.WordprocessingML.EndnotesPart; +import org.docx4j.wml.CTEndnotes; +import org.docx4j.wml.CTFtnEdn; +import org.docx4j.wml.Drawing; +import org.docx4j.wml.P; +import org.docx4j.wml.R; + +import opennlp.tools.similarity.apps.BingQueryRunner; +import opennlp.tools.similarity.apps.Fragment; +import opennlp.tools.similarity.apps.HitBase; + +public class WordDocBuilderEndNotes extends WordDocBuilderSingleImageSearchCall{ + + public String buildWordDoc(List<HitBase> content, String title){ + + String outputDocFinename = absPath+"written/"+ title.replace(' ','_').replace('\"', ' ').trim()+ ".docx"; + + WordprocessingMLPackage wordMLPackage=null; + + + List<String> imageURLs = getAllImageSearchResults(title); + int count=0; + BigInteger refId = BigInteger.ONE; + try { + wordMLPackage = WordprocessingMLPackage.createPackage(); + + + CTEndnotes endnotes = null; + try { + 
EndnotesPart ep = new EndnotesPart(); + endnotes = Context.getWmlObjectFactory().createCTEndnotes(); + ep.setJaxbElement(endnotes); + wordMLPackage.getMainDocumentPart().addTargetPart(ep); + } catch (InvalidFormatException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + } + + + + + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Title", title.toUpperCase()); + for(HitBase para: content){ + if (para.getFragments()==null || para.getFragments().size()<1) // no found content in this hit + continue; + try { + String processedParaTitle = processParagraphTitle(para.getTitle()); + + if (processedParaTitle!=null && + !processedParaTitle.endsWith("..") || StringUtils.isAlphanumeric(processedParaTitle)){ + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle",processedParaTitle); + } + String paraText = processParagraphText(para.getFragments().toString()); + wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText); + + CTFtnEdn endnote = Context.getWmlObjectFactory().createCTFtnEdn(); + endnotes.getEndnote().add(endnote); + + endnote.setId(refId); + refId.add(BigInteger.ONE); + String url = para.getUrl(); + String endnoteBody = "<w:p xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\" ><w:pPr><w:pStyle w:val=\"EndnoteText\"/></w:pPr><w:r><w:rPr>" + + "<w:rStyle w:val=\"EndnoteReference\"/></w:rPr><w:endnoteRef/></w:r><w:r><w:t xml:space=\"preserve\"> "+ url + "</w:t></w:r></w:p>"; + try { + endnote.getEGBlockLevelElts().add( XmlUtils.unmarshalString(endnoteBody)); + } catch (JAXBException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + // Add the body text referencing it + String docBody = "<w:p xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\" ><w:r><w:t>"//+ paraText + /*+ refId.toString()*/ +"</w:t></w:r><w:r><w:rPr><w:rStyle w:val=\"EndnoteReference\"/></w:rPr><w:endnoteReference w:id=\""+refId.toString()+"\"/></w:r></w:p>"; + + try { + 
wordMLPackage.getMainDocumentPart().addParagraph(docBody); + } catch (JAXBException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + try { + addImageByImageURLToPackage(count, wordMLPackage, imageURLs); + } catch (Exception e) { + // no need to report issues + //e.printStackTrace(); + } + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + count++; + } + // now add URLs + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", "REFERENCES"); + for(HitBase para: content){ + if (para.getFragments()==null || para.getFragments().size()<1) // no found content in this hit + continue; + try { + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", + para.getTitle()); + String paraText = para.getUrl(); + wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText); + + + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + + try { + wordMLPackage.save(new File(outputDocFinename)); + System.out.println("Finished creating docx ="+outputDocFinename); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + try { + String fileNameToDownload = "/var/www/wrt_latest/"+title.replace(' ','_').replace('\"', ' ').trim()+ ".docx"; + wordMLPackage.save(new File(fileNameToDownload)); + System.out.println("Wrote a doc for download :"+fileNameToDownload); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + return outputDocFinename; + } + + public static String processParagraphText(String title){ + + return title.replace("[", "").replace("]", "").replace(" | ", "") + .replace(".,", ".").replace(".\"", "\"").replace(". 
.", ".") + .replace(",.", "."); + } + + public static String processParagraphTitle(String title){ + String titleDelim = title.replace('-', '&').replace('|', '&'); + String[] titleParts = titleDelim.split("&"); + + int lenCurr = -1; + String bestPart = null; + for(String candidatePart: titleParts ){ // if this part longer and does not have periods + if (lenCurr< candidatePart.length() && candidatePart.indexOf('.')<0){ + lenCurr = candidatePart.length(); + bestPart = candidatePart; + } + } + + return bestPart; + } + + + public static void main(String[] args){ + WordDocBuilderEndNotes b = new WordDocBuilderEndNotes(); + List<HitBase> content = new ArrayList<HitBase>(); + for(int i = 0; i<10; i++){ + HitBase h = new HitBase(); + h.setTitle("albert einstein "+i); + List<Fragment> frs = new ArrayList<Fragment>(); + frs.add(new Fragment(" content "+i, 0)); + h.setFragments(frs); + h.setUrl("http://www."+i+".com"); + content.add(h); + } + + b.buildWordDoc(content, "albert einstein"); + } +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderSingleImageSearchCall.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderSingleImageSearchCall.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderSingleImageSearchCall.java new file mode 100644 index 0000000..7c7c1d9 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderSingleImageSearchCall.java @@ -0,0 +1,174 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.similarity.apps.solr; + + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import net.billylieurance.azuresearch.AzureSearchImageResult; +import net.billylieurance.azuresearch.AzureSearchResultSet; +import net.billylieurance.azuresearch.AzureSearchWebResult; + +import org.apache.commons.lang.StringUtils; +//import org.docx4j.Docx4J; +//import org.docx4j.convert.out.FOSettings; +import org.docx4j.openpackaging.packages.WordprocessingMLPackage; + +import opennlp.tools.similarity.apps.ContentGeneratorSupport; +import opennlp.tools.similarity.apps.Fragment; +import opennlp.tools.similarity.apps.HitBase; + + + +public class WordDocBuilderSingleImageSearchCall extends WordDocBuilder{ + + public String buildWordDoc(List<HitBase> content, String title){ + + String outputDocFinename = absPath+"/written/"+ title.replace(' ','_').replace('\"', ' ').trim()+ ".docx"; + + WordprocessingMLPackage wordMLPackage; + List<String> imageURLs = getAllImageSearchResults(title); + int count=0; + try { + wordMLPackage = WordprocessingMLPackage.createPackage(); + 
wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Title", title.toUpperCase()); + for(HitBase para: content){ + if (para.getFragments()==null || para.getFragments().size()<1) // no found content in this hit + continue; + try { + if (!para.getTitle().endsWith("..") /*|| StringUtils.isAlphanumeric(para.getTitle())*/){ + String sectTitle = ContentGeneratorSupport.getPortionOfTitleWithoutDelimiters(para.getTitle()); + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", + sectTitle); + } + String paraText = para.getFragments().toString().replace("[", "").replace("]", "").replace(" | ", "") + .replace(".,", ".").replace(".\"", "\"").replace(". .", ".") + .replace(",.", "."); + wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText); + + try { + addImageByImageURLToPackage(count, wordMLPackage, imageURLs); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + count++; + } + // now add URLs + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", "REFERENCES"); + for(HitBase para: content){ + if (para.getFragments()==null || para.getFragments().size()<1) // no found content in this hit + continue; + try { + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", + para.getTitle()); + String paraText = para.getUrl(); + wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText); + + + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + + wordMLPackage.save(new File(outputDocFinename)); + System.out.println("Finished creating docx ="+outputDocFinename); + //TODO pdf export + /* + FOSettings foSettings = Docx4J.createFOSettings(); + foSettings.setWmlPackage(wordMLPackage); + OutputStream os = new java.io.FileOutputStream(outputDocFinename.replace(".docx", ".pdf")); + Docx4J.toFO(foSettings, os, Docx4J.FLAG_NONE); + 
System.out.println("Finished creating docx's PDF ="+outputDocFinename); + */ + + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + return outputDocFinename; + } + + protected void addImageByImageURLToPackage(int count, + WordprocessingMLPackage wordMLPackage, + List<String> imageURLs) { + if (count>imageURLs.size()-1) + return; + + String url = imageURLs.get(count); + String destinationFile = url.replace("http://", "").replace("/", "_"); + saveImageFromTheWeb(url, absPath+IMG_REL_PATH+destinationFile); + File file = new File(absPath+IMG_REL_PATH+destinationFile); + try { + byte[] bytes = convertImageToByteArray(file); + addImageToPackage(wordMLPackage, bytes); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + } + + protected List<String> getAllImageSearchResults(String title) { + List<String> imageURLs = new ArrayList<String>(); + AzureSearchResultSet<AzureSearchImageResult> res = imageSearcher.runImageSearch(title); + for(AzureSearchImageResult imResult: res){ + imageURLs.add(imResult.getMediaUrl()); + } + return imageURLs; + + } + + + public static void main(String[] args){ + WordDocBuilderSingleImageSearchCall b = new WordDocBuilderSingleImageSearchCall(); + List<HitBase> content = new ArrayList<HitBase>(); + for(int i = 0; i<10; i++){ + HitBase h = new HitBase(); + h.setTitle("albert einstein "+i); + List<Fragment> frs = new ArrayList<Fragment>(); + frs.add(new Fragment(" content "+i, 0)); + h.setFragments(frs); + content.add(h); + } + + b.buildWordDoc(content, "albert einstein"); + } +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/CsvAdapter.java 
---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/CsvAdapter.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/CsvAdapter.java new file mode 100644 index 0000000..8d718f3 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/CsvAdapter.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.similarity.apps.taxo_builder; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import opennlp.tools.jsmlearning.ProfileReaderWriter; + +public class CsvAdapter { + Map<String, List<List<String>>> lemma_AssocWords = new HashMap<String, List<List<String>>>(); + private String resourceDir=null, fileNameToImport = null; + + public CsvAdapter(){ + if (resourceDir==null) + try { + resourceDir = new File( "." 
).getCanonicalPath()+"/src/test/resources"; + } catch (IOException e) { + e.printStackTrace(); + } + fileNameToImport = resourceDir + "/taxonomies/musicTaxonomyRoot.csv"; + } + + public void importCSV(){ + List<String[]> lines = ProfileReaderWriter.readProfiles(fileNameToImport); + String topNode=null; + for(String[] line: lines){ + String line0 = extractEntity(line[0]).toLowerCase(); + List<String> path = new ArrayList<String>(); + List<List<String>> paths = new ArrayList<List<String>>(); + if (line[1]!=null && line[1].equals("1")){ + topNode = line0; + } else { + path.add(topNode); + path.add(line0); + paths.add(path); + lemma_AssocWords.put(line0, paths); + } + } + } + + private String extractEntity(String s) { + Integer[] poss = new Integer[]{s.indexOf('/'), + s.indexOf('('), s.indexOf('_')}; + + int cutPos = 100; + for(int p: poss){ + if (p>-1 && p< cutPos) + cutPos=p; + } + + if (cutPos<100) + s = s.substring(0,cutPos).trim(); + return s; + } +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/DomainTaxonomyExtender.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/DomainTaxonomyExtender.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/DomainTaxonomyExtender.java new file mode 100644 index 0000000..8538c25 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/DomainTaxonomyExtender.java @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.similarity.apps.taxo_builder; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.logging.Logger; + +import org.apache.commons.lang.StringUtils; + +import opennlp.tools.parse_thicket.matching.Matcher; +import opennlp.tools.similarity.apps.BingQueryRunner; +import opennlp.tools.similarity.apps.HitBase; +import opennlp.tools.similarity.apps.utils.StringCleaner; +import opennlp.tools.stemmer.PStemmer; +import opennlp.tools.textsimilarity.ParseTreeChunk; +import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; +import opennlp.tools.textsimilarity.SentencePairMatchResult; +import opennlp.tools.textsimilarity.TextProcessor; +import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; + +/** + + * + */ + +public class DomainTaxonomyExtender { + private static Logger LOG = Logger + .getLogger("opennlp.tools.similarity.apps.taxo_builder.DomainTaxonomyExtender"); + + private BingQueryRunner brunner = new BingQueryRunner(); + + protected static String BING_KEY = "WFoNMM706MMJ5JYfcHaSEDP+faHj3xAxt28CPljUAHA"; + Matcher matcher = new Matcher(); + + private final static String TAXO_FILENAME = "taxo_data.dat"; + + private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<String, List<List<String>>>(); + private Map<List<String>, 
List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<List<String>, List<List<String>>>(); + private PStemmer ps; + + CsvAdapter adapter = new CsvAdapter(); + + public Map<List<String>, List<List<String>>> getAssocWords_ExtendedAssocWords() { + return assocWords_ExtendedAssocWords; + } + + public Map<String, List<List<String>>> getLemma_ExtendedAssocWords() { + return lemma_ExtendedAssocWords; + } + + public void setLemma_ExtendedAssocWords( + Map<String, List<List<String>>> lemma_ExtendedAssocWords) { + this.lemma_ExtendedAssocWords = lemma_ExtendedAssocWords; + } + + public DomainTaxonomyExtender() { + ps = new PStemmer(); + adapter.importCSV(); + brunner.setKey(BING_KEY); + } + + private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk( + List<List<ParseTreeChunk>> matchList, List<String> queryWordsToRemove, + List<String> toAddAtEnd) { + List<List<String>> res = new ArrayList<List<String>>(); + for (List<ParseTreeChunk> chunks : matchList) { + List<String> wordRes = new ArrayList<String>(); + for (ParseTreeChunk ch : chunks) { + List<String> lemmas = ch.getLemmas(); + for (int w = 0; w < lemmas.size(); w++) + if ((!lemmas.get(w).equals("*")) + && ((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w).startsWith("JJ") || ch.getPOSs().get(w) + .startsWith("VB"))) && lemmas.get(w).length() > 2) { + String formedWord = lemmas.get(w); + String stemmedFormedWord = ps.stem(formedWord); + if (!stemmedFormedWord.startsWith("invalid")) + wordRes.add(formedWord); + } + } + wordRes = new ArrayList<String>(new HashSet<String>(wordRes)); + List<String> cleanedRes = new ArrayList<String>(); + for(String s: wordRes){ + if (!queryWordsToRemove.contains(s)) + cleanedRes .add(s); + } + //wordRes.removeAll(queryWordsToRemove); + if (cleanedRes.size() > 0) { + //cleanedRes.addAll(toAddAtEnd); + res.add(cleanedRes); + } + } + res = new ArrayList<List<String>>(new HashSet<List<String>>(res)); + return res; + } + + public void extendTaxonomy(String 
fileNameWithTaxonomyRoot, String domain, String lang) { + + + List<String> entries = new ArrayList<String>((adapter.lemma_AssocWords.keySet())); + try { + for (String entity : entries) { // . + List<List<String>> paths = adapter.lemma_AssocWords.get(entity); + for (List<String> taxoPath : paths) { + String query = taxoPath.toString() + " " + entity + " " + domain; + query = query.replace('[', ' ').replace(']', ' ').replace(',', ' ') + .replace('_', ' '); + List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath( + query, "", lang, 20); //30 + List<String> toRemoveFromExtension = new ArrayList<String>(taxoPath); + toRemoveFromExtension.add(entity); + toRemoveFromExtension.add(domain); + List<List<String>> resList = getCommonWordsFromList_List_ParseTreeChunk( + matchList, toRemoveFromExtension, taxoPath); + assocWords_ExtendedAssocWords.put(taxoPath, resList); + resList.add(taxoPath); + lemma_ExtendedAssocWords.put(entity, resList); + + TaxonomySerializer ser = new TaxonomySerializer(lemma_ExtendedAssocWords, + assocWords_ExtendedAssocWords); + ser.writeTaxonomy(TAXO_FILENAME); + } + } + } catch (Exception e) { + e.printStackTrace(); + System.err.println("Problem taxonomy matching"); + } + + + } + + public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query, + String domain, String lang, int numbOfHits) { + List<List<ParseTreeChunk>> genResult = new ArrayList<List<ParseTreeChunk>>(); + try { + List<HitBase> resultList = brunner.runSearch(query, numbOfHits); + + for (int i = 0; i < resultList.size(); i++) { + { + for (int j = i + 1; j < resultList.size(); j++) { + HitBase h1 = resultList.get(i); + HitBase h2 = resultList.get(j); + String snapshot1 = StringCleaner.processSnapshotForMatching(h1 + .getTitle() + " " + h1.getAbstractText()); + String snapshot2 = StringCleaner.processSnapshotForMatching(h2 + .getTitle() + " " + h2.getAbstractText()); + List<List<ParseTreeChunk>> overlaps =matcher.assessRelevance(snapshot1, snapshot2); + 
genResult.addAll(overlaps); + } + } + } + + } catch (Exception e) { + e.printStackTrace(); + System.err.print("Problem searching for "+query); + } + + return genResult; + } + + public List<String> runSearchForTaxonomyPathFlatten(String query, + String domain, String lang, int numbOfHits) { + List<String> genResult = new ArrayList<String>(); + try { + List<HitBase> resultList = brunner.runSearch(query, numbOfHits); + + for (int i = 0; i < resultList.size(); i++) { + { + for (int j = i + 1; j < resultList.size(); j++) { + HitBase h1 = resultList.get(i); + HitBase h2 = resultList.get(j); + String snapshot1 = StringCleaner.processSnapshotForMatching(h1 + .getTitle() + " " + h1.getAbstractText()); + String snapshot2 = StringCleaner.processSnapshotForMatching(h2 + .getTitle() + " " + h2.getAbstractText()); + List<String> overlaps =assessKeywordOverlap(snapshot1, snapshot2); + genResult.addAll(overlaps); + } + } + } + + } catch (Exception e) { + System.err.print("Problem searching for "+query); + } + + return genResult; + } + + + + private List<String> assessKeywordOverlap(String snapshot1, String snapshot2) { + List<String> results = new ArrayList<String>(); + List<String> firstList = TextProcessor.fastTokenize(snapshot1, false), + secondList = TextProcessor.fastTokenize(snapshot2, false); + firstList.retainAll(secondList); + for(String s: firstList){ + if (s.length()<4) + continue; + if (!StringUtils.isAlpha(s)) + continue; + results.add(s); + } + return results; + } + + public static void main(String[] args) { + DomainTaxonomyExtender self = new DomainTaxonomyExtender(); + self.extendTaxonomy("", "music", + "en"); + + } + +}
