http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GoogleAutoCompleteQueryRunner.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GoogleAutoCompleteQueryRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GoogleAutoCompleteQueryRunner.java new file mode 100644 index 0000000..8cea2d8 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GoogleAutoCompleteQueryRunner.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package opennlp.tools.similarity.apps;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.lang.StringUtils;

import opennlp.tools.similarity.apps.utils.PageFetcher;

/**
 * Queries the legacy Google auto-complete ("toolbar") endpoint and extracts
 * the suggested completion words for a raw, possibly camel-cased, expression.
 */
public class GoogleAutoCompleteQueryRunner {

  protected PageFetcher pageFetcher = new PageFetcher();

  // Pieces of the toolbar-format auto-complete URL.
  private static final String SEARCH_REQUEST = "http://google.com/complete/search?q=";
  private static final String SUFFIX = "&output=toolbar";

  /**
   * Fetches auto-complete suggestions for the given expression.
   * <p>
   * Camel-cased input is first split into words ("breadCrumbs" -&gt; "bread Crumbs").
   * When at least two suggestions come back, the word-level intersection of the
   * first two suggestions is returned as a crude noise filter; otherwise the
   * first suggestion's words are returned.
   *
   * @param rawExpr raw expression, possibly camel-cased
   * @return list of suggestion words, or {@code null} when the service returned
   *         no usable suggestions (null kept for backward compatibility)
   */
  public List<String> getAutoCompleteExpression(String rawExpr) {
    // Insert spaces into camel-cased input so the query reads as words.
    rawExpr = rawExpr.replaceAll("([a-z][a-z])([A-Z][a-z])", "$1 $2");
    String query = rawExpr;
    try {
      // URLEncoder already encodes spaces as '+'. The previous code substituted
      // '+' for spaces BEFORE encoding, which double-encoded them as "%2B".
      query = URLEncoder.encode(rawExpr, "UTF-8");
    } catch (UnsupportedEncodingException e) {
      // UTF-8 is mandated by the JVM spec, so this cannot happen in practice;
      // fall back to the unencoded query rather than aborting.
    }

    String pageOrigHTML = pageFetcher.fetchOrigHTML(SEARCH_REQUEST + query + SUFFIX);
    String[] results = StringUtils.substringsBetween(pageOrigHTML,
        "<CompleteSuggestion>", "</CompleteSuggestion>");
    if (results == null)
      return null;

    List<List<String>> accum = new ArrayList<List<String>>();
    for (String wrapped : results) {
      // The suggestion text is the attribute value between the first pair of
      // double quotes inside the <CompleteSuggestion> element.
      String quoted = StringUtils.substringBetween(wrapped, "\"");
      if (quoted == null)
        continue; // malformed entry; the old code masked the NPE with a bare catch
      String[] words = quoted.split(" ");
      if (words.length < 1)
        continue;
      accum.add(Arrays.asList(words));
    }

    // TODO make a more noise-resistant algorithm
    if (accum.size() > 1) {
      List<String> first = new ArrayList<String>(accum.get(0));
      first.retainAll(accum.get(1));
      // Non-empty intersection of the top two suggestions wins; otherwise
      // fall back to the top suggestion as-is.
      return first.size() > 0 ? first : accum.get(0);
    }
    if (accum.size() == 1)
      return accum.get(0);
    return null;
  }

  /**
   * Splits camel-cased identifiers into space-separated words, e.g.
   * "SharingInviteNotification" -&gt; "Sharing Invite Notification".
   * Text that already contains spaces passes through unchanged.
   */
  public static String handleCamelCases(String input) {
    // Applied twice so overlapping lower/upper boundaries are all split.
    String s = input.replaceAll("([a-z,A-Z][a-z])([A-Z][a-z])", "$1 $2")
        .replaceAll("([a-z,A-Z][a-z])([A-Z][a-z])", "$1 $2");
    // Split an upper-case run from a following capitalized word (WSItem -> WS Item).
    s = s.replaceAll("([A-Z])([A-Z][a-z])", "$1 $2");
    return s;
  }

  /** Demo entry point: prints camel-case splitting and live auto-complete results. */
  public static void main(String[] args) {
    String[] tests = new String[] { "SharingInviteNotification", "SharedByMeSortingOptions",
        "SharedByMeCurrentSortingOption", "GroupedPrivatelySharedByMe",
        "StorageMeter", "RecentActivities", "StorageMeter", "SharingInviteNotification",
        "RecentActivities", "ImporterSuggestionsPrefABC",
        "WSItem",
        "SharingInviteNotification",
        "UserDesktopDevices",
        "RootFoldersPaginated",
        "SharingInviteNotification", "apply security settings" };
    for (String s : tests) {
      System.out.println(handleCamelCases(s));
    }

    GoogleAutoCompleteQueryRunner runner = new GoogleAutoCompleteQueryRunner();
    List<String> res = runner.getAutoCompleteExpression("commentcount");
    System.out.println(res);
    res = runner.getAutoCompleteExpression("clearSess");
    System.out.println(res);
    res = runner.getAutoCompleteExpression("ImporterSuggestionsPref");
    System.out.println(res);
    res = runner.getAutoCompleteExpression("breadCrumbs");
    System.out.println(res);
    res = runner.getAutoCompleteExpression("RootFolder");
    System.out.println(res);
    res = runner.getAutoCompleteExpression("BreadCrumbList");
    System.out.println(res);
  }
}
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SentenceTranslate.java.txt ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SentenceTranslate.java.txt b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SentenceTranslate.java.txt new file mode 100644 index 0000000..414b8d1 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SentenceTranslate.java.txt @@ -0,0 +1,212 @@ +package opennlp.tools.similarity.apps; + + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.logging.Logger; + +import opennlp.tools.parse_thicket.ParseCorefsBuilder; +import opennlp.tools.parse_thicket.ParseThicket; +import opennlp.tools.parse_thicket.apps.SnippetToParagraph; +import opennlp.tools.parse_thicket.matching.Matcher; +import opennlp.tools.similarity.apps.BingQueryRunner; +import opennlp.tools.similarity.apps.Fragment; +import opennlp.tools.similarity.apps.GeneratedSentenceProcessor; +import opennlp.tools.similarity.apps.HitBase; +import opennlp.tools.similarity.apps.RelatedSentenceFinder; +import opennlp.tools.similarity.apps.utils.PageFetcher; +import opennlp.tools.similarity.apps.utils.Utils; +import opennlp.tools.textsimilarity.ParseTreeChunk; +import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; +import opennlp.tools.textsimilarity.SentencePairMatchResult; +import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; + +import com.memetix.mst.language.Language; +import com.memetix.mst.translate.Translate; + +import edu.stanford.nlp.ling.Sentence; +import edu.stanford.nlp.ling.TaggedWord; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import 
edu.stanford.nlp.trees.tregex.TregexPattern; + +/** + * Class for sentence translation with improvement + * based on parse thickets. + * @author Alex Loptev + * + */ +public class SentenceTranslate { + private static String clientID = "ParseThicketsTranslation"; + private static String clientSecret = "M4teDPWKv5xMTOZ/v6nJbwya4ilPE0cUCK4cCPGeRok="; + + private static Logger LOG; + private static ParseTreeChunkListScorer parseTreeChunkListScorer; + private static Matcher matcher; + private static BingQueryRunner searchRunner; + private static SnippetToParagraph sentenceRetriever; + private static final int CONSIDERABLE_SEARCH_RESULTS_COUNT = 1; + private static final double MEANINGLESS_THRESHOLD = 3.0; + private static final int MINIMUM_WORDS_IN_PHRASE_FOR_TESTING = 5; + + /** + * Substitutes translation fragments by text fragments found on the Web with enough + * similarity score with original translation. + * @param translatedSentence sentence translated by some translator (e.g. Microsoft Translator) + * @return improvedTranslation + */ + public static String improveSentenceTranslationBySimilartyAssessment(String translatedSentence) { + List<Tree> phraseNodesForTesting = formPhrasesForMeaningfulnessTesting(translatedSentence); + for (Tree phraseNode: phraseNodesForTesting) { + String phrase = Sentence.listToString(phraseNode.yield()); + String quotedPhrase = "\"" + phrase + "\""; + HitBase mostSimilarResult = null; + double mostSimilarScore = 0.0; + boolean meaningfull = false; + String[] phrases = {quotedPhrase, phrase}; + for (String p: Arrays.asList(phrases)) { + LOG.info(String.format("Meaningfulness testing for phrase: %s", p)); + List<HitBase> searchResults = searchRunner.runSearch(p, CONSIDERABLE_SEARCH_RESULTS_COUNT); + for (HitBase searchResult: searchResults) { + double score = assessSimilarityWithHitBase(phrase, searchResult); + if (score > MEANINGLESS_THRESHOLD ) { + meaningfull = true; + LOG.info(String.format("Phrase %s is meaningful. 
Score is %f", phrase, score)); + break; + } + if (mostSimilarScore < score) { + mostSimilarResult = searchResult; + mostSimilarScore = score; + } + } + if (meaningfull) + break; + } + if (!meaningfull) { + LOG.info(String.format("Phrase %s is meaningless. Maximal score is %f", phrase, mostSimilarScore)); + // TODO: replacing meaningless phrase + } + } + return ""; + } + + public static double assessSimilarityScore(String s1, String s2) { + LOG.info(String.format("Assess similarity between: \"%s\" and \"%s\"", s1, s2)); + List<List<ParseTreeChunk>> match = matcher.assessRelevance(s1, s2); + double sim = parseTreeChunkListScorer.getParseTreeChunkListScore(match); + LOG.info(String.format("Score: %f", sim)); + return sim; + } + + /** + * Assesses similarity score for phrase and search result's: + * title, snippet and appropriate document sentence. + * @param sentence + * @param searchResult + * @return similarity score + */ + private static double assessSimilarityWithHitBase(String phrase, HitBase searchResult) { + String title = searchResult.getTitle().replace("<b>", " ").replace("</b>", " ").replace(" ", " ").replace(" ", " "); + String snippet = searchResult.getAbstractText().replace("<b>", " ").replace("</b>", " ").replace(" ", " ").replace(" ", " "); + double score = Math.max(assessSimilarityScore(phrase, title), assessSimilarityScore(phrase, snippet)); + searchResult = sentenceRetriever.formTextFromOriginalPageGivenSnippet(searchResult); + List<String> sentences = searchResult.getOriginalSentences(); + for (String sentence: sentences) { + score = Math.max(score, assessSimilarityScore(phrase, sentence)); + } + return score; + } + + /** + * Creates list of phrases (L_op) from translated sentence for meaningfulness testing. + * Such list includes all the phrases which contain at least two sub-phrases. 
+ * @param sentence + * @return list of phrases containing at least two sub-phrases + */ + private static List<Tree> formPhrasesForMeaningfulnessTesting(String sentence) { + List<Tree> results = new LinkedList<Tree>(); + ParseCorefsBuilder ptBuilder = ParseCorefsBuilder.getInstance(); + ParseThicket pt = ptBuilder.buildParseThicket(sentence); + Tree t = pt.getSentences().get(0); + // tregex pattern for all nodes with at least two phrasal children + TregexPattern pattern = TregexPattern.compile("__ < (__ [ !<: __ | < (__ < __) ] $ (__ !<: __ | < (__ < __)))"); + TregexMatcher matcher = pattern.matcher(t); + while (matcher.findNextMatchingNode()) { + Tree candidate = matcher.getMatch(); + int wordsCount = 0; + // test if phrase is too short + for (TaggedWord leaf: candidate.taggedYield()) { + // if is not punctuation + if (Character.isLetter(leaf.tag().charAt(0))) { + wordsCount++; + } + } + if (wordsCount >= MINIMUM_WORDS_IN_PHRASE_FOR_TESTING) { + results.add(candidate); + } + } + // reversing phrases because the highest nodes in tree should + // be tested for meaningfulness after the lowest nodes + Collections.reverse(results); + return results; + } + + + /** + * Execute simple sentence translation by Microsoft Translation API. + * + * @param sentence sentence for translation + * @param fromLanguage sentence native language + * @param fromLanguage sentence destination language + * @return translated sentence + * @throws Exception + */ + public static String executeByMicrosoftTranslator(String text, Language fromLanguage, Language toLanguage) throws Exception { + String result = Translate.execute(text, fromLanguage, toLanguage); + LOG.info(text + " -> " + result); + return result; + } + + /** + * Execute simple sentence translation to English by Microsoft Translation API + * with sentence native language auto detection. 
+ * + * @param sentence sentence for translation + * @return translated sentence + * @throws Exception + */ + public static String executeByMicrosoftTranslator(String text) throws Exception { + return executeByMicrosoftTranslator(text, Language.AUTO_DETECT, Language.ENGLISH); + } + + public static void setMicrosoftTranslatorClientId(String clientId) { + Translate.setClientId(clientId); + } + + public static void setMicrosoftTranslatorClientSecret(String clientSecret) { + Translate.setClientSecret(clientSecret); + } + + /** + * Static initialization block. + */ + static { + Translate.setClientId(clientID); + Translate.setClientSecret(clientSecret); + searchRunner = new BingQueryRunner(); + sentenceRetriever = new SnippetToParagraph(); + matcher = new Matcher(); + parseTreeChunkListScorer = new ParseTreeChunkListScorer(); + LOG = Logger.getLogger("opennlp.tools.parse_thicket.translation.SentenceTranslate"); + } + + /** + * Dummy method for testing purposes. + * @param args + */ + public static void main(String[] args) throws Exception { + SentenceTranslate.improveSentenceTranslationBySimilartyAssessment(SentenceTranslate.executeByMicrosoftTranslator("� �����������, 8 �������, ������ ����� ����� �������� � ���������� ����� ���� ���������������� �������, ������� � ���� ���� ����� �������.")); + }} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/Comment.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/Comment.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/Comment.java new file mode 100644 index 0000000..91ba23f --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/Comment.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
package opennlp.tools.similarity.apps.solr;

import java.util.Iterator;
import java.util.List;

import org.docx4j.jaxb.Context;
import org.docx4j.openpackaging.exceptions.InvalidFormatException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.JaxbXmlPart;
import org.docx4j.openpackaging.parts.WordprocessingML.CommentsPart;
import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart;
import org.docx4j.relationships.Relationship;
import org.docx4j.relationships.Relationships;
import org.docx4j.wml.Comments;
import org.docx4j.wml.ObjectFactory;

/**
 * Ensures a docx4j {@link WordprocessingMLPackage} has a comments part
 * attached to its main document part.
 */
public class Comment {

  private final WordprocessingMLPackage wordMlPackage;

  /**
   * Wraps the given package and attaches a comments part if none is present.
   *
   * @param wordMLPack package to decorate with a comments part
   */
  public Comment(WordprocessingMLPackage wordMLPack) {
    this.wordMlPackage = wordMLPack;
    setCommentRel();
  }

  /** Adds an (empty) comments part to the main document part when missing. */
  private void setCommentRel() {
    if (!commentRelSet()) {
      try {
        CommentsPart cp = new CommentsPart();
        // A part must have minimal contents: give it an empty <w:comments>
        // root, as CommentsRel.setCommentRel in this package does. The old
        // code created the ObjectFactory but never used it, leaving the part
        // without content.
        cp.setJaxbElement(new ObjectFactory().createComments());
        wordMlPackage.getMainDocumentPart().addTargetPart(cp);
      } catch (InvalidFormatException e) {
        // TODO: propagate instead of swallowing once callers can handle it
        e.printStackTrace();
      }
    }
  }

  /**
   * Reports whether a comments relationship already exists.
   * <p>
   * The real check (scanning the owning relationship part for the comments
   * relationship identifier) is disabled because the RelationshipName util is
   * unavailable, so this conservatively reports {@code false} — matching the
   * original (commented-out) implementation.
   */
  private boolean commentRelSet() {
    JaxbXmlPart<Relationships> jxpRelShips = wordMlPackage
        .getMainDocumentPart().getOwningRelationshipPart();
    List<Relationship> mc = jxpRelShips.getJaxbElement().getRelationship();
    // TODO: scan 'mc' for RelationshipName.commentIdentifier once available.
    return false;
  }

  /** Demo: builds a minimal package with a comments part and saves it. */
  public static void main(String[] args) throws Exception {

    // Create a package
    WordprocessingMLPackage wmlPack = new WordprocessingMLPackage();

    // Create main document part
    MainDocumentPart wordDocumentPart = new MainDocumentPart();

    // Create main document part content
    org.docx4j.wml.ObjectFactory factory = Context.getWmlObjectFactory();
    org.docx4j.wml.Body body = factory.createBody();
    org.docx4j.wml.Document wmlDocumentEl = factory.createDocument();

    wmlDocumentEl.setBody(body);
    wordDocumentPart.setJaxbElement(wmlDocumentEl);
    wmlPack.addTargetPart(wordDocumentPart);

    // Part must have minimal contents: an empty <w:comments> root.
    CommentsPart cp = new CommentsPart();
    Comments comments = factory.createComments();
    cp.setJaxbElement(comments);
    wordDocumentPart.addTargetPart(cp);

    // Now you can add comments to your comments part,
    // and comment refs in your main document part.

    wmlPack.save(new java.io.File(System.getProperty("user.dir") + "/out-m.docx"));
  }
}
package opennlp.tools.similarity.apps.solr;

import java.io.File;
import java.io.IOException;
import java.math.BigInteger;

import javax.xml.bind.JAXBException;

import org.docx4j.XmlUtils;
import org.docx4j.jaxb.Context;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.exceptions.InvalidFormatException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.WordprocessingML.CommentsPart;
import org.docx4j.openpackaging.parts.WordprocessingML.EndnotesPart;
import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart;
import org.docx4j.wml.CTEndnotes;
import org.docx4j.wml.CTFtnEdn;
import org.docx4j.wml.Comments;

/**
 * Helper around a docx4j {@link WordprocessingMLPackage} that guarantees a
 * comments part exists and allows appending new comments to it.
 */
public class CommentsRel {

  private final WordprocessingMLPackage wordMlPackage;
  private final MainDocumentPart mainPart;
  private final org.docx4j.wml.ObjectFactory wmlObjectFactory;
  private final CommentsPart cmPart;

  /**
   * Wraps the given package, creating its comments part when absent.
   *
   * @param wordMLPack package whose comments part is managed
   */
  public CommentsRel(WordprocessingMLPackage wordMLPack) {
    this.wordMlPackage = wordMLPack;
    wmlObjectFactory = new org.docx4j.wml.ObjectFactory();
    setCommentRel();
    cmPart = wordMlPackage.getMainDocumentPart().getCommentsPart();
    mainPart = wordMLPack.getMainDocumentPart();
  }

  /** Adds a comments part with an empty <w:comments> root when missing. */
  private void setCommentRel() {
    if (!commentRelSet()) {
      try {
        CommentsPart cp = new CommentsPart();
        // Part must have minimal contents
        Comments comments = wmlObjectFactory.createComments();
        cp.setJaxbElement(comments);
        wordMlPackage.getMainDocumentPart().addTargetPart(cp);
      } catch (InvalidFormatException e) {
        // TODO: propagate instead of swallowing once callers can handle it
        e.printStackTrace();
      }
    }
  }

  /** Returns true when the main document part already has a comments part. */
  private boolean commentRelSet() {
    return wordMlPackage.getMainDocumentPart().getCommentsPart() != null;
  }

  /**
   * Appends a new comment authored by {@code author}.
   * <p>
   * NOTE(review): the {@code text} parameter is currently unused and no date
   * is set (requires an XMLGregorianCalendar instance) — only the author is
   * recorded, matching the original behavior. TODO: attach text and date.
   *
   * @param author comment author
   * @param text comment body (currently ignored — see note)
   */
  public void addNewComment(String author, String text) {
    org.docx4j.wml.Comments.Comment c = Context.getWmlObjectFactory().createCommentsComment();
    c.setAuthor(author);
    cmPart.getJaxbElement().getComment().add(c);
  }

  /**
   * Demo: creates a fresh package, adds an endnote plus body text referencing
   * it, and saves the result.
   * NOTE(review): input/output paths are hard-coded to a developer machine.
   */
  public static void main(String args[]) throws IOException {
    try {
      WordprocessingMLPackage mlPackage = WordprocessingMLPackage.createPackage();

      // Setup endnotes part
      EndnotesPart ep = new EndnotesPart();
      CTEndnotes endnotes = Context.getWmlObjectFactory().createCTEndnotes();
      ep.setJaxbElement(endnotes);
      mlPackage.getMainDocumentPart().addTargetPart(ep);

      CTFtnEdn endnote = Context.getWmlObjectFactory().createCTFtnEdn();
      endnotes.getEndnote().add(endnote);

      // Id 2 matches the w:id="2" reference in the body text below.
      endnote.setId(BigInteger.valueOf(2));
      String endnoteBody = "<w:p xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\" ><w:pPr><w:pStyle w:val=\"EndnoteText\"/></w:pPr><w:r><w:rPr><w:rStyle w:val=\"EndnoteReference\"/></w:rPr><w:endnoteRef/></w:r><w:r><w:t xml:space=\"preserve\"> An endnote</w:t></w:r></w:p>";
      try {
        endnote.getEGBlockLevelElts().add(XmlUtils.unmarshalString(endnoteBody));
      } catch (JAXBException e) {
        // TODO: fail fast — a malformed endnote leaves the doc inconsistent
        e.printStackTrace();
      }

      // Add the body text referencing it
      String docBody = "<w:p xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\" ><w:r><w:t>the quick brown</w:t></w:r><w:r><w:rPr><w:rStyle w:val=\"EndnoteReference\"/></w:rPr><w:endnoteReference w:id=\"2\"/></w:r></w:p>";
      try {
        mlPackage.getMainDocumentPart().addParagraph(docBody);
      } catch (JAXBException e) {
        // TODO: fail fast instead of saving a partially-built document
        e.printStackTrace();
      }

      mlPackage.save(new File("C:/workspace/TestSolr/mydoc.docx-OUT.docx"));
    } catch (Docx4JException e) {
      // TODO: surface the failure to the caller
      e.printStackTrace();
    }
  }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.similarity.apps.solr; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import opennlp.tools.similarity.apps.HitBaseComparable; +import opennlp.tools.similarity.apps.utils.Pair; +import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; +import opennlp.tools.textsimilarity.SentencePairMatchResult; +import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.CachingWrapperFilter; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryWrapperFilter; +import org.apache.lucene.search.ScoreDoc; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.CommonParams; +import 
org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.params.ShardParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.handler.RequestHandlerBase; +import org.apache.solr.handler.component.ResponseBuilder; +import org.apache.solr.handler.component.SearchComponent; +import org.apache.solr.handler.component.SearchHandler; +import org.apache.solr.handler.component.ShardHandler; +import org.apache.solr.handler.component.ShardRequest; +import org.apache.solr.handler.component.ShardResponse; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.ResultContext; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.search.DocIterator; +import org.apache.solr.search.DocList; +import org.apache.solr.search.DocSlice; +import org.apache.solr.search.QParser; +import org.apache.solr.search.SolrIndexSearcher; +import org.apache.solr.util.RTimer; +import org.apache.solr.util.SolrPluginUtils; + +public class QueryExpansionRequestHandler extends SearchHandler { + + public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp){ + try { + //System.out.println("request before ="+req); + SolrQueryRequest req1 = substituteField(req); + //System.out.println("request after ="+req1); + super.handleRequestBody(req1, rsp); + } catch (Exception e) { + e.printStackTrace(); + } + + } + + public static SolrQueryRequest substituteField(SolrQueryRequest req){ + SolrParams params = req.getParams(); + String query = params.get("q"); + System.out.println("query before ="+query); + query = query.replace(' ', '_'); + System.out.println("query after ="+query); + NamedList values = params.toNamedList(); + values.remove("q"); + values.add("q", query); + params = SolrParams.toSolrParams(values); + req.setParams(params); + return req; + } +} 
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerStanfRequestHandler.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerStanfRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerStanfRequestHandler.java new file mode 100644 index 0000000..477f022 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerStanfRequestHandler.java @@ -0,0 +1,306 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.similarity.apps.solr; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.logging.Logger; + +import opennlp.tools.parse_thicket.apps.SnippetToParagraph; +import opennlp.tools.parse_thicket.matching.Matcher; +import opennlp.tools.similarity.apps.BingQueryRunner; +import opennlp.tools.similarity.apps.HitBase; +import opennlp.tools.similarity.apps.HitBaseComparable; +import opennlp.tools.similarity.apps.utils.Pair; +import opennlp.tools.textsimilarity.ParseTreeChunk; +import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; +import opennlp.tools.textsimilarity.SentencePairMatchResult; +import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.CachingWrapperFilter; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryWrapperFilter; +import org.apache.lucene.search.ScoreDoc; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.params.ShardParams; +import 
org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.handler.component.SearchHandler; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; + + + +public class SearchResultsReRankerStanfRequestHandler extends SearchHandler { + private static Logger LOG = Logger + .getLogger("com.become.search.requestHandlers.SearchResultsReRankerRequestHandler"); + private final static int MAX_SEARCH_RESULTS = 100; + private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer(); + private int MAX_QUERY_LENGTH_NOT_TO_RERANK=3; + private Matcher matcher = new Matcher(); + private BingQueryRunner bingSearcher = new BingQueryRunner(); + private SnippetToParagraph snp = new SnippetToParagraph(); + + + public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp){ + // get query string + String requestExpression = req.getParamString(); + String[] exprParts = requestExpression.split("&"); + for(String part: exprParts){ + if (part.startsWith("q=")) + requestExpression = part; + } + String query = StringUtils.substringAfter(requestExpression, ":"); + LOG.info(requestExpression); + + + SolrParams ps = req.getOriginalParams(); + Iterator<String> iter = ps.getParameterNamesIterator(); + List<String> keys = new ArrayList<String>(); + while(iter.hasNext()){ + keys.add(iter.next()); + } + + List<HitBase> searchResults = new ArrayList<HitBase>(); + + + + + + for ( Integer i=0; i< MAX_SEARCH_RESULTS; i++){ + String title = req.getParams().get("t"+i.toString()); + String descr = req.getParams().get("d"+i.toString()); + + if(title==null || descr==null) + continue; + + HitBase hit = new HitBase(); + hit.setTitle(title); + hit.setAbstractText(descr); + hit.setSource(i.toString()); + searchResults.add(hit); + } + + /* + * http://173.255.254.250:8983/solr/collection1/reranker/? 
+ * q=search_keywords:design+iphone+cases&fields=spend+a+day+with+a+custom+iPhone+case&fields=Add+style+to+your+every+day+fresh+design+with+a+custom+iPhone+case&fields=Add+style+to+your+every+day+with+mobile+case+for+your+family&fields=Add+style+to+your+iPhone+and+iPad&fields=Add+Apple+fashion+to+your+iPhone+and+iPad + * + */ + + if (searchResults.size()<1) { + int count=0; + for(String val : exprParts){ + if (val.startsWith("fields=")){ + val = StringUtils.mid(val, 7, val.length()); + HitBase hit = new HitBase(); + hit.setTitle(""); + hit.setAbstractText(val); + hit.setSource(new Integer(count).toString()); + searchResults.add(hit); + count++; + } + + } + } + + + List<HitBase> reRankedResults = null; + query = query.replace('+', ' '); + if (tooFewKeywords(query)|| orQuery(query)){ + reRankedResults = searchResults; + LOG.info("No re-ranking for "+query); + } + else + reRankedResults = calculateMatchScoreResortHits(searchResults, query); + /* + * <scores> +<score index="2">3.0005</score> +<score index="1">2.101</score> +<score index="3">2.1003333333333334</score> +<score index="4">2.00025</score> +<score index="5">1.1002</score> +</scores> + * + * + */ + StringBuffer buf = new StringBuffer(); + buf.append("<scores>"); + for(HitBase hit: reRankedResults){ + buf.append("<score index=\""+hit.getSource()+"\">"+hit.getGenerWithQueryScore()+"</score>"); + } + buf.append("</scores>"); + + NamedList<Object> scoreNum = new NamedList<Object>(); + for(HitBase hit: reRankedResults){ + scoreNum.add(hit.getSource(), hit.getGenerWithQueryScore()); + } + + StringBuffer bufNums = new StringBuffer(); + bufNums.append("order>"); + for(HitBase hit: reRankedResults){ + bufNums.append(hit.getSource()+"_"); + } + bufNums.append("/order>"); + + LOG.info("re-ranking results: "+buf.toString()); + NamedList<Object> values = rsp.getValues(); + values.remove("response"); + values.add("response", scoreNum); + values.add("new_order", bufNums.toString().trim()); + rsp.setAllValues(values); + + } 
+ + private boolean orQuery(String query) { + if (query.indexOf('|')>-1) + return true; + + return false; + } + + private boolean tooFewKeywords(String query) { + String[] parts = query.split(" "); + if (parts!=null && parts.length< MAX_QUERY_LENGTH_NOT_TO_RERANK) + return true; + + return false; + } + + protected List<HitBase> calculateMatchScoreResortHits(List<HitBase> hits, + String searchQuery) { + + List<HitBase> newHitList = new ArrayList<HitBase>(); + int count = 0; + for (HitBase hit : hits) { + if (count>10) + break; + count++; + String[] pageSentsAndSnippet = formTextForReRankingFromHit(hit); + + Double score = 0.0; + try { + List<List<ParseTreeChunk>> match = null; + if (pageSentsAndSnippet!=null && pageSentsAndSnippet[0].length()>50){ + match = matcher.assessRelevanceCache(pageSentsAndSnippet[0] , + searchQuery); + score = parseTreeChunkListScorer.getParseTreeChunkListScore(match); + hit.setSource(match.toString()); + } + if (score < 2){ // attempt to match with snippet, if not much luck with original text + match = matcher.assessRelevanceCache(pageSentsAndSnippet[0] , + searchQuery); + score = parseTreeChunkListScorer.getParseTreeChunkListScore(match); + } + LOG.info(score + " | " +pageSentsAndSnippet[1]); + } catch (Exception e) { + LOG.severe("Problem processing snapshot " + pageSentsAndSnippet[1]); + e.printStackTrace(); + } + hit.setGenerWithQueryScore(score); + newHitList.add(hit); + } + + System.out.println("\n\n ============= old ORDER ================= "); + for (HitBase hit : newHitList) { + System.out.println(hit.getOriginalSentences().toString() + " => "+hit.getGenerWithQueryScore()); + System.out.println("match = "+hit.getSource()); + } + Collections.sort(newHitList, new HitBaseComparable()); + + System.out.println("\n\n ============= NEW ORDER ================= "); + for (HitBase hit : newHitList) { + System.out.println(hit.getOriginalSentences().toString() + " => "+hit.getGenerWithQueryScore()); + System.out.println("match = 
"+hit.getSource()); + } + + return newHitList; + } + + protected String[] formTextForReRankingFromHit(HitBase hit) { + HitBase hitWithFullSents = snp.formTextFromOriginalPageGivenSnippet(hit); + String textFromOriginalPage = ""; + try { + List<String> sents = hitWithFullSents.getOriginalSentences(); + for(String s: sents){ + textFromOriginalPage+=s+" "; + } + + if (textFromOriginalPage.startsWith(".")){ + textFromOriginalPage = textFromOriginalPage.substring(2); + } + textFromOriginalPage = textFromOriginalPage.replace(" . .", ". ").replace(". . ", ". "). + replace("..", ". ").trim(); + } catch (Exception e1) { + e1.printStackTrace(); + LOG.info("Problem processing snapshot "+hit.getAbstractText()); + } + hit.setPageContent(textFromOriginalPage); + String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ").replace("<span class='best-phrase'>", " ").replace("<span>", " ").replace("<span>", " ") + .replace("<b>", "").replace("</b>", ""); + snapshot = snapshot.replace("</B>", "").replace("<B>", "") + .replace("<br>", "").replace("</br>", "").replace("...", ". ") + .replace("|", " ").replace(">", " ").replace(". .", ". "); + snapshot += " . " + hit.getTitle(); + + return new String[] { textFromOriginalPage, snapshot }; + } + + + public class HitBaseComparable implements Comparator<HitBase> { + // @Override + public int compare(HitBase o1, HitBase o2) { + return (o1.getGenerWithQueryScore() > o2.getGenerWithQueryScore() ? -1 + : (o1 == o2 ? 
0 : 1)); + } + } + +} + +/* + +http://dev1.exava.us:8086/solr/collection1/reranker/?q=search_keywords:I+want+style+in+my+every+day+fresh+design+iphone+cases +&t1=Personalized+iPhone+4+Cases&d1=spend+a+day+with+a+custom+iPhone+case +&t2=iPhone+Cases+to+spend+a+day&d2=Add+style+to+your+every+day+fresh+design+with+a+custom+iPhone+case +&t3=Plastic+iPhone+Cases&d3=Add+style+to+your+every+day+with+mobile+case+for+your+family +&t4=Personalized+iPhone+and+iPad+Cases&d4=Add+style+to+your+iPhone+and+iPad +&t5=iPhone+accessories+from+Apple&d5=Add+Apple+fashion+to+your+iPhone+and+iPad + +http://dev1.exava.us:8086/solr/collection1/reranker/?q=search_keywords:I+want+style+in+my+every+day+fresh+design+iphone+cases&t1=Personalized+iPhone+4+Cases&d1=spend+a+day+with+a+custom+iPhone+case&t2=iPhone+Cases+to+spend+a+day&d2=Add+style+to+your+every+day+fresh+design+with+a+custom+iPhone+case&t3=Plastic+iPhone+Cases&d3=Add+style+to+your+every+day+with+mobile+case+for+your+family&t4=Personalized+iPhone+and+iPad+Cases&d4=Add+style+to+your+iPhone+and+iPad&t5=iPhone+accessories+from+Apple&d5=Add+Apple+fashion+to+your+iPhone+and+iPad + */ \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilder.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilder.java new file mode 100644 index 0000000..3de5c1b --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilder.java @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.similarity.apps.solr; + + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import net.billylieurance.azuresearch.AzureSearchImageResult; +import net.billylieurance.azuresearch.AzureSearchResultSet; +import net.billylieurance.azuresearch.AzureSearchWebResult; + +import org.docx4j.dml.wordprocessingDrawing.Inline; +import org.docx4j.openpackaging.exceptions.Docx4JException; +import org.docx4j.openpackaging.exceptions.InvalidFormatException; +import org.docx4j.openpackaging.packages.WordprocessingMLPackage; +import org.docx4j.openpackaging.parts.WordprocessingML.BinaryPartAbstractImage; +import org.docx4j.wml.CTFootnotes; +import org.docx4j.wml.CTFtnEdn; +import org.docx4j.wml.Drawing; +import org.docx4j.wml.P; +import org.docx4j.wml.R; +import org.docx4j.wml.STFtnEdn; + +import opennlp.tools.similarity.apps.BingQueryRunner; +import opennlp.tools.similarity.apps.Fragment; +import opennlp.tools.similarity.apps.HitBase; + +public class WordDocBuilder{ + protected static final String IMG_REL_PATH = "images/"; + protected BingQueryRunner imageSearcher = new BingQueryRunner(); + protected 
String absPath = null; + + public WordDocBuilder(){ + absPath = new File(".").getAbsolutePath(); + absPath = absPath.substring(0, absPath.length()-1); + } + + public String buildWordDoc(List<HitBase> content, String title){ + + String outputDocFinename = absPath+"/written/"+ title.replace(' ','_').replace('\"', ' ').trim()+ ".docx"; + + WordprocessingMLPackage wordMLPackage; + try { + wordMLPackage = WordprocessingMLPackage.createPackage(); + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Title", title); + for(HitBase para: content){ + + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", + para.getTitle()); + String paraText = para.getFragments().toString().replace("[", "").replace("]", "").replace(" | ", "") + .replace(".,", ".").replace(".\"", "\"").replace(". .", ".") + .replace(",.", "."); + wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText); + + addImageByImageTitleToPackage(wordMLPackage, para.getTitle()); + } + + //File file = new File("C:/ma/personal/argCamp.png"); + //byte[] bytes = convertImageToByteArray(file); + //addImageToPackage(wordMLPackage, bytes); + + wordMLPackage.save(new File(outputDocFinename)); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + return outputDocFinename; + } + + private void addImageByImageTitleToPackage( + WordprocessingMLPackage wordMLPackage, String title) { + AzureSearchResultSet<AzureSearchImageResult> res = imageSearcher.runImageSearch(title); + for (AzureSearchImageResult anr : res){ + String url = anr.getMediaUrl(); + addImageByURLToPackage( wordMLPackage, url); + return; + } + + } + + private void addImageByURLToPackage(WordprocessingMLPackage wordMLPackage, + String url){ + String destinationFile = url.replace("http://", "").replace("/", "_"); + saveImageFromTheWeb(url, absPath+IMG_REL_PATH+destinationFile); + File file = new File(absPath+destinationFile); + try { + byte[] bytes = convertImageToByteArray(file); + 
addImageToPackage(wordMLPackage, bytes); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + /** + * Docx4j contains a utility method to create an image part from an array of + * bytes and then adds it to the given package. In order to be able to add this + * image to a paragraph, we have to convert it into an inline object. For this + * there is also a method, which takes a filename hint, an alt-text, two ids + * and an indication on whether it should be embedded or linked to. + * One id is for the drawing object non-visual properties of the document, and + * the second id is for the non visual drawing properties of the picture itself. + * Finally we add this inline object to the paragraph and the paragraph to the + * main document of the package. + * + * @param wordMLPackage The package we want to add the image to + * @param bytes The bytes of the image + * @throws Exception Sadly the createImageInline method throws an Exception + * (and not a more specific exception type) + * + * + */ + protected static void addImageToPackage(WordprocessingMLPackage wordMLPackage, + byte[] bytes) throws Exception { + BinaryPartAbstractImage imagePart = + BinaryPartAbstractImage.createImagePart(wordMLPackage, bytes); + + int docPrId = 1; + int cNvPrId = 2; + Inline inline = imagePart.createImageInline("Filename hint", + "Alternative text", docPrId, cNvPrId, false); + + P paragraph = addInlineImageToParagraph(inline); + + wordMLPackage.getMainDocumentPart().addObject(paragraph); + } + + /** + * We create an object factory and use it to create a paragraph and a run. + * Then we add the run to the paragraph. Next we create a drawing and + * add it to the run. Finally we add the inline object to the drawing and + * return the paragraph. 
+ * + * @param inline The inline object containing the image. + * @return the paragraph containing the image + */ + private static P addInlineImageToParagraph(Inline inline) { + // Now add the in-line image to a paragraph + org.docx4j.wml.ObjectFactory factory = new org.docx4j.wml.ObjectFactory(); + P paragraph = factory.createP(); + R run = factory.createR(); + paragraph.getContent().add(run); + Drawing drawing = factory.createDrawing(); + run.getContent().add(drawing); + drawing.getAnchorOrInline().add(inline); + return paragraph; + } + + private static CTFootnotes createFootnote(P paragraph){ + org.docx4j.wml.ObjectFactory factory = new org.docx4j.wml.ObjectFactory(); + CTFootnotes fn = factory.createCTFootnotes(); + fn.setParent(paragraph); + + //STFtnEdn sTFtnEdn = factory.createSTFtnEdn(); + CTFtnEdn fe = factory.createCTFtnEdn(); + fe.setParent(paragraph); + return fn; + } + + /** + * Convert the image from the file into an array of bytes. + * + * @param file the image file to be converted + * @return the byte array containing the bytes from the image + * @throws FileNotFoundException + * @throws IOException + */ + protected static byte[] convertImageToByteArray(File file) + throws FileNotFoundException, IOException { + InputStream is = new FileInputStream(file ); + long length = file.length(); + // You cannot create an array using a long, it needs to be an int. 
+ if (length > Integer.MAX_VALUE) { + System.out.println("File too large!!"); + } + byte[] bytes = new byte[(int)length]; + int offset = 0; + int numRead = 0; + while (offset < bytes.length && (numRead=is.read(bytes, offset, bytes.length-offset)) >= 0) { + offset += numRead; + } + // Ensure all the bytes have been read + if (offset < bytes.length) { + System.out.println("Could not completely read file " + +file.getName()); + } + is.close(); + return bytes; + } + + + + public static void saveImageFromTheWeb(String imageUrl, String destinationFile) { + try { + URL url = new URL(imageUrl); + InputStream is = url.openStream(); + if (!new File(destinationFile).exists()) { + new File(destinationFile).createNewFile(); + } + + OutputStream os = new FileOutputStream(destinationFile); + + + byte[] b = new byte[2048]; + int length; + + while ((length = is.read(b)) != -1) { + os.write(b, 0, length); + } + + is.close(); + os.close(); + } catch (MalformedURLException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + public static void main(String[] args){ + WordDocBuilder b = new WordDocBuilder(); + List<HitBase> content = new ArrayList<HitBase>(); + for(int i = 0; i<10; i++){ + HitBase h = new HitBase(); + h.setTitle("albert einstein "+i); + List<Fragment> frs = new ArrayList<Fragment>(); + frs.add(new Fragment(" content "+i, 0)); + h.setFragments(frs); + content.add(h); + } + + b.buildWordDoc(content, "mytitle"); + } +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderEndNotes.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderEndNotes.java 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderEndNotes.java new file mode 100644 index 0000000..7d46f86 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderEndNotes.java @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.tools.similarity.apps.solr; + + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.math.BigInteger; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import javax.xml.bind.JAXBException; + +import net.billylieurance.azuresearch.AzureSearchImageResult; +import net.billylieurance.azuresearch.AzureSearchResultSet; +import net.billylieurance.azuresearch.AzureSearchWebResult; + +import org.apache.commons.lang.StringUtils; +import org.docx4j.XmlUtils; +import org.docx4j.dml.wordprocessingDrawing.Inline; +import org.docx4j.jaxb.Context; +import org.docx4j.openpackaging.exceptions.Docx4JException; +import org.docx4j.openpackaging.exceptions.InvalidFormatException; +import org.docx4j.openpackaging.packages.WordprocessingMLPackage; +import org.docx4j.openpackaging.parts.WordprocessingML.BinaryPartAbstractImage; +import org.docx4j.openpackaging.parts.WordprocessingML.EndnotesPart; +import org.docx4j.wml.CTEndnotes; +import org.docx4j.wml.CTFtnEdn; +import org.docx4j.wml.Drawing; +import org.docx4j.wml.P; +import org.docx4j.wml.R; + +import opennlp.tools.similarity.apps.BingQueryRunner; +import opennlp.tools.similarity.apps.Fragment; +import opennlp.tools.similarity.apps.HitBase; + +public class WordDocBuilderEndNotes extends WordDocBuilderSingleImageSearchCall{ + + public String buildWordDoc(List<HitBase> content, String title){ + + String outputDocFinename = absPath+"written/"+ title.replace(' ','_').replace('\"', ' ').trim()+ ".docx"; + + WordprocessingMLPackage wordMLPackage=null; + + + List<String> imageURLs = getAllImageSearchResults(title); + int count=0; + BigInteger refId = BigInteger.ONE; + try { + wordMLPackage = WordprocessingMLPackage.createPackage(); + + + CTEndnotes endnotes = null; + try { + 
EndnotesPart ep = new EndnotesPart(); + endnotes = Context.getWmlObjectFactory().createCTEndnotes(); + ep.setJaxbElement(endnotes); + wordMLPackage.getMainDocumentPart().addTargetPart(ep); + } catch (InvalidFormatException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + } + + + + + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Title", title.toUpperCase()); + for(HitBase para: content){ + if (para.getFragments()==null || para.getFragments().size()<1) // no found content in this hit + continue; + try { + String processedParaTitle = processParagraphTitle(para.getTitle()); + + if (processedParaTitle!=null && + !processedParaTitle.endsWith("..") || StringUtils.isAlphanumeric(processedParaTitle)){ + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle",processedParaTitle); + } + String paraText = processParagraphText(para.getFragments().toString()); + wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText); + + CTFtnEdn endnote = Context.getWmlObjectFactory().createCTFtnEdn(); + endnotes.getEndnote().add(endnote); + + endnote.setId(refId); + refId.add(BigInteger.ONE); + String url = para.getUrl(); + String endnoteBody = "<w:p xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\" ><w:pPr><w:pStyle w:val=\"EndnoteText\"/></w:pPr><w:r><w:rPr>" + + "<w:rStyle w:val=\"EndnoteReference\"/></w:rPr><w:endnoteRef/></w:r><w:r><w:t xml:space=\"preserve\"> "+ url + "</w:t></w:r></w:p>"; + try { + endnote.getEGBlockLevelElts().add( XmlUtils.unmarshalString(endnoteBody)); + } catch (JAXBException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + // Add the body text referencing it + String docBody = "<w:p xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\" ><w:r><w:t>"//+ paraText + /*+ refId.toString()*/ +"</w:t></w:r><w:r><w:rPr><w:rStyle w:val=\"EndnoteReference\"/></w:rPr><w:endnoteReference w:id=\""+refId.toString()+"\"/></w:r></w:p>"; + + try { + 
wordMLPackage.getMainDocumentPart().addParagraph(docBody); + } catch (JAXBException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + try { + addImageByImageURLToPackage(count, wordMLPackage, imageURLs); + } catch (Exception e) { + // no need to report issues + //e.printStackTrace(); + } + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + count++; + } + // now add URLs + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", "REFERENCES"); + for(HitBase para: content){ + if (para.getFragments()==null || para.getFragments().size()<1) // no found content in this hit + continue; + try { + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", + para.getTitle()); + String paraText = para.getUrl(); + wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText); + + + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + + try { + wordMLPackage.save(new File(outputDocFinename)); + System.out.println("Finished creating docx ="+outputDocFinename); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + try { + String fileNameToDownload = "/var/www/wrt_latest/"+title.replace(' ','_').replace('\"', ' ').trim()+ ".docx"; + wordMLPackage.save(new File(fileNameToDownload)); + System.out.println("Wrote a doc for download :"+fileNameToDownload); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + return outputDocFinename; + } + + public static String processParagraphText(String title){ + + return title.replace("[", "").replace("]", "").replace(" | ", "") + .replace(".,", ".").replace(".\"", "\"").replace(". 
.", ".") + .replace(",.", "."); + } + + public static String processParagraphTitle(String title){ + String titleDelim = title.replace('-', '&').replace('|', '&'); + String[] titleParts = titleDelim.split("&"); + + int lenCurr = -1; + String bestPart = null; + for(String candidatePart: titleParts ){ // if this part longer and does not have periods + if (lenCurr< candidatePart.length() && candidatePart.indexOf('.')<0){ + lenCurr = candidatePart.length(); + bestPart = candidatePart; + } + } + + return bestPart; + } + + + public static void main(String[] args){ + WordDocBuilderEndNotes b = new WordDocBuilderEndNotes(); + List<HitBase> content = new ArrayList<HitBase>(); + for(int i = 0; i<10; i++){ + HitBase h = new HitBase(); + h.setTitle("albert einstein "+i); + List<Fragment> frs = new ArrayList<Fragment>(); + frs.add(new Fragment(" content "+i, 0)); + h.setFragments(frs); + h.setUrl("http://www."+i+".com"); + content.add(h); + } + + b.buildWordDoc(content, "albert einstein"); + } +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderSingleImageSearchCall.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderSingleImageSearchCall.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderSingleImageSearchCall.java new file mode 100644 index 0000000..7c7c1d9 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderSingleImageSearchCall.java @@ -0,0 +1,174 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.similarity.apps.solr; + + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import net.billylieurance.azuresearch.AzureSearchImageResult; +import net.billylieurance.azuresearch.AzureSearchResultSet; +import net.billylieurance.azuresearch.AzureSearchWebResult; + +import org.apache.commons.lang.StringUtils; +//import org.docx4j.Docx4J; +//import org.docx4j.convert.out.FOSettings; +import org.docx4j.openpackaging.packages.WordprocessingMLPackage; + +import opennlp.tools.similarity.apps.ContentGeneratorSupport; +import opennlp.tools.similarity.apps.Fragment; +import opennlp.tools.similarity.apps.HitBase; + + + +public class WordDocBuilderSingleImageSearchCall extends WordDocBuilder{ + + public String buildWordDoc(List<HitBase> content, String title){ + + String outputDocFinename = absPath+"/written/"+ title.replace(' ','_').replace('\"', ' ').trim()+ ".docx"; + + WordprocessingMLPackage wordMLPackage; + List<String> imageURLs = getAllImageSearchResults(title); + int count=0; + try { + wordMLPackage = WordprocessingMLPackage.createPackage(); + 
wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Title", title.toUpperCase()); + for(HitBase para: content){ + if (para.getFragments()==null || para.getFragments().size()<1) // no found content in this hit + continue; + try { + if (!para.getTitle().endsWith("..") /*|| StringUtils.isAlphanumeric(para.getTitle())*/){ + String sectTitle = ContentGeneratorSupport.getPortionOfTitleWithoutDelimiters(para.getTitle()); + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", + sectTitle); + } + String paraText = para.getFragments().toString().replace("[", "").replace("]", "").replace(" | ", "") + .replace(".,", ".").replace(".\"", "\"").replace(". .", ".") + .replace(",.", "."); + wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText); + + try { + addImageByImageURLToPackage(count, wordMLPackage, imageURLs); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + count++; + } + // now add URLs + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", "REFERENCES"); + for(HitBase para: content){ + if (para.getFragments()==null || para.getFragments().size()<1) // no found content in this hit + continue; + try { + wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", + para.getTitle()); + String paraText = para.getUrl(); + wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText); + + + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + + wordMLPackage.save(new File(outputDocFinename)); + System.out.println("Finished creating docx ="+outputDocFinename); + //TODO pdf export + /* + FOSettings foSettings = Docx4J.createFOSettings(); + foSettings.setWmlPackage(wordMLPackage); + OutputStream os = new java.io.FileOutputStream(outputDocFinename.replace(".docx", ".pdf")); + Docx4J.toFO(foSettings, os, Docx4J.FLAG_NONE); + 
System.out.println("Finished creating docx's PDF ="+outputDocFinename); + */ + + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + return outputDocFinename; + } + + protected void addImageByImageURLToPackage(int count, + WordprocessingMLPackage wordMLPackage, + List<String> imageURLs) { + if (count>imageURLs.size()-1) + return; + + String url = imageURLs.get(count); + String destinationFile = url.replace("http://", "").replace("/", "_"); + saveImageFromTheWeb(url, absPath+IMG_REL_PATH+destinationFile); + File file = new File(absPath+IMG_REL_PATH+destinationFile); + try { + byte[] bytes = convertImageToByteArray(file); + addImageToPackage(wordMLPackage, bytes); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + } + + protected List<String> getAllImageSearchResults(String title) { + List<String> imageURLs = new ArrayList<String>(); + AzureSearchResultSet<AzureSearchImageResult> res = imageSearcher.runImageSearch(title); + for(AzureSearchImageResult imResult: res){ + imageURLs.add(imResult.getMediaUrl()); + } + return imageURLs; + + } + + + public static void main(String[] args){ + WordDocBuilderSingleImageSearchCall b = new WordDocBuilderSingleImageSearchCall(); + List<HitBase> content = new ArrayList<HitBase>(); + for(int i = 0; i<10; i++){ + HitBase h = new HitBase(); + h.setTitle("albert einstein "+i); + List<Fragment> frs = new ArrayList<Fragment>(); + frs.add(new Fragment(" content "+i, 0)); + h.setFragments(frs); + content.add(h); + } + + b.buildWordDoc(content, "albert einstein"); + } +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/CsvAdapter.java 
---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/CsvAdapter.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/CsvAdapter.java new file mode 100644 index 0000000..8d718f3 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/CsvAdapter.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.similarity.apps.taxo_builder; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import opennlp.tools.jsmlearning.ProfileReaderWriter; + +public class CsvAdapter { + Map<String, List<List<String>>> lemma_AssocWords = new HashMap<String, List<List<String>>>(); + private String resourceDir=null, fileNameToImport = null; + + public CsvAdapter(){ + if (resourceDir==null) + try { + resourceDir = new File( "." 
).getCanonicalPath()+"/src/test/resources"; + } catch (IOException e) { + e.printStackTrace(); + } + fileNameToImport = resourceDir + "/taxonomies/musicTaxonomyRoot.csv"; + } + + public void importCSV(){ + List<String[]> lines = ProfileReaderWriter.readProfiles(fileNameToImport); + String topNode=null; + for(String[] line: lines){ + String line0 = extractEntity(line[0]).toLowerCase(); + List<String> path = new ArrayList<String>(); + List<List<String>> paths = new ArrayList<List<String>>(); + if (line[1]!=null && line[1].equals("1")){ + topNode = line0; + } else { + path.add(topNode); + path.add(line0); + paths.add(path); + lemma_AssocWords.put(line0, paths); + } + } + } + + private String extractEntity(String s) { + Integer[] poss = new Integer[]{s.indexOf('/'), + s.indexOf('('), s.indexOf('_')}; + + int cutPos = 100; + for(int p: poss){ + if (p>-1 && p< cutPos) + cutPos=p; + } + + if (cutPos<100) + s = s.substring(0,cutPos).trim(); + return s; + } +} http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/1f97041b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/DomainTaxonomyExtender.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/DomainTaxonomyExtender.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/DomainTaxonomyExtender.java new file mode 100644 index 0000000..8538c25 --- /dev/null +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/DomainTaxonomyExtender.java @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.similarity.apps.taxo_builder; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.logging.Logger; + +import org.apache.commons.lang.StringUtils; + +import opennlp.tools.parse_thicket.matching.Matcher; +import opennlp.tools.similarity.apps.BingQueryRunner; +import opennlp.tools.similarity.apps.HitBase; +import opennlp.tools.similarity.apps.utils.StringCleaner; +import opennlp.tools.stemmer.PStemmer; +import opennlp.tools.textsimilarity.ParseTreeChunk; +import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; +import opennlp.tools.textsimilarity.SentencePairMatchResult; +import opennlp.tools.textsimilarity.TextProcessor; +import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; + +/** + + * + */ + +public class DomainTaxonomyExtender { + private static Logger LOG = Logger + .getLogger("opennlp.tools.similarity.apps.taxo_builder.DomainTaxonomyExtender"); + + private BingQueryRunner brunner = new BingQueryRunner(); + + protected static String BING_KEY = "WFoNMM706MMJ5JYfcHaSEDP+faHj3xAxt28CPljUAHA"; + Matcher matcher = new Matcher(); + + private final static String TAXO_FILENAME = "taxo_data.dat"; + + private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<String, List<List<String>>>(); + private Map<List<String>, 
List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<List<String>, List<List<String>>>(); + private PStemmer ps; + + CsvAdapter adapter = new CsvAdapter(); + + public Map<List<String>, List<List<String>>> getAssocWords_ExtendedAssocWords() { + return assocWords_ExtendedAssocWords; + } + + public Map<String, List<List<String>>> getLemma_ExtendedAssocWords() { + return lemma_ExtendedAssocWords; + } + + public void setLemma_ExtendedAssocWords( + Map<String, List<List<String>>> lemma_ExtendedAssocWords) { + this.lemma_ExtendedAssocWords = lemma_ExtendedAssocWords; + } + + public DomainTaxonomyExtender() { + ps = new PStemmer(); + adapter.importCSV(); + brunner.setKey(BING_KEY); + } + + private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk( + List<List<ParseTreeChunk>> matchList, List<String> queryWordsToRemove, + List<String> toAddAtEnd) { + List<List<String>> res = new ArrayList<List<String>>(); + for (List<ParseTreeChunk> chunks : matchList) { + List<String> wordRes = new ArrayList<String>(); + for (ParseTreeChunk ch : chunks) { + List<String> lemmas = ch.getLemmas(); + for (int w = 0; w < lemmas.size(); w++) + if ((!lemmas.get(w).equals("*")) + && ((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w).startsWith("JJ") || ch.getPOSs().get(w) + .startsWith("VB"))) && lemmas.get(w).length() > 2) { + String formedWord = lemmas.get(w); + String stemmedFormedWord = ps.stem(formedWord); + if (!stemmedFormedWord.startsWith("invalid")) + wordRes.add(formedWord); + } + } + wordRes = new ArrayList<String>(new HashSet<String>(wordRes)); + List<String> cleanedRes = new ArrayList<String>(); + for(String s: wordRes){ + if (!queryWordsToRemove.contains(s)) + cleanedRes .add(s); + } + //wordRes.removeAll(queryWordsToRemove); + if (cleanedRes.size() > 0) { + //cleanedRes.addAll(toAddAtEnd); + res.add(cleanedRes); + } + } + res = new ArrayList<List<String>>(new HashSet<List<String>>(res)); + return res; + } + + public void extendTaxonomy(String 
fileNameWithTaxonomyRoot, String domain, String lang) { + + + List<String> entries = new ArrayList<String>((adapter.lemma_AssocWords.keySet())); + try { + for (String entity : entries) { // . + List<List<String>> paths = adapter.lemma_AssocWords.get(entity); + for (List<String> taxoPath : paths) { + String query = taxoPath.toString() + " " + entity + " " + domain; + query = query.replace('[', ' ').replace(']', ' ').replace(',', ' ') + .replace('_', ' '); + List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath( + query, "", lang, 20); //30 + List<String> toRemoveFromExtension = new ArrayList<String>(taxoPath); + toRemoveFromExtension.add(entity); + toRemoveFromExtension.add(domain); + List<List<String>> resList = getCommonWordsFromList_List_ParseTreeChunk( + matchList, toRemoveFromExtension, taxoPath); + assocWords_ExtendedAssocWords.put(taxoPath, resList); + resList.add(taxoPath); + lemma_ExtendedAssocWords.put(entity, resList); + + TaxonomySerializer ser = new TaxonomySerializer(lemma_ExtendedAssocWords, + assocWords_ExtendedAssocWords); + ser.writeTaxonomy(TAXO_FILENAME); + } + } + } catch (Exception e) { + e.printStackTrace(); + System.err.println("Problem taxonomy matching"); + } + + + } + + public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query, + String domain, String lang, int numbOfHits) { + List<List<ParseTreeChunk>> genResult = new ArrayList<List<ParseTreeChunk>>(); + try { + List<HitBase> resultList = brunner.runSearch(query, numbOfHits); + + for (int i = 0; i < resultList.size(); i++) { + { + for (int j = i + 1; j < resultList.size(); j++) { + HitBase h1 = resultList.get(i); + HitBase h2 = resultList.get(j); + String snapshot1 = StringCleaner.processSnapshotForMatching(h1 + .getTitle() + " " + h1.getAbstractText()); + String snapshot2 = StringCleaner.processSnapshotForMatching(h2 + .getTitle() + " " + h2.getAbstractText()); + List<List<ParseTreeChunk>> overlaps =matcher.assessRelevance(snapshot1, snapshot2); + 
genResult.addAll(overlaps); + } + } + } + + } catch (Exception e) { + e.printStackTrace(); + System.err.print("Problem searching for "+query); + } + + return genResult; + } + + public List<String> runSearchForTaxonomyPathFlatten(String query, + String domain, String lang, int numbOfHits) { + List<String> genResult = new ArrayList<String>(); + try { + List<HitBase> resultList = brunner.runSearch(query, numbOfHits); + + for (int i = 0; i < resultList.size(); i++) { + { + for (int j = i + 1; j < resultList.size(); j++) { + HitBase h1 = resultList.get(i); + HitBase h2 = resultList.get(j); + String snapshot1 = StringCleaner.processSnapshotForMatching(h1 + .getTitle() + " " + h1.getAbstractText()); + String snapshot2 = StringCleaner.processSnapshotForMatching(h2 + .getTitle() + " " + h2.getAbstractText()); + List<String> overlaps =assessKeywordOverlap(snapshot1, snapshot2); + genResult.addAll(overlaps); + } + } + } + + } catch (Exception e) { + System.err.print("Problem searching for "+query); + } + + return genResult; + } + + + + private List<String> assessKeywordOverlap(String snapshot1, String snapshot2) { + List<String> results = new ArrayList<String>(); + List<String> firstList = TextProcessor.fastTokenize(snapshot1, false), + secondList = TextProcessor.fastTokenize(snapshot2, false); + firstList.retainAll(secondList); + for(String s: firstList){ + if (s.length()<4) + continue; + if (!StringUtils.isAlpha(s)) + continue; + results.add(s); + } + return results; + } + + public static void main(String[] args) { + DomainTaxonomyExtender self = new DomainTaxonomyExtender(); + self.extendTaxonomy("", "music", + "en"); + + } + +}
