This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch experimental/cleanup-dependency-mess-of-opennlp-similarity in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit f379f033e98cc80c12be2c7a8973ca4c075ff554 Author: Martin Wiesner <[email protected]> AuthorDate: Tue Dec 10 11:26:20 2024 +0100 reorganizes dependencies of 'opennlp-similarity' component switches 'tika-app' to more lightweight 'tika-core' dep switches 'docx4j' to more lightweight / modern 'docx4j-core' dep (11.5.1, jakarta) --- opennlp-similarity/pom.xml | 77 ++++++++++++---------- .../tools/apps/utils/email/EmailSender.java | 26 ++++---- .../tools/apps/utils/email/SMTPAuthenticator.java | 4 +- ...cClassifierTrainingSetMultilingualExtender.java | 6 +- .../DocClassifierTrainingSetVerifier.java | 4 +- .../enron_email_recognizer/EmailNormalizer.java | 13 ++-- .../EmailTrainingSetFormer.java | 9 ++- .../similarity/apps/ContentGeneratorRunner.java | 8 +-- .../tools/similarity/apps/solr/CommentsRel.java | 2 +- .../apps/solr/ContentGeneratorRequestHandler.java | 4 +- .../apps/solr/WordDocBuilderEndNotes.java | 45 ++++++------- pom.xml | 18 ++++- 12 files changed, 118 insertions(+), 98 deletions(-) diff --git a/opennlp-similarity/pom.xml b/opennlp-similarity/pom.xml index 58dd8a2..1e4753a 100644 --- a/opennlp-similarity/pom.xml +++ b/opennlp-similarity/pom.xml @@ -27,6 +27,12 @@ <name>Apache OpenNLP Similarity distribution</name> <properties> + <jakarta.bind-api.version>4.0.2</jakarta.bind-api.version> + <jakarta.mail.version>2.1.3</jakarta.mail.version> + + <tika.version>3.0.0</tika.version> + <solr.version>8.11.3</solr.version> + <docx4j.version>11.5.1</docx4j.version> <dl4j.version>1.0.0-M2.1</dl4j.version> <hdf5.version>1.14.3-1.5.10</hdf5.version> <javacpp.version>1.5.11</javacpp.version> @@ -85,25 +91,23 @@ </dependency> <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-api</artifactId> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <scope>runtime</scope> </dependency> - <dependency> - <groupId>commons-lang</groupId> - <artifactId>commons-lang</artifactId> + <groupId>org.apache.commons</groupId> + 
<artifactId>commons-math3</artifactId> </dependency> <dependency> - <groupId>commons-codec</groupId> - <artifactId>commons-codec</artifactId> + <groupId>jakarta.xml.bind</groupId> + <artifactId>jakarta.xml.bind-api</artifactId> + <version>${jakarta.bind-api.version}</version> </dependency> <dependency> - <groupId>commons-collections</groupId> - <artifactId>commons-collections</artifactId> - </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-math3</artifactId> + <groupId>jakarta.mail</groupId> + <artifactId>jakarta.mail-api</artifactId> + <version>${jakarta.mail.version}</version> </dependency> <dependency> <groupId>org.json</groupId> @@ -112,19 +116,20 @@ </dependency> <dependency> <groupId>org.apache.tika</groupId> - <artifactId>tika-app</artifactId> - <version>3.0.0</version> + <artifactId>tika-core</artifactId> + <version>${tika.version}</version> </dependency> <dependency> - <groupId>net.sf.opencsv</groupId> - <artifactId>opencsv</artifactId> - <version>2.3</version> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-html-module</artifactId> + <version>${tika.version}</version> + <scope>runtime</scope> </dependency> <dependency> <groupId>org.apache.solr</groupId> <artifactId>solr-core</artifactId> - <version>8.11.3</version> + <version>${solr.version}</version> <exclusions> <exclusion> <groupId>org.apache.hadoop</groupId> @@ -138,14 +143,13 @@ <groupId>org.eclipse.jetty.http2</groupId> <artifactId>*</artifactId> </exclusion> + <exclusion> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>*</artifactId> + </exclusion> </exclusions> </dependency> - <dependency> - <groupId>javax.mail</groupId> - <artifactId>mail</artifactId> - <version>1.4.7</version> - </dependency> <dependency> <groupId>com.restfb</groupId> <artifactId>restfb</artifactId> @@ -181,8 +185,8 @@ <dependency> <groupId>org.docx4j</groupId> - <artifactId>docx4j</artifactId> - <version>6.1.2</version> + <artifactId>docx4j-core</artifactId> + 
<version>${docx4j.version}</version> <exclusions> <!-- Exclusion here as log4j version 2 bindings are used during tests/runtime--> <exclusion> @@ -217,11 +221,7 @@ </exclusion> </exclusions> </dependency> - <dependency> - <groupId>org.deeplearning4j</groupId> - <artifactId>deeplearning4j-ui</artifactId> - <version>${dl4j.version}</version> - </dependency> + <dependency> <groupId>org.deeplearning4j</groupId> <artifactId>deeplearning4j-nlp</artifactId> @@ -252,10 +252,15 @@ <groupId>org.junit.jupiter</groupId> <artifactId>junit-jupiter-params</artifactId> </dependency> + + <!-- Logging --> <dependency> - <groupId>org.apache.logging.log4j</groupId> - <artifactId>log4j-api</artifactId> - <scope>test</scope> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>log4j-over-slf4j</artifactId> </dependency> <dependency> <groupId>org.apache.logging.log4j</groupId> @@ -265,7 +270,7 @@ <dependency> <groupId>org.apache.logging.log4j</groupId> <artifactId>log4j-slf4j2-impl</artifactId> - <scope>test</scope> + <scope>runtime</scope> </dependency> </dependencies> @@ -444,7 +449,7 @@ <configuration> <source>${maven.compiler.source}</source> <target>${maven.compiler.target}</target> - <compilerArgument>-Xlint</compilerArgument> + <compilerArgument>-Xlint:-options</compilerArgument> </configuration> </plugin> diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/EmailSender.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/EmailSender.java index c5388fa..94ba811 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/EmailSender.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/EmailSender.java @@ -17,19 +17,19 @@ package opennlp.tools.apps.utils.email; -import javax.activation.DataHandler; -import javax.activation.DataSource; -import javax.activation.FileDataSource; -import javax.mail.Authenticator; 
-import javax.mail.BodyPart; -import javax.mail.Message; -import javax.mail.Multipart; -import javax.mail.Session; -import javax.mail.Transport; -import javax.mail.internet.InternetAddress; -import javax.mail.internet.MimeBodyPart; -import javax.mail.internet.MimeMessage; -import javax.mail.internet.MimeMultipart; +import jakarta.activation.DataHandler; +import jakarta.activation.DataSource; +import jakarta.activation.FileDataSource; +import jakarta.mail.Authenticator; +import jakarta.mail.BodyPart; +import jakarta.mail.Message; +import jakarta.mail.Multipart; +import jakarta.mail.Session; +import jakarta.mail.Transport; +import jakarta.mail.internet.InternetAddress; +import jakarta.mail.internet.MimeBodyPart; +import jakarta.mail.internet.MimeMessage; +import jakarta.mail.internet.MimeMultipart; import java.util.Properties; import java.util.regex.Matcher; import java.util.regex.Pattern; diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/SMTPAuthenticator.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/SMTPAuthenticator.java index c48ab34..55f56dd 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/SMTPAuthenticator.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/SMTPAuthenticator.java @@ -17,12 +17,12 @@ package opennlp.tools.apps.utils.email; -import javax.mail.PasswordAuthentication; +import jakarta.mail.PasswordAuthentication; /** * This contains the required information for the smtp authorization! 
*/ -public class SMTPAuthenticator extends javax.mail.Authenticator { +public class SMTPAuthenticator extends jakarta.mail.Authenticator { private final String username; private final String password; diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java index 29a5107..18d778c 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java @@ -27,11 +27,11 @@ import java.net.URL; import java.nio.channels.Channels; import java.nio.channels.ReadableByteChannel; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.util.ArrayList; import java.util.HashSet; import java.util.List; -import org.apache.commons.io.FileUtils; import org.apache.commons.lang.StringUtils; /* @@ -86,7 +86,7 @@ public class DocClassifierTrainingSetMultilingualExtender { List<String> filteredEntries = new ArrayList<>(); String content=null; try { - content = FileUtils.readFileToString(new File(filename), StandardCharsets.UTF_8); + content = Files.readString(new File(filename).toPath(), StandardCharsets.UTF_8); } catch (IOException e) { e.printStackTrace(); } @@ -127,7 +127,7 @@ public class DocClassifierTrainingSetMultilingualExtender { continue; System.out.println("processing "+f.getName()); - content = FileUtils.readFileToString(f, "utf-8"); + content = Files.readString(f.toPath(), StandardCharsets.UTF_8); int langIndex =0; for(String[] begEnd: MULTILINGUAL_TOKENS){ String urlDirty = StringUtils.substringBetween(content, begEnd[0], begEnd[1]); diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java 
b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java index 95c2b27..d774c4d 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java @@ -18,12 +18,12 @@ package opennlp.tools.doc_classifier; import java.io.File; import java.io.IOException; +import java.nio.file.Files; import java.util.ArrayList; import java.util.List; import opennlp.tools.jsmlearning.ProfileReaderWriter; -import org.apache.commons.io.FileUtils; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; @@ -96,7 +96,7 @@ public class DocClassifierTrainingSetVerifier { && resultsClassif.get(0).equals( ClassifierTrainingSetIndexer.getCategoryFromFilePath(f.getAbsolutePath()))){ String destFileName = f.getAbsolutePath().replace(sourceDir, destinationDir); - FileUtils.copyFile(f, new File(destFileName)); + Files.copy(f.toPath(), new File(destFileName).toPath()); bRejected = false; } else { System.out.println("File "+ f.getAbsolutePath() + "\n classified as "+ diff --git a/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailNormalizer.java b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailNormalizer.java index 6e1ebe9..3fde124 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailNormalizer.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailNormalizer.java @@ -20,10 +20,9 @@ package opennlp.tools.enron_email_recognizer; import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.util.ArrayList; -import org.apache.commons.io.FileUtils; - public class EmailNormalizer { protected final ArrayList<File> queue = new ArrayList<>(); @@ -67,7 +66,7 @@ public class EmailNormalizer { public void 
normalizeAndWriteIntoANewFile(File f){ String content = ""; try { - content = FileUtils.readFileToString(f, StandardCharsets.UTF_8); + content = Files.readString(f.toPath(), StandardCharsets.UTF_8); } catch (IOException e) { e.printStackTrace(); } @@ -95,10 +94,10 @@ public class EmailNormalizer { String directoryNew = f.getAbsolutePath().replace(origFolder, newFolder); try { String fullFileNameNew = directoryNew +"txt"; - FileUtils.writeStringToFile(new File(fullFileNameNew), buf.toString(), StandardCharsets.UTF_8); - } catch (IOException e) { - e.printStackTrace(); - } + Files.writeString(new File(fullFileNameNew).toPath(), buf.toString(), StandardCharsets.UTF_8); + } catch (IOException e) { + e.printStackTrace(); + } } public void normalizeDirectory(File f){ diff --git a/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailTrainingSetFormer.java b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailTrainingSetFormer.java index 1a8ce6d..2551052 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailTrainingSetFormer.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailTrainingSetFormer.java @@ -20,10 +20,9 @@ package opennlp.tools.enron_email_recognizer; import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.util.List; -import org.apache.commons.io.FileUtils; - public class EmailTrainingSetFormer { static final String DATA_DIR = "/Users/bgalitsky/Downloads/"; static final String FILE_LIST_FILE = "cats4_11-17.txt"; @@ -32,14 +31,14 @@ public class EmailTrainingSetFormer { //enron_with_categories/5/70665.cats:4,10,1 public static void createPosTrainingSet(){ try { - List<String> lines = FileUtils.readLines(new File(DATA_DIR + FILE_LIST_FILE), StandardCharsets.UTF_8); + List<String> lines = Files.readAllLines(new File(DATA_DIR + FILE_LIST_FILE).toPath(), StandardCharsets.UTF_8); 
for(String l: lines){ int endOfFname = l.indexOf('.'), startOfFname = l.lastIndexOf('/'); String filenameOld = DATA_DIR + l.substring(0, endOfFname)+".txt"; String content = normalize(new File(filenameOld)); String filenameNew = DESTINATION_DIR + l.substring(startOfFname+1, endOfFname)+".txt"; //FileUtils.copyFile(new File(filenameOld), new File(filenameNew)); - FileUtils.writeStringToFile(new File(filenameNew), content, StandardCharsets.UTF_8); + Files.writeString(new File(filenameNew).toPath(), content, StandardCharsets.UTF_8); } } catch (Exception e) { e.printStackTrace(); @@ -52,7 +51,7 @@ public class EmailTrainingSetFormer { public static String normalize(File f){ String content=""; try { - content = FileUtils.readFileToString(f, StandardCharsets.UTF_8); + content = Files.readString(f.toPath(), StandardCharsets.UTF_8); } catch (IOException e) { e.printStackTrace(); } diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java index b6bc2b1..58d0527 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java @@ -18,17 +18,17 @@ package opennlp.tools.similarity.apps; import java.util.List; -import javax.mail.internet.AddressException; -import javax.mail.internet.InternetAddress; +import jakarta.mail.internet.AddressException; +import jakarta.mail.internet.InternetAddress; import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; public class ContentGeneratorRunner { public static void main(String[] args) { - ParserChunker2MatcherProcessor sm = null; - + try { String resourceDir = args[2]; + ParserChunker2MatcherProcessor sm; if (resourceDir!=null) sm = ParserChunker2MatcherProcessor.getInstance(resourceDir); else diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/CommentsRel.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/CommentsRel.java index e80e94e..85c4714 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/CommentsRel.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/CommentsRel.java @@ -23,7 +23,7 @@ import java.io.File; import java.io.IOException; import java.math.BigInteger; -import javax.xml.bind.JAXBException; +import jakarta.xml.bind.JAXBException; import org.docx4j.XmlUtils; import org.docx4j.jaxb.Context; diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java index a40c0bb..19e935d 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java @@ -25,8 +25,8 @@ import java.io.InputStreamReader; import java.util.List; import java.util.logging.Logger; -import javax.mail.internet.AddressException; -import javax.mail.internet.InternetAddress; +import jakarta.mail.internet.AddressException; +import jakarta.mail.internet.InternetAddress; import org.apache.solr.common.util.NamedList; import org.apache.solr.handler.component.SearchHandler; diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderEndNotes.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderEndNotes.java index afe37fc..dcda0ce 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderEndNotes.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderEndNotes.java @@ -16,15 +16,11 @@ */ package opennlp.tools.similarity.apps.solr; - 
import java.io.File; import java.math.BigInteger; import java.util.ArrayList; import java.util.List; -import javax.xml.bind.JAXBException; - -import org.apache.commons.lang.StringUtils; import org.docx4j.XmlUtils; import org.docx4j.jaxb.Context; import org.docx4j.openpackaging.exceptions.InvalidFormatException; @@ -69,7 +65,7 @@ public class WordDocBuilderEndNotes extends WordDocBuilderSingleImageSearchCall{ String processedParaTitle = processParagraphTitle(para.getTitle()); if (processedParaTitle!=null && - !processedParaTitle.endsWith("..") || StringUtils.isAlphanumeric(processedParaTitle)){ + !processedParaTitle.endsWith("..") || processedParaTitle.chars().allMatch(this::isAlphanumeric)){ wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle",processedParaTitle); } String paraText = processParagraphText(para.getFragments().toString()); @@ -85,7 +81,7 @@ public class WordDocBuilderEndNotes extends WordDocBuilderSingleImageSearchCall{ "<w:rStyle w:val=\"EndnoteReference\"/></w:rPr><w:endnoteRef/></w:r><w:r><w:t xml:space=\"preserve\"> "+ url + "</w:t></w:r></w:p>"; try { endnote.getEGBlockLevelElts().add( XmlUtils.unmarshalString(endnoteBody)); - } catch (JAXBException e) { + } catch (Exception e) { e.printStackTrace(); } @@ -95,7 +91,7 @@ public class WordDocBuilderEndNotes extends WordDocBuilderSingleImageSearchCall{ try { wordMLPackage.getMainDocumentPart().addParagraph(docBody); - } catch (JAXBException e) { + } catch (Exception e) { e.printStackTrace(); } @@ -172,20 +168,25 @@ public class WordDocBuilderEndNotes extends WordDocBuilderSingleImageSearchCall{ return bestPart; } + private boolean isAlphanumeric(final int codePoint) { + return (codePoint >= 65 && codePoint <= 90) || + (codePoint >= 97 && codePoint <= 122) || + (codePoint >= 48 && codePoint <= 57); + } - public static void main(String[] args){ - WordDocBuilderEndNotes b = new WordDocBuilderEndNotes(); - List<HitBase> content = new ArrayList<>(); - for(int i = 0; i<10; i++){ - 
HitBase h = new HitBase(); - h.setTitle("albert einstein "+i); - List<Fragment> frs = new ArrayList<>(); - frs.add(new Fragment(" content "+i, 0)); - h.setFragments(frs); - h.setUrl("http://www."+i+".com"); - content.add(h); - } - - b.buildWordDoc(content, "albert einstein"); - } + public static void main(String[] args){ + WordDocBuilderEndNotes b = new WordDocBuilderEndNotes(); + List<HitBase> content = new ArrayList<>(); + for(int i = 0; i<10; i++){ + HitBase h = new HitBase(); + h.setTitle("albert einstein "+i); + List<Fragment> frs = new ArrayList<>(); + frs.add(new Fragment(" content "+i, 0)); + h.setFragments(frs); + h.setUrl("http://www."+i+".com"); + content.add(h); + } + + b.buildWordDoc(content, "albert einstein"); + } } diff --git a/pom.xml b/pom.xml index c2f4a52..e98b18d 100644 --- a/pom.xml +++ b/pom.xml @@ -158,22 +158,38 @@ <artifactId>slf4j-api</artifactId> <version>${slf4j.version}</version> </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>log4j-over-slf4j</artifactId> + <version>${slf4j.version}</version> + <scope>runtime</scope> + </dependency> <dependency> <groupId>commons-lang</groupId> <artifactId>commons-lang</artifactId> <version>2.6</version> </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <version>2.18.0</version> + </dependency> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-lang3</artifactId> - <version>3.12.0</version> + <version>3.17.0</version> </dependency> <dependency> <groupId>commons-codec</groupId> <artifactId>commons-codec</artifactId> <version>1.15</version> </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-math3</artifactId> + <version>3.6.1</version> + </dependency> <dependency> <groupId>commons-logging</groupId> <artifactId>commons-logging</artifactId>
