This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 20e782969 TIKA-4343 -- remove agepredictor (#2346)
20e782969 is described below
commit 20e782969c7117ecdebfab1d66310949e1c407c6
Author: Tim Allison <[email protected]>
AuthorDate: Fri Oct 3 12:39:40 2025 -0400
TIKA-4343 -- remove agepredictor (#2346)
---
.../tika-parsers-ml/tika-age-recogniser/pom.xml | 374 ---------------------
.../tika/parser/recognition/AgeRecogniser.java | 142 --------
.../parser/recognition/AgeRecogniserConfig.java | 66 ----
.../tika/parser/recognition/AgeRecogniserTest.java | 85 -----
.../tika/parser/recognition/tika-config-age.xml | 31 --
.../tika/parser/ner/opennlp/ModelGetter.groovy | 7 +-
6 files changed, 1 insertion(+), 704 deletions(-)
diff --git a/tika-parsers/tika-parsers-ml/tika-age-recogniser/pom.xml
b/tika-parsers/tika-parsers-ml/tika-age-recogniser/pom.xml
deleted file mode 100644
index a2ac44bbd..000000000
--- a/tika-parsers/tika-parsers-ml/tika-age-recogniser/pom.xml
+++ /dev/null
@@ -1,374 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
https://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parsers-ml</artifactId>
- <version>4.0.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-age-recogniser</artifactId>
- <packaging>jar</packaging>
-
- <name>Apache Tika age recogniser</name>
- <url>http://maven.apache.org</url>
-
- <properties>
- <curator.version>5.9.0</curator.version>
- <hadoop.version>3.4.2</hadoop.version>
- </properties>
-
- <!-- we're not maintaining this module.
- Keep this here instead of cluttering the parent pom -->
- <dependencyManagement>
- <dependencies>
- <dependency>
- <groupId>com.google.code.findbugs</groupId>
- <artifactId>jsr305</artifactId>
- <version>3.0.2</version>
- </dependency>
- <dependency>
- <groupId>org.scalamacros</groupId>
- <artifactId>quasiquotes_2.10</artifactId>
- <version>2.1.1</version>
- </dependency>
- <!-- can't use 2.13.*, NoClassDefFoundError: scala/Serializable -->
- <dependency>
- <groupId>org.scala-lang</groupId>
- <artifactId>scala-library</artifactId>
- <version>2.12.19</version>
- </dependency>
- <dependency>
- <groupId>net.bytebuddy</groupId>
- <artifactId>byte-buddy</artifactId>
- <version>1.17.7</version>
- </dependency>
- <dependency>
- <groupId>org.scala-lang</groupId>
- <artifactId>scala-reflect</artifactId>
- <version>2.13.17</version>
- </dependency>
- <dependency>
- <groupId>org.scala-lang</groupId>
- <artifactId>scala-compiler</artifactId>
- <version>2.13.17</version>
- </dependency>
- <dependency>
- <groupId>commons-net</groupId>
- <artifactId>commons-net</artifactId>
- <version>${commons.net.version}</version>
- </dependency>
- <dependency>
- <groupId>com.thoughtworks.paranamer</groupId>
- <artifactId>paranamer</artifactId>
- <version>2.8.3</version>
- </dependency>
- <dependency>
- <groupId>org.apache.avro</groupId>
- <artifactId>avro</artifactId>
- <version>1.12.0</version>
- </dependency>
- <dependency>
- <groupId>commons-lang</groupId>
- <artifactId>commons-lang</artifactId>
- <version>2.6</version>
- </dependency>
- <dependency>
- <groupId>org.codehaus.jackson</groupId>
- <artifactId>jackson-core-asl</artifactId>
- <version>1.9.13</version>
- </dependency>
- <dependency>
- <groupId>org.codehaus.jackson</groupId>
- <artifactId>jackson-mapper-asl</artifactId>
- <version>1.9.13</version>
- </dependency>
- <!-- avoid convergence error with hadoop-yarn-common vs
jackson-module-jaxb-annotations -->
- <dependency>
- <groupId>javax.xml.bind</groupId>
- <artifactId>jaxb-api</artifactId>
- <version>2.3.1</version>
- </dependency>
-
- <!-- netty dependency removed, either it isn't needed,
- or our test coverage isn't good enough -->
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-math3</artifactId>
- <version>3.6.1</version>
- </dependency>
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-collections4</artifactId>
- <version>${commons.collections4.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.curator</groupId>
- <artifactId>curator-recipes</artifactId>
- <version>${curator.version}</version>
- </dependency>
-
- <!-- avoid many security and convergence problems (still not perfect) -->
- <!-- TODO spark-core_2.10 / spark-network-shuffle_2.10 (used by
age-predictor-api) use log4j1,
- and are also insecure themselves -->
- <dependency>
- <groupId>org.apache.curator</groupId>
- <artifactId>curator-framework</artifactId>
- <version>${curator.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.curator</groupId>
- <artifactId>curator-client</artifactId>
- <version>${curator.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.ivy</groupId>
- <artifactId>ivy</artifactId>
- <version>2.5.3</version>
- </dependency>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-common</artifactId>
- <version>${hadoop.version}</version>
- <exclusions>
- <exclusion>
- <groupId>org.bouncycastle</groupId>
- <artifactId>bcprov-jdk15on</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-annotations</artifactId>
- <version>${hadoop.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-mapreduce-client-core</artifactId>
- <version>${hadoop.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-client</artifactId>
- <version>${hadoop.version}</version>
- </dependency>
- <dependency>
- <groupId>io.dropwizard.metrics</groupId>
- <artifactId>metrics-core</artifactId>
- <version>4.2.37</version>
- </dependency>
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-text</artifactId>
- <version>1.14.0</version>
- </dependency>
- <dependency>
- <groupId>org.codehaus.woodstox</groupId>
- <artifactId>stax2-api</artifactId>
- <version>4.2.2</version>
- </dependency>
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-configuration2</artifactId>
- <version>2.12.0</version>
- </dependency>
- <dependency>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-mllib_2.10</artifactId>
- <version>2.2.3</version>
- </dependency>
- <dependency>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-core_2.10</artifactId>
- <version>2.2.3</version>
- </dependency>
- <dependency>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-network-shuffle_2.10</artifactId>
- <version>2.2.3</version>
- </dependency>
- <dependency>
- <groupId>org.jline</groupId>
- <artifactId>jline</artifactId>
- <version>3.30.6</version>
- </dependency>
- <dependency>
- <groupId>javax.activation</groupId>
- <artifactId>activation</artifactId>
- <version>1.1.1</version>
- </dependency>
- <dependency>
- <groupId>org.codehaus.janino</groupId>
- <artifactId>janino</artifactId>
- <version>3.1.12</version>
- </dependency>
- <dependency>
- <groupId>org.codehaus.janino</groupId>
- <artifactId>commons-compiler</artifactId>
- <version>3.1.12</version>
- </dependency>
- <dependency>
- <groupId>org.glassfish.jersey.core</groupId>
- <artifactId>jersey-common</artifactId>
- <version>3.1.11</version>
- </dependency>
- <dependency>
- <groupId>org.glassfish.hk2</groupId>
- <artifactId>osgi-resource-locator</artifactId>
- <version>3.0.0</version>
- </dependency>
- <dependency>
- <groupId>dnsjava</groupId>
- <artifactId>dnsjava</artifactId>
- <version>3.6.3</version>
- </dependency>
- <dependency>
- <groupId>com.jamesmurty.utils</groupId>
- <artifactId>java-xmlbuilder</artifactId>
- <version>1.3</version>
- </dependency>
- </dependencies>
- </dependencyManagement>
- <dependencies>
- <!-- AgePredictor client for Tika -->
- <dependency>
- <groupId>edu.usc.ir</groupId>
- <artifactId>age-predictor-api</artifactId>
- <version>1.0</version>
- <exclusions>
- <exclusion>
- <groupId>io.netty</groupId>
- <artifactId>netty</artifactId>
- </exclusion>
- <exclusion>
- <groupId>io.netty</groupId>
- <artifactId>netty-transport</artifactId>
- </exclusion>
- <exclusion>
- <groupId>io.netty</groupId>
- <artifactId>netty-buffer</artifactId>
- </exclusion>
- <exclusion>
- <groupId>commons-collections</groupId>
- <artifactId>commons-collections</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.bouncycastle</groupId>
- <artifactId>bcprov-jdk15on</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.jline</groupId>
- <artifactId>jline</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <!-- used by hadoop-common -->
- <dependency>
- <groupId>org.bouncycastle</groupId>
- <artifactId>bcprov-jdk18on</artifactId>
- </dependency>
- <!-- Test dependencies -->
- <dependency>
- <groupId>org.mockito</groupId>
- <artifactId>mockito-core</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.logging.log4j</groupId>
- <artifactId>log4j-slf4j2-impl</artifactId>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- <configuration>
- <descriptorRefs>
- <descriptorRef>jar-with-dependencies</descriptorRef>
- </descriptorRefs>
- </configuration>
- <executions>
- <execution>
- <id>make-assembly</id>
- <phase>package</phase>
- <goals>
- <goal>single</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
- <!-- dependencies in this module need to be cleaned up.
- Until TIKA-2368 is resolved, report but ignore
- vulnerabilities.
- -->
- <plugin>
- <groupId>org.sonatype.ossindex.maven</groupId>
- <artifactId>ossindex-maven-plugin</artifactId>
- <configuration>
- <fail>false</fail>
- </configuration>
- </plugin>
- <plugin>
- <groupId>org.apache.rat</groupId>
- <artifactId>apache-rat-plugin</artifactId>
- <configuration>
- <excludes>
- <exclude>model/opennlp/*.bin</exclude>
- <exclude>model/org/apache/tika/parser/recognition/**</exclude>
- </excludes>
- </configuration>
- </plugin>
- <!-- to get the OpenNLP models in the right place for AgeRecogniser:
TODO: fix AgeRecogniser in next version to load from classpath -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-antrun-plugin</artifactId>
- <executions>
- <execution>
- <phase>generate-test-resources</phase>
- <goals>
- <goal>run</goal>
- </goals>
- <configuration>
- <target>
- <copy failonerror="false"
file="${basedir}/../tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/opennlp/en-pos-maxent.bin"
todir="${basedir}/model/opennlp/" />
- <copy failonerror="false"
file="${basedir}/../tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/opennlp/en-sent.bin"
todir="${basedir}/model/opennlp/" />
- <copy failonerror="false"
file="${basedir}/../tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/opennlp/en-token.bin"
todir="${basedir}/model/opennlp/" />
- <copy failonerror="false"
file="${basedir}/../tika-age-recogniser/src/test/resources/org/apache/tika/parser/recognition/classify-bigram.bin"
todir="${basedir}/model/org/apache/tika/parser/recognition/" />
- <copy failonerror="false"
file="${basedir}/../tika-age-recogniser/src/test/resources/org/apache/tika/parser/recognition/regression-global.bin"
todir="${basedir}/model/org/apache/tika/parser/recognition/" />
- </target>
- </configuration>
- </execution>
- </executions>
- </plugin>
-
- </plugins>
- </build>
-
- <scm>
- <tag>3.0.0-rc1</tag>
- </scm>
-</project>
diff --git
a/tika-parsers/tika-parsers-ml/tika-age-recogniser/src/main/java/org/apache/tika/parser/recognition/AgeRecogniser.java
b/tika-parsers/tika-parsers-ml/tika-age-recogniser/src/main/java/org/apache/tika/parser/recognition/AgeRecogniser.java
deleted file mode 100644
index 509e4e7c0..000000000
---
a/tika-parsers/tika-parsers-ml/tika-age-recogniser/src/main/java/org/apache/tika/parser/recognition/AgeRecogniser.java
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
owlocationNameEntitieship.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.recognition;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.charset.StandardCharsets;
-import java.util.Collections;
-import java.util.Map;
-import java.util.Set;
-
-import edu.usc.irds.agepredictor.authorage.AgePredicterLocal;
-import opennlp.tools.util.InvalidFormatException;
-import org.apache.commons.io.IOUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.ContentHandler;
-
-import org.apache.tika.Tika;
-import org.apache.tika.config.Initializable;
-import org.apache.tika.config.InitializableProblemHandler;
-import org.apache.tika.config.Param;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-
-
-/**
- * Parser for extracting features from text. Below features are extracted
- *
- * <ul>
- * <li>Author Age</li>
- * </ul>
- */
-public class AgeRecogniser implements Parser, Initializable {
-
- public static final String MD_KEY_ESTIMATED_AGE_RANGE =
"Estimated-Author-Age-Range";
- public static final String MD_KEY_ESTIMATED_AGE = "Estimated-Author-Age";
- private static final long serialVersionUID = 1108439049093046832L;
- private static final Logger LOG =
LoggerFactory.getLogger(AgeRecogniser.class);
- private static final MediaType MEDIA_TYPE = MediaType.TEXT_PLAIN;
- private static final Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MEDIA_TYPE);
- private static AgePredicterLocal agePredictor;
- private static volatile boolean available = false;
- public Tika secondaryParser;
- private AgeRecogniserConfig config;
-
- public AgeRecogniser() {
- try {
- secondaryParser = new Tika(new TikaConfig());
- available = true;
- } catch (Exception e) {
- available = false;
- LOG.error("Unable to initialize secondary parser", e);
- }
- }
-
- @Override
- public void checkInitialization(InitializableProblemHandler problemHandler)
- throws TikaConfigException {
- //TODO: what do we want to check here?
- }
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
- return SUPPORTED_TYPES;
- }
-
- @Override
- public void initialize(Map<String, Param> params) throws
TikaConfigException {
- config = new AgeRecogniserConfig(params);
-
- }
-
- public AgePredicterLocal getAgePredictorClient() throws
InvalidFormatException, IOException {
- if (agePredictor == null) {
- agePredictor = new AgePredicterLocal(config.getPathClassifyModel(),
- config.getPathClassifyRegression());
- }
- return agePredictor;
- }
-
- /**
- * USED in test cases to mock response of AgeClassifier
- */
- protected static void setAgePredictorClient(AgePredicterLocal
agePredicter) {
- if (AgeRecogniser.agePredictor == null) {
- AgeRecogniser.agePredictor = agePredicter;
- }
- }
-
- @Override
- public void parse(InputStream inputStream, ContentHandler handler,
Metadata metadata,
- ParseContext context) throws IOException {
-
- this.config = context.get(AgeRecogniserConfig.class, config);
- if (!available) {
- LOG.error("Parser Unavailable, check your configuration");
- return;
- }
-
- // If content is not plain text use Tika to extract text out of
content.
- Reader reader;
- if
(MediaType.TEXT_PLAIN.toString().equals(metadata.get(Metadata.CONTENT_TYPE))) {
- reader = new InputStreamReader(inputStream,
StandardCharsets.UTF_8);
- } else {
- reader = secondaryParser.parse(inputStream);
- }
-
- // Use Spark AgePredictor to get predicted Age
- try {
- double predictAuthorAge =
getAgePredictorClient().predictAge(IOUtils.toString(reader));
-
- metadata.add(MD_KEY_ESTIMATED_AGE,
Double.toString(predictAuthorAge));
-
- } catch (Exception e) {
- LOG.error("Age Predictor is not available. Please check wiki for
detailed instructions",
- e);
- return;
- }
- }
-}
diff --git
a/tika-parsers/tika-parsers-ml/tika-age-recogniser/src/main/java/org/apache/tika/parser/recognition/AgeRecogniserConfig.java
b/tika-parsers/tika-parsers-ml/tika-age-recogniser/src/main/java/org/apache/tika/parser/recognition/AgeRecogniserConfig.java
deleted file mode 100644
index 243484955..000000000
---
a/tika-parsers/tika-parsers-ml/tika-age-recogniser/src/main/java/org/apache/tika/parser/recognition/AgeRecogniserConfig.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
owlocationNameEntitieship.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.recognition;
-
-import java.net.URL;
-import java.util.Map;
-
-import org.apache.tika.config.Param;
-
-
-/**
- * Stores URL for AgePredictor
- */
-public class AgeRecogniserConfig {
-
- private String pathClassifyModel = null;
- private String pathClassifyRegression = null;
-
- public AgeRecogniserConfig(Map<String, Param> params) {
-
- URL classifyUrl = AgeRecogniserConfig.class
-
.getResource(params.get("age.path.classify").getValue().toString());
-
- if (classifyUrl != null) {
- setPathClassifyModel(classifyUrl.getFile());
- }
-
- URL regressionUrl = AgeRecogniserConfig.class
-
.getResource(params.get("age.path.regression").getValue().toString());
-
- if (regressionUrl != null) {
- setPathClassifyRegression(regressionUrl.getFile());
- }
- }
-
- public String getPathClassifyModel() {
- return pathClassifyModel;
- }
-
- public void setPathClassifyModel(String pathClassifyModel) {
- this.pathClassifyModel = pathClassifyModel;
- }
-
- public String getPathClassifyRegression() {
- return pathClassifyRegression;
- }
-
- public void setPathClassifyRegression(String pathClassifyRegression) {
- this.pathClassifyRegression = pathClassifyRegression;
- }
-}
diff --git
a/tika-parsers/tika-parsers-ml/tika-age-recogniser/src/test/java/org/apache/tika/parser/recognition/AgeRecogniserTest.java
b/tika-parsers/tika-parsers-ml/tika-age-recogniser/src/test/java/org/apache/tika/parser/recognition/AgeRecogniserTest.java
deleted file mode 100644
index f672e78f0..000000000
---
a/tika-parsers/tika-parsers-ml/tika-age-recogniser/src/test/java/org/apache/tika/parser/recognition/AgeRecogniserTest.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.recognition;
-
-import static org.junit.jupiter.api.Assertions.assertArrayEquals;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.when;
-
-import java.io.ByteArrayInputStream;
-import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
-
-import edu.usc.irds.agepredictor.authorage.AgePredicterLocal;
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.Tika;
-import org.apache.tika.TikaTest;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.CompositeParser;
-
-
-public class AgeRecogniserTest extends TikaTest {
-
- private static final String CONFIG_FILE = "tika-config-age.xml";
- private static final String TEST_TEXT =
- "I am student at University of Southern California (USC)," +
- " located in Los Angeles . USC's football team is called
by name Trojans." +
- " Mr. John McKay was a head coach of the team from 1960 -
1975";
- private static final double TEST_AGE = 26.4;
-
- static {
- /**
- * Injecting mock AgeClassifer into AgeParser to generate test response
- */
- AgePredicterLocal mockAgeClassifier = mock(AgePredicterLocal.class);
- AgeRecogniser.setAgePredictorClient(mockAgeClassifier);
-
- try {
- when(mockAgeClassifier.predictAge(TEST_TEXT)).thenReturn(TEST_AGE);
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
-
- @Test
- public void testAgeRecogniser() throws Exception {
-
- //test config is added to resources directory
- try (InputStream is = getResourceAsStream(CONFIG_FILE);
- InputStream bis = new ByteArrayInputStream(
- TEST_TEXT.getBytes(StandardCharsets.UTF_8))) {
- TikaConfig config = new TikaConfig(is);
- Tika tika = new Tika(config);
-
- Metadata md = new Metadata();
- tika.parse(bis, md);
-
- assertArrayEquals(new
String[]{CompositeParser.class.getCanonicalName(),
- AgeRecogniser.class.getCanonicalName()},
- md.getValues(TikaCoreProperties.TIKA_PARSED_BY),
- "Age Parser not invoked.");
- assertArrayEquals(
- new String[]{Double.toString(TEST_AGE)},
- md.getValues(AgeRecogniser.MD_KEY_ESTIMATED_AGE),
- "Wrong age predicted.");
- }
- }
-
-}
diff --git
a/tika-parsers/tika-parsers-ml/tika-age-recogniser/src/test/resources/org/apache/tika/parser/recognition/tika-config-age.xml
b/tika-parsers/tika-parsers-ml/tika-age-recogniser/src/test/resources/org/apache/tika/parser/recognition/tika-config-age.xml
deleted file mode 100644
index 7f214dd5e..000000000
---
a/tika-parsers/tika-parsers-ml/tika-age-recogniser/src/test/resources/org/apache/tika/parser/recognition/tika-config-age.xml
+++ /dev/null
@@ -1,31 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.recognition.AgeRecogniser">
- <mime>text/plain</mime>
- <mime>text/html</mime>
- <mime>application/xhtml+xml</mime>
- <params>
- <param name="age.path.classify"
type="string">classify-bigram.bin</param>
- <param name="age.path.regression"
type="string">regression-global.bin</param>
- </params>
- </parser>
- </parsers>
-
-</properties>
diff --git
a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy
b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy
index 88e29ddbd..cf054f2ec 100644
---
a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy
+++
b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy
@@ -95,14 +95,11 @@ if (mvnProxies && mvnProxies.size() > 0) {
def urlPrefix = "http://opennlp.sourceforge.net/models-1.5"
def prefixPath = "src/test/resources/org/apache/tika/parser/ner/opennlp/"
-def ageUrlPrefix =
"https://raw.githubusercontent.com/USCDataScience/AgePredictor/master/model"
-def agePrefixPath = "src/test/resources/org/apache/tika/parser/recognition/"
// detecting proper path for test resources
if (new File("tika-parsers").exists() && new File("tika-app").exists() ) {
// running from parent maven project, but resources should go to sub-module
prefixPath = "tika-parsers/tika-parsers-ml/tika-parser-nlp-module/" +
prefixPath
- agePrefixPath = "tika-parsers/tika-parsers-ml/tika-age-recogniser/" +
agePrefixPath
}
def modelFiles = //filePath : url
@@ -112,9 +109,7 @@ def modelFiles = //filePath : url
(prefixPath + "en-pos-maxent.bin"): (urlPrefix +
"/en-pos-maxent.bin"),
(prefixPath + "en-sent.bin"): (urlPrefix + "/en-sent.bin"),
(prefixPath + "en-token.bin"): (urlPrefix + "/en-token.bin"),
- (prefixPath + "ner-date.bin"): (urlPrefix + "/en-ner-date.bin"),
- (agePrefixPath + "classify-bigram.bin"): (ageUrlPrefix +
"/classify-bigram.bin"),
- (agePrefixPath + "regression-global.bin"): (ageUrlPrefix +
"/regression-global.bin")]
+ (prefixPath + "ner-date.bin"): (urlPrefix + "/en-ner-date.bin")]
for (def entry : modelFiles) {
File file = new File(entry.key)