http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/main/MudrodEngine.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/main/MudrodEngine.java b/core/src/main/java/org/apache/sdap/mudrod/main/MudrodEngine.java new file mode 100644 index 0000000..341d5fc --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/main/MudrodEngine.java @@ -0,0 +1,456 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.main; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.GnuParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.Options; +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.sdap.mudrod.discoveryengine.DiscoveryEngineAbstract; +import org.apache.sdap.mudrod.discoveryengine.MetadataDiscoveryEngine; +import org.apache.sdap.mudrod.discoveryengine.OntologyDiscoveryEngine; +import org.apache.sdap.mudrod.discoveryengine.RecommendEngine; +import org.apache.sdap.mudrod.discoveryengine.WeblogDiscoveryEngine; +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.integration.LinkageIntegration; +import org.jdom2.Document; +import org.jdom2.Element; +import org.jdom2.JDOMException; +import org.jdom2.input.SAXBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static org.apache.sdap.mudrod.main.MudrodConstants.DATA_DIR; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.nio.file.Files; +import java.util.List; +import java.util.Properties; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +/** + * Main entry point for Running the Mudrod system. Invocation of this class is + * tightly linked to the primary Mudrod configuration which can be located at + * <a href= + * "https://github.com/mudrod/mudrod/blob/master/core/src/main/resources/config.xml">config.xml</a>. 
+ */ +public class MudrodEngine { + + private static final Logger LOG = LoggerFactory.getLogger(MudrodEngine.class); + private Properties props = new Properties(); + private ESDriver es = null; + private SparkDriver spark = null; + private static final String LOG_INGEST = "logIngest"; + private static final String META_INGEST = "metaIngest"; + private static final String FULL_INGEST = "fullIngest"; + private static final String PROCESSING = "processingWithPreResults"; + private static final String ES_HOST = "esHost"; + private static final String ES_TCP_PORT = "esTCPPort"; + private static final String ES_HTTP_PORT = "esPort"; + + /** + * Public constructor for this class. + */ + public MudrodEngine() { + // default constructor + } + + /** + * Start the {@link ESDriver}. Should only be called after call to + * {@link MudrodEngine#loadConfig()} + * + * @return fully provisioned {@link ESDriver} + */ + public ESDriver startESDriver() { + return new ESDriver(props); + } + + /** + * Start the {@link SparkDriver}. Should only be called after call to + * {@link MudrodEngine#loadConfig()} + * + * @return fully provisioned {@link SparkDriver} + */ + public SparkDriver startSparkDriver() { + return new SparkDriver(props); + } + + /** + * Retreive the Mudrod configuration as a Properties Map containing K, V of + * type String. + * + * @return a {@link java.util.Properties} object + */ + public Properties getConfig() { + return props; + } + + /** + * Retreive the Mudrod {@link ESDriver} + * + * @return the {@link ESDriver} instance. + */ + public ESDriver getESDriver() { + return this.es; + } + + /** + * Set the Elasticsearch driver for MUDROD + * + * @param es + * an ES driver instance + */ + public void setESDriver(ESDriver es) { + this.es = es; + } + + private InputStream locateConfig() { + + String configLocation = System.getenv(MudrodConstants.MUDROD_CONFIG) == null ? 
"" : System.getenv(MudrodConstants.MUDROD_CONFIG); + File configFile = new File(configLocation); + + try { + InputStream configStream = new FileInputStream(configFile); + LOG.info("Loaded config file from " + configFile.getAbsolutePath()); + return configStream; + } catch (IOException e) { + LOG.info("File specified by environment variable " + MudrodConstants.MUDROD_CONFIG + "=\'" + configLocation + "\' could not be loaded. " + e.getMessage()); + } + + InputStream configStream = MudrodEngine.class.getClassLoader().getResourceAsStream("config.xml"); + + if (configStream != null) { + LOG.info("Loaded config file from {}", MudrodEngine.class.getClassLoader().getResource("config.xml").getPath()); + } + + return configStream; + } + + /** + * Load the configuration provided at <a href= + * "https://github.com/mudrod/mudrod/blob/master/core/src/main/resources/config.xml">config.xml</a>. + * + * @return a populated {@link java.util.Properties} object. + */ + public Properties loadConfig() { + SAXBuilder saxBuilder = new SAXBuilder(); + + InputStream configStream = locateConfig(); + + Document document; + try { + document = saxBuilder.build(configStream); + Element rootNode = document.getRootElement(); + List<Element> paraList = rootNode.getChildren("para"); + + for (int i = 0; i < paraList.size(); i++) { + Element paraNode = paraList.get(i); + String attributeName = paraNode.getAttributeValue("name"); + if (MudrodConstants.SVM_SGD_MODEL.equals(attributeName)) { + props.put(attributeName, decompressSVMWithSGDModel(paraNode.getTextTrim())); + } else { + props.put(attributeName, paraNode.getTextTrim()); + } + } + } catch (JDOMException | IOException e) { + LOG.error("Exception whilst retrieving or processing XML contained within 'config.xml'!", e); + } + return getConfig(); + + } + + private String decompressSVMWithSGDModel(String archiveName) throws IOException { + + URL scmArchive = getClass().getClassLoader().getResource(archiveName); + if (scmArchive == null) { + throw 
new IOException("Unable to locate " + archiveName + " as a classpath resource."); + } + File tempDir = Files.createTempDirectory("mudrod").toFile(); + assert tempDir.setWritable(true); + File archiveFile = new File(tempDir, archiveName); + FileUtils.copyURLToFile(scmArchive, archiveFile); + + // Decompress archive + int BUFFER_SIZE = 512000; + ZipInputStream zipIn = new ZipInputStream(new FileInputStream(archiveFile)); + ZipEntry entry; + while ((entry = zipIn.getNextEntry()) != null) { + File f = new File(tempDir, entry.getName()); + // If the entry is a directory, create the directory. + if (entry.isDirectory() && !f.exists()) { + boolean created = f.mkdirs(); + if (!created) { + LOG.error("Unable to create directory '{}', during extraction of archive contents.", f.getAbsolutePath()); + } + } else if (!entry.isDirectory()) { + boolean created = f.getParentFile().mkdirs(); + if (!created && !f.getParentFile().exists()) { + LOG.error("Unable to create directory '{}', during extraction of archive contents.", f.getParentFile().getAbsolutePath()); + } + int count; + byte data[] = new byte[BUFFER_SIZE]; + FileOutputStream fos = new FileOutputStream(new File(tempDir, entry.getName()), false); + try (BufferedOutputStream dest = new BufferedOutputStream(fos, BUFFER_SIZE)) { + while ((count = zipIn.read(data, 0, BUFFER_SIZE)) != -1) { + dest.write(data, 0, count); + } + } + } + } + + return new File(tempDir, StringUtils.removeEnd(archiveName, ".zip")).toURI().toString(); + } + + /** + * Preprocess and process logs {@link DiscoveryEngineAbstract} implementations + * for weblog + */ + public void startLogIngest() { + DiscoveryEngineAbstract wd = new WeblogDiscoveryEngine(props, es, spark); + wd.preprocess(); + wd.process(); + LOG.info("*****************logs have been ingested successfully******************"); + } + + /** + * updating and analysing metadata to metadata similarity results + */ + public void startMetaIngest() { + DiscoveryEngineAbstract md = new 
MetadataDiscoveryEngine(props, es, spark); + md.preprocess(); + md.process(); + + DiscoveryEngineAbstract recom = new RecommendEngine(props, es, spark); + recom.preprocess(); + recom.process(); + LOG.info("Metadata has been ingested successfully."); + } + + public void startFullIngest() { + DiscoveryEngineAbstract wd = new WeblogDiscoveryEngine(props, es, spark); + wd.preprocess(); + wd.process(); + + DiscoveryEngineAbstract md = new MetadataDiscoveryEngine(props, es, spark); + md.preprocess(); + md.process(); + + DiscoveryEngineAbstract recom = new RecommendEngine(props, es, spark); + recom.preprocess(); + recom.process(); + LOG.info("Full ingest has finished successfully."); + } + + /** + * Only preprocess various {@link DiscoveryEngineAbstract} implementations for + * weblog, ontology and metadata, linkage discovery and integration. + */ + public void startProcessing() { + DiscoveryEngineAbstract wd = new WeblogDiscoveryEngine(props, es, spark); + wd.process(); + + DiscoveryEngineAbstract od = new OntologyDiscoveryEngine(props, es, spark); + od.preprocess(); + od.process(); + + DiscoveryEngineAbstract md = new MetadataDiscoveryEngine(props, es, spark); + md.preprocess(); + md.process(); + + LinkageIntegration li = new LinkageIntegration(props, es, spark); + li.execute(); + + DiscoveryEngineAbstract recom = new RecommendEngine(props, es, spark); + recom.process(); + } + + /** + * Close the connection to the {@link ESDriver} instance. + */ + public void end() { + if (es != null) { + es.close(); + } + } + + /** + * Main program invocation. Accepts one argument denoting location (on disk) + * to a log file which is to be ingested. Help will be provided if invoked + * with incorrect parameters. + * + * @param args + * {@link java.lang.String} array contaning correct parameters. 
+ */ + public static void main(String[] args) { + // boolean options + Option helpOpt = new Option("h", "help", false, "show this help message"); + + // log ingest (preprocessing + processing) + Option logIngestOpt = new Option("l", LOG_INGEST, false, "begin log ingest"); + // metadata ingest (preprocessing + processing) + Option metaIngestOpt = new Option("m", META_INGEST, false, "begin metadata ingest"); + // ingest both log and metadata + Option fullIngestOpt = new Option("f", FULL_INGEST, false, "begin full ingest Mudrod workflow"); + // processing only, assuming that preprocessing results is in dataDir + Option processingOpt = new Option("p", PROCESSING, false, "begin processing with preprocessing results"); + + // argument options + Option dataDirOpt = OptionBuilder.hasArg(true).withArgName("/path/to/data/directory").hasArgs(1).withDescription("the data directory to be processed by Mudrod").withLongOpt("dataDirectory") + .isRequired().create(DATA_DIR); + + Option esHostOpt = OptionBuilder.hasArg(true).withArgName("host_name").hasArgs(1).withDescription("elasticsearch cluster unicast host").withLongOpt("elasticSearchHost").isRequired(false) + .create(ES_HOST); + + Option esTCPPortOpt = OptionBuilder.hasArg(true).withArgName("port_num").hasArgs(1).withDescription("elasticsearch transport TCP port").withLongOpt("elasticSearchTransportTCPPort") + .isRequired(false).create(ES_TCP_PORT); + + Option esPortOpt = OptionBuilder.hasArg(true).withArgName("port_num").hasArgs(1).withDescription("elasticsearch HTTP/REST port").withLongOpt("elasticSearchHTTPPort").isRequired(false) + .create(ES_HTTP_PORT); + + // create the options + Options options = new Options(); + options.addOption(helpOpt); + options.addOption(logIngestOpt); + options.addOption(metaIngestOpt); + options.addOption(fullIngestOpt); + options.addOption(processingOpt); + options.addOption(dataDirOpt); + options.addOption(esHostOpt); + options.addOption(esTCPPortOpt); + options.addOption(esPortOpt); + + 
CommandLineParser parser = new GnuParser(); + try { + CommandLine line = parser.parse(options, args); + String processingType = null; + + if (line.hasOption(LOG_INGEST)) { + processingType = LOG_INGEST; + } else if (line.hasOption(PROCESSING)) { + processingType = PROCESSING; + } else if (line.hasOption(META_INGEST)) { + processingType = META_INGEST; + } else if (line.hasOption(FULL_INGEST)) { + processingType = FULL_INGEST; + } + + String dataDir = line.getOptionValue(DATA_DIR).replace("\\", "/"); + if (!dataDir.endsWith("/")) { + dataDir += "/"; + } + + MudrodEngine me = new MudrodEngine(); + me.loadConfig(); + me.props.put(DATA_DIR, dataDir); + + if (line.hasOption(ES_HOST)) { + String esHost = line.getOptionValue(ES_HOST); + me.props.put(MudrodConstants.ES_UNICAST_HOSTS, esHost); + } + + if (line.hasOption(ES_TCP_PORT)) { + String esTcpPort = line.getOptionValue(ES_TCP_PORT); + me.props.put(MudrodConstants.ES_TRANSPORT_TCP_PORT, esTcpPort); + } + + if (line.hasOption(ES_HTTP_PORT)) { + String esHttpPort = line.getOptionValue(ES_HTTP_PORT); + me.props.put(MudrodConstants.ES_HTTP_PORT, esHttpPort); + } + + me.es = new ESDriver(me.getConfig()); + me.spark = new SparkDriver(me.getConfig()); + loadFullConfig(me, dataDir); + if (processingType != null) { + switch (processingType) { + case PROCESSING: + me.startProcessing(); + break; + case LOG_INGEST: + me.startLogIngest(); + break; + case META_INGEST: + me.startMetaIngest(); + break; + case FULL_INGEST: + me.startFullIngest(); + break; + default: + break; + } + } + me.end(); + } catch (Exception e) { + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp("MudrodEngine: 'dataDir' argument is mandatory. 
" + "User must also provide an ingest method.", options, true); + LOG.error("Error whilst parsing command line.", e); + } + } + + private static void loadFullConfig(MudrodEngine me, String dataDir) { + //TODO all of the properties defined below, which are determined are + //runtime need to be added to MudrodConstants.java and referenced + //accordingly and consistently from Properties.getProperty(MudrodConstant...); + me.props.put("ontologyInputDir", dataDir + "SWEET_ocean/"); + me.props.put("oceanTriples", dataDir + "Ocean_triples.csv"); + me.props.put("userHistoryMatrix", dataDir + "UserHistoryMatrix.csv"); + me.props.put("clickstreamMatrix", dataDir + "ClickstreamMatrix.csv"); + me.props.put("metadataMatrix", dataDir + "MetadataMatrix.csv"); + me.props.put("clickstreamSVDMatrix_tmp", dataDir + "clickstreamSVDMatrix_tmp.csv"); + me.props.put("metadataSVDMatrix_tmp", dataDir + "metadataSVDMatrix_tmp.csv"); + me.props.put("raw_metadataPath", dataDir + me.props.getProperty(MudrodConstants.RAW_METADATA_TYPE)); + + me.props.put("jtopia", dataDir + "jtopiaModel"); + me.props.put("metadata_term_tfidf_matrix", dataDir + "metadata_term_tfidf.csv"); + me.props.put("metadata_word_tfidf_matrix", dataDir + "metadata_word_tfidf.csv"); + me.props.put("session_metadata_Matrix", dataDir + "metadata_session_coocurrence_matrix.csv"); + + me.props.put("metadataOBCode", dataDir + "MetadataOHCode"); + me.props.put("metadata_topic", dataDir + "metadata_topic"); + me.props.put("metadata_topic_matrix", dataDir + "metadata_topic_matrix.csv"); + } + + /** + * Obtain the spark implementation. + * + * @return the {@link SparkDriver} + */ + public SparkDriver getSparkDriver() { + return this.spark; + } + + /** + * Set the {@link SparkDriver} + * + * @param sparkDriver + * a configured {@link SparkDriver} + */ + public void setSparkDriver(SparkDriver sparkDriver) { + this.spark = sparkDriver; + + } +}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/main/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/main/package-info.java b/core/src/main/java/org/apache/sdap/mudrod/main/package-info.java new file mode 100644 index 0000000..8a3af67 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/main/package-info.java @@ -0,0 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * This package includes Main entry point for Running the Mudrod system. + */ +package org.apache.sdap.mudrod.main; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/metadata/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/metadata/package-info.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/package-info.java new file mode 100644 index 0000000..b7b6258 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/package-info.java @@ -0,0 +1,18 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * This package includes metadata pre-processing, processing, and data structure + * classes. + */ +package org.apache.sdap.mudrod.metadata; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/ApiHarvester.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/ApiHarvester.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/ApiHarvester.java new file mode 100644 index 0000000..7b8b5c1 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/ApiHarvester.java @@ -0,0 +1,178 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.metadata.pre; + +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; + +import org.apache.commons.io.IOUtils; +import org.apache.sdap.mudrod.discoveryengine.DiscoveryStepAbstract; +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.main.MudrodConstants; +import org.apache.sdap.mudrod.utils.HttpRequest; +import org.elasticsearch.action.index.IndexRequest; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.util.Properties; + +/** + * ClassName: ApiHarvester Function: Harvest metadata from PO.DAACweb service. + */ +public class ApiHarvester extends DiscoveryStepAbstract { + + private static final long serialVersionUID = 1L; + private static final Logger LOG = LoggerFactory.getLogger(ApiHarvester.class); + + /** + * Creates a new instance of ApiHarvester. + * + * @param props the Mudrod configuration + * @param es the Elasticsearch drive + * @param spark the spark driver + */ + public ApiHarvester(Properties props, ESDriver es, SparkDriver spark) { + super(props, es, spark); + } + + @Override + public Object execute() { + LOG.info("Starting Metadata harvesting."); + startTime = System.currentTimeMillis(); + //remove old metadata from ES + es.deleteType(props.getProperty(MudrodConstants.ES_INDEX_NAME), props.getProperty(MudrodConstants.RAW_METADATA_TYPE)); + //harvest new metadata using PO.DAAC web services + harvestMetadatafromWeb(); + es.createBulkProcessor(); + addMetadataMapping(); + importToES(); + es.destroyBulkProcessor(); + endTime = System.currentTimeMillis(); + es.refreshIndex(); + LOG.info("Metadata harvesting completed. Time elapsed: {}", (endTime - startTime) / 1000); + return null; + } + + /** + * addMetadataMapping: Add mapping to index metadata in Elasticsearch. 
Please + * invoke this method before import metadata to Elasticsearch. + */ + public void addMetadataMapping() { + String mappingJson = "{\r\n \"dynamic_templates\": " + "[\r\n " + "{\r\n \"strings\": " + "{\r\n \"match_mapping_type\": \"string\"," + + "\r\n \"mapping\": {\r\n \"type\": \"text\"," + "\r\n \"fielddata\": true," + "\r\n \"analyzer\": \"english\"," + + "\r\n \"fields\": {\r\n \"raw\": {" + "\r\n \"type\": \"string\"," + "\r\n \"index\": \"not_analyzed\"" + "\r\n }" + + "\r\n }\r\n " + "\r\n }" + "\r\n }\r\n }\r\n ]\r\n}"; + + es.getClient().admin().indices().preparePutMapping(props.getProperty(MudrodConstants.ES_INDEX_NAME)).setType(props.getProperty(MudrodConstants.RAW_METADATA_TYPE)).setSource(mappingJson).execute() + .actionGet(); + } + + /** + * importToES: Index metadata into elasticsearch from local file directory. + * Please make sure metadata have been harvest from web service before + * invoking this method. + */ + private void importToES() { + File directory = new File(props.getProperty(MudrodConstants.RAW_METADATA_PATH)); + if(!directory.exists()) + directory.mkdir(); + File[] fList = directory.listFiles(); + for (File file : fList) { + InputStream is; + try { + is = new FileInputStream(file); + importSingleFileToES(is); + } catch (FileNotFoundException e) { + LOG.error("Error finding file!", e); + } + + } + } + + private void importSingleFileToES(InputStream is) { + try { + String jsonTxt = IOUtils.toString(is); + JsonParser parser = new JsonParser(); + JsonElement item = parser.parse(jsonTxt); + IndexRequest ir = new IndexRequest(props.getProperty(MudrodConstants.ES_INDEX_NAME), props.getProperty(MudrodConstants.RAW_METADATA_TYPE)).source(item.toString()); + es.getBulkProcessor().add(ir); + } catch (IOException e) { + LOG.error("Error indexing metadata record!", e); + } + } + + /** + * harvestMetadatafromWeb: Harvest metadata from PO.DAAC web service. 
+ */ + private void harvestMetadatafromWeb() { + LOG.info("Metadata download started."); + int startIndex = 0; + int doc_length = 0; + JsonParser parser = new JsonParser(); + do { + String searchAPI = "https://podaac.jpl.nasa.gov/api/dataset?startIndex=" + Integer.toString(startIndex) + "&entries=10&sortField=Dataset-AllTimePopularity&sortOrder=asc&id=&value=&search="; + HttpRequest http = new HttpRequest(); + String response = http.getRequest(searchAPI); + + JsonElement json = parser.parse(response); + JsonObject responseObject = json.getAsJsonObject(); + JsonArray docs = responseObject.getAsJsonObject("response").getAsJsonArray("docs"); + + doc_length = docs.size(); + + File file = new File(props.getProperty(MudrodConstants.RAW_METADATA_PATH)); + if (!file.exists()) { + if (file.mkdir()) { + LOG.info("Directory is created!"); + } else { + LOG.error("Failed to create directory!"); + } + } + for (int i = 0; i < doc_length; i++) { + JsonElement item = docs.get(i); + int docId = startIndex + i; + File itemfile = new File(props.getProperty(MudrodConstants.RAW_METADATA_PATH) + "/" + docId + ".json"); + + try (FileWriter fw = new FileWriter(itemfile.getAbsoluteFile()); BufferedWriter bw = new BufferedWriter(fw);) { + itemfile.createNewFile(); + bw.write(item.toString()); + } catch (IOException e) { + LOG.error("Error writing metadata to local file!", e); + } + } + + startIndex += 10; + + try { + Thread.sleep(100); + } catch (InterruptedException e) { + LOG.error("Error entering Elasticsearch Mappings!", e); + Thread.currentThread().interrupt(); + } + + } while (doc_length != 0); + + LOG.info("Metadata downloading finished"); + } + + @Override + public Object execute(Object o) { + return null; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/MatrixGenerator.java ---------------------------------------------------------------------- diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/MatrixGenerator.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/MatrixGenerator.java new file mode 100644 index 0000000..63565b2 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/MatrixGenerator.java @@ -0,0 +1,85 @@
/*
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.sdap.mudrod.metadata.pre;

import org.apache.sdap.mudrod.discoveryengine.DiscoveryStepAbstract;
import org.apache.sdap.mudrod.driver.ESDriver;
import org.apache.sdap.mudrod.driver.SparkDriver;
import org.apache.sdap.mudrod.main.MudrodConstants;
import org.apache.sdap.mudrod.metadata.structure.MetadataExtractor;
import org.apache.sdap.mudrod.utils.LabeledRowMatrix;
import org.apache.sdap.mudrod.utils.MatrixUtil;
import org.apache.spark.api.java.JavaPairRDD;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;
import java.util.Properties;

/**
 * Generate term-metadata matrix from original metadata. Each row in
 * the matrix is corresponding to a term, and each column is a metadata.
 */
public class MatrixGenerator extends DiscoveryStepAbstract {

  private static final long serialVersionUID = 1L;
  private static final Logger LOG = LoggerFactory.getLogger(MatrixGenerator.class);

  /**
   * Creates a new instance of MatrixGenerator.
   *
   * @param props the Mudrod configuration
   * @param es the Elasticsearch drive
   * @param spark the spark drive
   */
  public MatrixGenerator(Properties props, ESDriver es, SparkDriver spark) {
    super(props, es, spark);
  }

  /**
   * Generate a csv which is a term-metadata matrix genetrated from original
   * metadata.
   *
   * @see DiscoveryStepAbstract#execute()
   */
  @Override
  public Object execute() {
    LOG.info("Metadata matrix started");
    startTime = System.currentTimeMillis();

    String metadataMatrixFile = props.getProperty("metadataMatrix");
    try {
      MetadataExtractor extractor = new MetadataExtractor();
      JavaPairRDD<String, List<String>> metadataTermsRDD = extractor.loadMetadata(this.es, this.spark.sc, props.getProperty(MudrodConstants.ES_INDEX_NAME), props.getProperty(MudrodConstants.RAW_METADATA_TYPE));
      LabeledRowMatrix wordDocMatrix = MatrixUtil.createWordDocMatrix(metadataTermsRDD);
      MatrixUtil.exportToCSV(wordDocMatrix.rowMatrix, wordDocMatrix.rowkeys, wordDocMatrix.colkeys, metadataMatrixFile);

    } catch (Exception e) {
      // Fixed: the old call used a '{}' placeholder with the exception as the
      // only argument ("generaion" typo too), which dropped the stack trace.
      LOG.error("Error during metadata matrix generation.", e);
    }

    endTime = System.currentTimeMillis();
    LOG.info("Metadata matrix finished time elapsed: {}s", (endTime - startTime) / 1000);
    return null;
  }

  @Override
  public Object execute(Object o) {
    // Not used by this step.
    return null;
  }

} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/package-info.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/package-info.java new file mode 100644 index 0000000..ffecbc8 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/package-info.java @@ -0,0 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * This package includes metadata pre-processing functions. + */ +package org.apache.sdap.mudrod.metadata.pre; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/metadata/process/MetadataAnalyzer.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/metadata/process/MetadataAnalyzer.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/process/MetadataAnalyzer.java new file mode 100644 index 0000000..80e23c1 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/process/MetadataAnalyzer.java @@ -0,0 +1,88 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.metadata.process; + +import org.apache.sdap.mudrod.discoveryengine.DiscoveryStepAbstract; +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.semantics.SVDAnalyzer; +import org.apache.sdap.mudrod.utils.LinkageTriple; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.util.List; +import java.util.Properties; + +/** + * ClassName: MetadataAnalyzer + * Function: Calculate semantic relationship of vocabularies extracted from + * metadata. + */ +public class MetadataAnalyzer extends DiscoveryStepAbstract implements Serializable { + + /** + * + */ + private static final long serialVersionUID = 1L; + private static final Logger LOG = LoggerFactory.getLogger(MetadataAnalyzer.class); + + /** + * Creates a new instance of MetadataAnalyzer. + * + * @param props the Mudrod configuration + * @param es the Elasticsearch drive + * @param spark the spark drive + */ + public MetadataAnalyzer(Properties props, ESDriver es, SparkDriver spark) { + super(props, es, spark); + } + + @Override + public Object execute(Object o) { + return null; + } + + /** + * Calculate semantic relationship of vocabularies from a csv file which is a + * term-metadata matrix. 
+ * + * @see DiscoveryStepAbstract#execute() + */ + @Override + public Object execute() { + try { + LOG.info("*****************Metadata Analyzer starts******************"); + startTime = System.currentTimeMillis(); + + SVDAnalyzer analyzer = new SVDAnalyzer(props, es, spark); + int svdDimension = Integer.parseInt(props.getProperty("metadataSVDDimension")); + String metadataMatrixFile = props.getProperty("metadataMatrix"); + String svdMatrixFileName = props.getProperty("metadataSVDMatrix_tmp"); + + analyzer.getSVDMatrix(metadataMatrixFile, svdDimension, svdMatrixFileName); + List<LinkageTriple> triples = analyzer.calTermSimfromMatrix(svdMatrixFileName); + + analyzer.saveToES(triples, props.getProperty("indexName"), props.getProperty("metadataLinkageType")); + + } catch (Exception e) { + e.printStackTrace(); + } + + endTime = System.currentTimeMillis(); + es.refreshIndex(); + LOG.info("*****************Metadata Analyzer ends******************Took {}s", (endTime - startTime) / 1000); + return null; + } +} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/metadata/process/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/metadata/process/package-info.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/process/package-info.java new file mode 100644 index 0000000..a0c0799 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/process/package-info.java @@ -0,0 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * This package includes metadata processing classes. + */ +package org.apache.sdap.mudrod.metadata.process; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/MetadataExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/MetadataExtractor.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/MetadataExtractor.java new file mode 100644 index 0000000..379d5b9 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/MetadataExtractor.java @@ -0,0 +1,145 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.metadata.structure; + +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.api.java.function.PairFunction; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.common.unit.TimeValue; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.search.SearchHit; +import scala.Tuple2; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; + +public class MetadataExtractor implements Serializable { + + /** + * + */ + private static final long serialVersionUID = 1L; + + public MetadataExtractor() { + } + + /** + * loadMetadata:Load all metadata from Elasticsearch and convert them to + * pairRDD Please make sure metadata has been already harvested from web + * service and stored in Elasticsearch. + * + * @param es an Elasticsearch client node instance + * @param sc spark context + * @param index index name of log processing application + * @param type metadata type name + * @return PairRDD, in each pair key is metadata short name and value is term + * list extracted from metadata variables. + */ + public JavaPairRDD<String, List<String>> loadMetadata(ESDriver es, JavaSparkContext sc, String index, String type) { + List<PODAACMetadata> metadatas = this.loadMetadataFromES(es, index, type); + JavaPairRDD<String, List<String>> metadataTermsRDD = this.buildMetadataRDD(es, sc, index, metadatas); + return metadataTermsRDD; + } + + /** + * loadMetadataFromES: Load all metadata from Elasticsearch. 
+ * + * @param es an Elasticsearch client node instance + * @param index index name of log processing application + * @param type metadata type name + * @return metadata list + */ + protected List<PODAACMetadata> loadMetadataFromES(ESDriver es, String index, String type) { + + List<PODAACMetadata> metadatas = new ArrayList<PODAACMetadata>(); + SearchResponse scrollResp = es.getClient().prepareSearch(index).setTypes(type).setQuery(QueryBuilders.matchAllQuery()).setScroll(new TimeValue(60000)).setSize(100).execute().actionGet(); + + while (true) { + for (SearchHit hit : scrollResp.getHits().getHits()) { + Map<String, Object> result = hit.getSource(); + String shortname = (String) result.get("Dataset-ShortName"); + List<String> topic = (List<String>) result.get("DatasetParameter-Topic"); + List<String> term = (List<String>) result.get("DatasetParameter-Term"); + List<String> keyword = (List<String>) result.get("Dataset-Metadata"); + List<String> variable = (List<String>) result.get("DatasetParameter-Variable"); + List<String> longname = (List<String>) result.get("DatasetProject-Project-LongName"); + + List<String> region = (List<String>) result.get("DatasetRegion-Region"); + + PODAACMetadata metadata = null; + try { + metadata = new PODAACMetadata(shortname, longname, es.customAnalyzing(index, topic), es.customAnalyzing(index, term), es.customAnalyzing(index, variable), es.customAnalyzing(index, keyword), + es.customAnalyzing(index, region)); + } catch (InterruptedException | ExecutionException e) { + e.printStackTrace(); + + } + metadatas.add(metadata); + } + scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet(); + if (scrollResp.getHits().getHits().length == 0) { + break; + } + } + + return metadatas; + } + + /** + * buildMetadataRDD: Convert metadata list to JavaPairRDD + * + * @param es an Elasticsearch client node instance + * @param sc spark context + * @param index index name of log 
processing application + * @param metadatas metadata list + * @return PairRDD, in each pair key is metadata short name and value is term + * list extracted from metadata variables. + */ + protected JavaPairRDD<String, List<String>> buildMetadataRDD(ESDriver es, JavaSparkContext sc, String index, List<PODAACMetadata> metadatas) { + JavaRDD<PODAACMetadata> metadataRDD = sc.parallelize(metadatas); + JavaPairRDD<String, List<String>> metadataTermsRDD = metadataRDD.mapToPair(new PairFunction<PODAACMetadata, String, List<String>>() { + /** + * + */ + private static final long serialVersionUID = 1L; + + @Override + public Tuple2<String, List<String>> call(PODAACMetadata metadata) throws Exception { + return new Tuple2<String, List<String>>(metadata.getShortName(), metadata.getAllTermList()); + } + }).reduceByKey(new Function2<List<String>, List<String>, List<String>>() { + /** + * + */ + private static final long serialVersionUID = 1L; + + @Override + public List<String> call(List<String> v1, List<String> v2) throws Exception { + List<String> list = new ArrayList<String>(); + list.addAll(v1); + list.addAll(v2); + return list; + } + }); + + return metadataTermsRDD; + } +} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/PODAACMetadata.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/PODAACMetadata.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/PODAACMetadata.java new file mode 100644 index 0000000..de3edf7 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/PODAACMetadata.java @@ -0,0 +1,337 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. 
/**
 * ClassName: PODAACMetadata Function: PODAACMetadata setter and getter methods.
 * Value object holding the vocabulary fields (terms, keywords, topics,
 * variables, regions, ...) of a single PO.DAAC metadata record.
 */
public class PODAACMetadata implements Serializable {

  private static final long serialVersionUID = 1L;
  // shortname: data set short name
  private String shortname;
  // abstractStr: data set abstract
  private String abstractStr;
  // isoTopic: data set topic
  private String isoTopic;
  // sensor: sensor
  private String sensor;
  // source: data source
  private String source;
  // project: data project
  private String project;
  // hasAbstarct: whether data set has abstract
  // (name typo kept for backward compatibility with existing callers)
  boolean hasAbstarct;

  // longnameList: data set long name list
  private List<String> longnameList;
  // keywordList: data set key word list
  private List<String> keywordList;
  // termList: data set term list
  private List<String> termList;
  // topicList: data set topic list
  private List<String> topicList;
  // variableList: data set variable list
  private List<String> variableList;
  // abstractList: data set abstract term list
  private List<String> abstractList;
  // isotopicList: data set iso topic list
  private List<String> isotopicList;
  // sensorList: data set sensor list
  private List<String> sensorList;
  // sourceList: data set source list
  private List<String> sourceList;
  // projectList: data set project list
  private List<String> projectList;
  // regionList: data set region list
  private List<String> regionList;

  public PODAACMetadata() {
    // Default constructor
  }

  /**
   * Creates a new instance of PODAACMetadata.
   *
   * @param shortname data set short name
   * @param longname  data set long name
   * @param topics    data set topics
   * @param terms     data set terms
   * @param variables data set variables
   * @param keywords  data set keywords
   * @param region    list of regions
   */
  public PODAACMetadata(String shortname, List<String> longname, List<String> topics, List<String> terms, List<String> variables, List<String> keywords, List<String> region) {
    this.shortname = shortname;
    this.longnameList = longname;
    this.keywordList = keywords;
    this.termList = terms;
    this.topicList = topics;
    this.variableList = variables;
    this.regionList = region;
  }

  /**
   * setTerms: split a comma-separated term string into the term list.
   *
   * @param termstr data set terms
   */
  public void setTerms(String termstr) {
    this.splitString(termstr, this.termList);
  }

  /**
   * setKeywords: split a comma-separated keyword string into the keyword list.
   *
   * @param keywords data set keywords
   */
  public void setKeywords(String keywords) {
    this.splitString(keywords, this.keywordList);
  }

  /**
   * setTopicList: split a comma-separated topic string into the topic list.
   *
   * @param topicStr data set topics
   */
  public void setTopicList(String topicStr) {
    this.splitString(topicStr, this.topicList);
  }

  /**
   * setVaraliableList: split a comma-separated variable string into the
   * variable list. (Method name typo kept for backward compatibility.)
   *
   * @param varilableStr data set variables
   */
  public void setVaraliableList(String varilableStr) {
    this.splitString(varilableStr, this.variableList);
  }

  /**
   * setProjectList: split a comma-separated project string into the project list.
   *
   * @param project data set projects
   */
  public void setProjectList(String project) {
    this.splitString(project, this.projectList);
  }

  /**
   * setSourceList: split a comma-separated source string into the source list.
   *
   * @param source data set sources
   */
  public void setSourceList(String source) {
    this.splitString(source, this.sourceList);
  }

  /**
   * setSensorList: split a comma-separated sensor string into the sensor list.
   *
   * @param sensor data set sensors
   */
  public void setSensorList(String sensor) {
    this.splitString(sensor, this.sensorList);
  }

  /**
   * setISOTopicList: split a comma-separated ISO topic string into the iso topic list.
   *
   * @param isoTopic data set iso topics
   */
  public void setISOTopicList(String isoTopic) {
    this.splitString(isoTopic, this.isotopicList);
  }

  /**
   * getKeywordList: get key word list of data set
   *
   * @return data set keyword list
   */
  public List<String> getKeywordList() {
    return this.keywordList;
  }

  /**
   * getTermList: get term list of data set
   *
   * @return data set term list
   */
  public List<String> getTermList() {
    return this.termList;
  }

  /**
   * getShortName: get short name of data set
   *
   * @return data set short name
   */
  public String getShortName() {
    return this.shortname;
  }

  /**
   * getKeyword: get keywords of data set as one comma-joined string
   *
   * @return data set keyword string
   */
  public String getKeyword() {
    return String.join(",", this.keywordList);
  }

  /**
   * getTerm: get terms of data set as one comma-joined string
   *
   * @return data set term string
   */
  public String getTerm() {
    return String.join(",", this.termList);
  }

  /**
   * getTopic: get topics of data set as one comma-joined string
   *
   * @return data set topic string
   */
  public String getTopic() {
    return String.join(",", this.topicList);
  }

  /**
   * getVariable: get variables of data set as one comma-joined string
   *
   * @return data set variable string
   */
  public String getVariable() {
    return String.join(",", this.variableList);
  }

  /**
   * getAbstract: get abstract of data set
   *
   * @return data set abstract
   */
  public String getAbstract() {
    return this.abstractStr;
  }

  /**
   * getProject: get project of data set
   *
   * @return data set project string
   */
  public String getProject() {
    return this.project;
  }

  /**
   * getSource: get source of data set
   *
   * @return data set source string
   */
  public String getSource() {
    return this.source;
  }

  /**
   * getSensor: get sensor of data set
   *
   * @return data set sensor string
   */
  public String getSensor() {
    return this.sensor;
  }

  /**
   * getISOTopic: get iso topic of data set
   *
   * @return data set ISO topic string
   */
  public String getISOTopic() {
    return this.isoTopic;
  }

  /**
   * getAllTermList: concatenate term, keyword, topic, variable and region
   * lists into one list. Null or empty member lists are skipped.
   *
   * @return combined term list
   */
  public List<String> getAllTermList() {
    List<String> allterms = new ArrayList<>();

    if (this.termList != null && !this.termList.isEmpty()) {
      allterms.addAll(this.termList);
    }

    if (this.keywordList != null && !this.keywordList.isEmpty()) {
      allterms.addAll(this.keywordList);
    }

    if (this.topicList != null && !this.topicList.isEmpty()) {
      allterms.addAll(this.topicList);
    }

    if (this.variableList != null && !this.variableList.isEmpty()) {
      allterms.addAll(this.variableList);
    }

    if (this.regionList != null && !this.regionList.isEmpty()) {
      allterms.addAll(this.regionList);
    }
    return allterms;
  }

  /**
   * splitString: split a comma-separated field value into individual trimmed
   * entries, stripping surrounding double quotes and skipping empty entries.
   *
   * @param oristr original string (may be null; then nothing is added)
   * @param list   result list the entries are appended to
   */
  private void splitString(String oristr, List<String> list) {
    if (oristr == null) {
      return;
    }

    if (oristr.startsWith("\"")) {
      oristr = oristr.substring(1);
    }
    if (oristr.endsWith("\"")) {
      oristr = oristr.substring(0, oristr.length() - 1);
    }

    String[] strs = oristr.trim().split(",");
    // String.split never returns null, so the entries can be processed directly.
    for (int i = 0; i < strs.length; i++) {
      String str = strs[i].trim();
      if (str.startsWith(",") || str.startsWith("\"")) {
        str = str.substring(1);
      }
      if (str.endsWith(",") || str.endsWith("\"")) {
        str = str.substring(0, str.length() - 1);
      }
      // Bug fix: compare string CONTENT, not references (str == "" only
      // matched interned empty strings, so blank entries slipped through).
      if (str.isEmpty()) {
        continue;
      }
      list.add(str);
    }
  }
}
--git a/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/package-info.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/package-info.java new file mode 100644 index 0000000..938b4ac --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/package-info.java @@ -0,0 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * This package includes classes needed for metadata analysis + */ +package org.apache.sdap.mudrod.metadata.structure; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/ontology/Ontology.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/ontology/Ontology.java b/core/src/main/java/org/apache/sdap/mudrod/ontology/Ontology.java new file mode 100644 index 0000000..70116de --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/ontology/Ontology.java @@ -0,0 +1,67 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. 
/**
 * Base class for working with ontologies. Methods indicate the ability to
 * load and merge (e.g. merge relevant ontology subgraphs into a new subgraph
 * which can be used within Mudrod), perform subclass retrieval, synonym
 * expansion, etc.
 *
 * @author lewismc
 */
public interface Ontology {

  /**
   * Load an array of URIs which resolve to ontology resources.
   *
   * @param urls a {@link java.lang.String} array containing ontology URIs.
   */
  public void load(String[] urls);

  /**
   * Load a collection of default ontology resources.
   */
  public void load();

  /**
   * Merge relevant ontology subgraphs into a new subgraph which can be used
   * within Mudrod.
   *
   * @param o an ontology to merge with the current ontology held within
   *          Mudrod.
   */
  public void merge(Ontology o);

  /**
   * Retrieve all subclasses for a particular entity provided within the
   * search term, e.g. subclass-based query expansion.
   *
   * @param entitySearchTerm an input search term
   * @return an {@link java.util.Iterator} object containing subClass entries.
   */
  public Iterator<String> subclasses(String entitySearchTerm);

  /**
   * Retrieve all synonyms for a particular entity provided within the search
   * term, e.g. synonym-based query expansion.
   *
   * @param queryKeyPhrase a phrase to undertake synonym expansion on.
   * @return an {@link java.util.Iterator} object containing synonym entries.
   */
  public Iterator<String> synonyms(String queryKeyPhrase);

}
+ * + * @author lewismc + */ +public class OntologyFactory { + + public static final Logger LOG = LoggerFactory.getLogger(OntologyFactory.class); + + private Properties props; + + /** + * The mechanism for creating an {@link Ontology} + * implementation. + * + * @param props a populated Mudrod {@link java.util.Properties} object. + */ + public OntologyFactory(Properties props) { + this.props = props; + } + + /** + * Obtain the {@link Ontology} + * implementation for use within Mudrod. + * + * @return Returns the ontology implementation specified + * in <a href="https://github.com/mudrod/mudrod/blob/master/core/src/main/resources/config.xml"> + * config.xml</a> with configuration key + * <code>mudrod.ontology.implementation</code>. This property can also be accessed via + * {@link MudrodConstants#ONTOLOGY_IMPL}. + */ + public Ontology getOntology() { + + String ontologyImpl = this.props.getProperty(MudrodConstants.ONTOLOGY_IMPL, "Local"); + + LOG.info("Using ontology extension: {}", ontologyImpl); + Ontology ontImpl; + switch (ontologyImpl) { + case "EsipCOR": + ontImpl = new EsipCOROntology(); + break; + case "EsipPortal": + ontImpl = new EsipPortalOntology(); + break; + default: + ontImpl = new LocalOntology(); + break; + } + return ontImpl; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/ontology/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/ontology/package-info.java b/core/src/main/java/org/apache/sdap/mudrod/ontology/package-info.java new file mode 100644 index 0000000..44596e3 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/ontology/package-info.java @@ -0,0 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * This package includes ontology pre-processing and processing classes. + */ +package org.apache.sdap.mudrod.ontology; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/ontology/pre/AggregateTriples.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/ontology/pre/AggregateTriples.java b/core/src/main/java/org/apache/sdap/mudrod/ontology/pre/AggregateTriples.java new file mode 100644 index 0000000..e94d678 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/ontology/pre/AggregateTriples.java @@ -0,0 +1,225 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.ontology.pre; + +import org.apache.commons.io.FilenameUtils; +import org.apache.sdap.mudrod.discoveryengine.DiscoveryStepAbstract; +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.jdom2.Document; +import org.jdom2.Element; +import org.jdom2.JDOMException; +import org.jdom2.Namespace; +import org.jdom2.filter.ElementFilter; +import org.jdom2.input.SAXBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.Properties; + +/** + * Supports ability to extract triples (subclassOf, equivalent class) from OWL file + */ +public class AggregateTriples extends DiscoveryStepAbstract { + private static final long serialVersionUID = 1L; + private static final Logger LOG = LoggerFactory.getLogger(AggregateTriples.class); + + public AggregateTriples(Properties props, ESDriver es, SparkDriver spark) { + super(props, es, spark); + } + + /** + * Method of executing triple aggregation + */ + @Override + public Object execute() { + File file = new File(this.props.getProperty("oceanTriples")); + if (file.exists()) { + file.delete(); + } + try { + file.createNewFile(); + } catch (IOException e2) { + e2.printStackTrace(); + } + + FileWriter fw; + try { + fw = new FileWriter(file.getAbsoluteFile()); + bw = new BufferedWriter(fw); + } catch (IOException e) { + e.printStackTrace(); + } + + File[] files = new File(this.props.getProperty("ontologyInputDir")).listFiles(); + for (File file_in : files) { + String ext = FilenameUtils.getExtension(file_in.getAbsolutePath()); + if ("owl".equals(ext)) { + try { + loadxml(file_in.getAbsolutePath()); + getAllClass(); + } catch (JDOMException e1) { + e1.printStackTrace(); + } catch (IOException e1) { + e1.printStackTrace(); + } + + } + } + 
+ try { + bw.close(); + } catch (IOException e) { + e.printStackTrace(); + } + return null; + } + + public Document document; + public Element rootNode = null; + final static String owl_namespace = "http://www.w3.org/2002/07/owl#"; + final static String rdf_namespace = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; + final static String rdfs_namespace = "http://www.w3.org/2000/01/rdf-schema#"; + + BufferedWriter bw = null; + + /** + * Load OWL file into memory + * + * @param filePathName local path of OWL file + * @throws JDOMException JDOMException + * @throws IOException IOException + */ + public void loadxml(String filePathName) throws JDOMException, IOException { + SAXBuilder saxBuilder = new SAXBuilder(); + File file = new File(filePathName); + + document = saxBuilder.build(file); + rootNode = document.getRootElement(); + } + + /** + * Method of going through OWL structure + */ + public void loopxml() { + Iterator<?> processDescendants = rootNode.getDescendants(new ElementFilter()); + String text = ""; + + while (processDescendants.hasNext()) { + Element e = (Element) processDescendants.next(); + String currentName = e.getName(); + text = e.getTextTrim(); + if ("".equals(text)) { + LOG.info(currentName); + } else { + LOG.info("{} : {}", currentName, text); + } + } + } + + /** + * Method of identifying a specific child given a element name + * + * @param str element name + * @param ele parent element + * @return the element of child + */ + public Element findChild(String str, Element ele) { + Iterator<?> processDescendants = ele.getDescendants(new ElementFilter()); + String name = ""; + Element result = null; + + while (processDescendants.hasNext()) { + Element e = (Element) processDescendants.next(); + name = e.getName(); + if (name.equals(str)) { + result = e; + return result; + } + } + return result; + + } + + /** + * Method of extract triples (subclassOf, equivalent class) from OWL file + * + * @throws IOException IOException + */ + public void 
getAllClass() throws IOException { + List<?> classElements = rootNode.getChildren("Class", Namespace.getNamespace("owl", owl_namespace)); + + for (int i = 0; i < classElements.size(); i++) { + Element classElement = (Element) classElements.get(i); + String className = classElement.getAttributeValue("about", Namespace.getNamespace("rdf", rdf_namespace)); + + if (className == null) { + className = classElement.getAttributeValue("ID", Namespace.getNamespace("rdf", rdf_namespace)); + } + + List<?> subclassElements = classElement.getChildren("subClassOf", Namespace.getNamespace("rdfs", rdfs_namespace)); + for (int j = 0; j < subclassElements.size(); j++) { + Element subclassElement = (Element) subclassElements.get(j); + String subclassName = subclassElement.getAttributeValue("resource", Namespace.getNamespace("rdf", rdf_namespace)); + if (subclassName == null) { + Element allValuesFromEle = findChild("allValuesFrom", subclassElement); + if (allValuesFromEle != null) { + subclassName = allValuesFromEle.getAttributeValue("resource", Namespace.getNamespace("rdf", rdf_namespace)); + bw.write(cutString(className) + ",SubClassOf," + cutString(subclassName) + "\n"); + } + } else { + bw.write(cutString(className) + ",SubClassOf," + cutString(subclassName) + "\n"); + } + + } + + List equalClassElements = classElement.getChildren("equivalentClass", Namespace.getNamespace("owl", owl_namespace)); + for (int k = 0; k < equalClassElements.size(); k++) { + Element equalClassElement = (Element) equalClassElements.get(k); + String equalClassElementName = equalClassElement.getAttributeValue("resource", Namespace.getNamespace("rdf", rdf_namespace)); + + if (equalClassElementName != null) { + bw.write(cutString(className) + ",equivalentClass," + cutString(equalClassElementName) + "\n"); + } + } + + } + } + + /** + * Method of cleaning up a string + * + * @param str String needed to be processed + * @return the processed string + */ + public String cutString(String str) { + str = 
str.substring(str.indexOf("#") + 1); + String[] strArray = str.split("(?=[A-Z])"); + str = Arrays.toString(strArray); + return str.substring(1, str.length() - 1).replace(",", ""); + } + + @Override + public Object execute(Object o) { + return null; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/ontology/pre/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/ontology/pre/package-info.java b/core/src/main/java/org/apache/sdap/mudrod/ontology/pre/package-info.java new file mode 100644 index 0000000..3f7c87e --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/ontology/pre/package-info.java @@ -0,0 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * This package includes ontology pre-processing classes. 
+ */ +package org.apache.sdap.mudrod.ontology.pre; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/ontology/process/EsipCOROntology.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/ontology/process/EsipCOROntology.java b/core/src/main/java/org/apache/sdap/mudrod/ontology/process/EsipCOROntology.java new file mode 100644 index 0000000..45d04a8 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/ontology/process/EsipCOROntology.java @@ -0,0 +1,69 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.ontology.process; + +import java.util.Iterator; + +import org.apache.sdap.mudrod.ontology.Ontology; + +/** + * @author lewismc + */ +public class EsipCOROntology implements Ontology { + + /** + * + */ + public EsipCOROntology() { + //default constructor + } + + @Override + public void load() { + // to be completed + } + + /* (non-Javadoc) + * @see Ontology#load(java.lang.String[]) + */ + @Override + public void load(String[] urls) { + // to be completed + } + + /* (non-Javadoc) + * @see Ontology#merge(Ontology) + */ + @Override + public void merge(Ontology o) { + // to be completed + } + + /* (non-Javadoc) + * @see Ontology#subclasses(java.lang.String) + */ + @Override + public Iterator<String> subclasses(String entitySearchTerm) { + return null; + } + + /* (non-Javadoc) + * @see Ontology#synonyms(java.lang.String) + */ + @Override + public Iterator<String> synonyms(String queryKeyPhrase) { + return null; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/ontology/process/EsipPortalOntology.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/ontology/process/EsipPortalOntology.java b/core/src/main/java/org/apache/sdap/mudrod/ontology/process/EsipPortalOntology.java new file mode 100644 index 0000000..c989a29 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/ontology/process/EsipPortalOntology.java @@ -0,0 +1,72 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sdap.mudrod.ontology.process; + +import java.util.Iterator; + +import org.apache.sdap.mudrod.ontology.Ontology; + +/** + * @author lewismc + */ +public class EsipPortalOntology implements Ontology { + + /** + * + */ + public EsipPortalOntology() { + //default constructor + } + + /* (non-Javadoc) + * @see Ontology#load(java.lang.String[]) + */ + @Override + public void load(String[] urls) { + // to be completed + } + + /* (non-Javadoc) + * @see Ontology#load() + */ + @Override + public void load() { + // to be completed + } + + /* (non-Javadoc) + * @see Ontology#merge(Ontology) + */ + @Override + public void merge(Ontology o) { + // to be completed + } + + /* (non-Javadoc) + * @see Ontology#subclasses(java.lang.String) + */ + @Override + public Iterator<String> subclasses(String entitySearchTerm) { + return null; + } + + /* (non-Javadoc) + * @see Ontology#synonyms(java.lang.String) + */ + @Override + public Iterator<String> synonyms(String queryKeyPhrase) { + return null; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/ontology/process/LocalOntology.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/ontology/process/LocalOntology.java b/core/src/main/java/org/apache/sdap/mudrod/ontology/process/LocalOntology.java new file mode 100644 index 0000000..0380c07 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/ontology/process/LocalOntology.java @@ -0,0 +1,391 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sdap.mudrod.ontology.process; + +import org.apache.jena.ontology.Individual; +import org.apache.jena.ontology.OntClass; +import org.apache.jena.ontology.OntModel; +import org.apache.jena.ontology.OntModelSpec; +import org.apache.jena.ontology.OntResource; +import org.apache.jena.ontology.Restriction; +import org.apache.jena.rdf.model.AnonId; +import org.apache.jena.rdf.model.Literal; +import org.apache.jena.rdf.model.ModelFactory; +import org.apache.jena.rdf.model.Resource; +import org.apache.jena.shared.PrefixMapping; +import org.apache.sdap.mudrod.ontology.Ontology; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.PrintStream; +import java.net.MalformedURLException; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +/** + * The LocalOntology implementation enables us to work with Ontology files + * which are cached locally and available on the runtime classpath e.g. + * in <code>src/main/resource/ontology/...</code>. + * From here we can test and iterate on how use of ontology can enhance search. 
+ */ +public class LocalOntology implements Ontology { + + public static final Logger LOG = LoggerFactory.getLogger(LocalOntology.class); + + public static final String DELIMITER_SEARCHTERM = " "; + + private Map<Object, Object> searchTerms = new HashMap<>(); + private static OntologyParser parser; + private static OntModel ontologyModel; + private Ontology ontology; + private static Map<AnonId, String> mAnonIDs = new HashMap<>(); + private static int mAnonCount = 0; + private List<String> ontArrayList; + + public LocalOntology() { + //only initialize all the static variables + //if first time called to this ontology constructor + if (ontology == null) { + if (LOG.isInfoEnabled()) { + LOG.info("Creating new ontology"); + } + parser = new OwlParser(); + ontology = this; + } + if (ontologyModel == null) + ontologyModel = ModelFactory.createOntologyModel(OntModelSpec.OWL_MEM, null); + load(); + } + + /** + * Static accessor for {@link LocalOntology} + * instance implementation defined within <code>config.xml</code>. 
+ * + * @return a {@link LocalOntology} + */ + public Ontology getInstance() { + if (ontology == null) { + ontology = new LocalOntology(); + } + return ontology; + } + + /** + * Load the default <i>sweetAll.owl</i> ontology + * from <a href="https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/sweetAll.owl"> + * https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/sweetAll.owl</a> + */ + @Override + public void load() { + URL ontURL = null; + try { + ontURL = new URL("https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/sweetAll.owl"); + //ontURL = new URL("https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/reprDataProduct.owl"); + } catch (MalformedURLException e) { + LOG.error("Error when attempting to create URL resource: ", e); + } + ontArrayList = new ArrayList<>(); + try { + ontArrayList.add(ontURL.toURI().toString()); + } catch (URISyntaxException e) { + LOG.error("Error in URL syntax, please check your Ontology resource: ", e); + } + if (!ontArrayList.isEmpty()) { + load(ontArrayList.stream().toArray(String[]::new)); + } + } + + /** + * Load a string array of local URIs which refernece .owl files. + */ + @Override + public void load(String[] urls) { + for (int i = 0; i < urls.length; i++) { + String url = urls[i].trim(); + if (!"".equals(url)) + if (LOG.isInfoEnabled()) { + LOG.info("Reading and processing {}", url); + } + load(ontologyModel, url); + } + parser.parse(ontology, ontologyModel); + } + + private void load(Object m, String url) { + try { + ((OntModel) m).read(url, null, null); + LOG.info("Successfully processed {}", url); + } catch (Exception e) { + LOG.error("Failed whilst attempting to read ontology {}: Error: ", url, e); + } + } + + /** + * Get the {@link org.apache.sdap.mudrod.ontology.process.OntologyParser} + * implementation being used to process the input ontology resources. 
+ * @return an {@link org.apache.sdap.mudrod.ontology.process.OntologyParser} implementation + */ + public OntologyParser getParser() { + if (parser == null) { + parser = new OwlParser(); + } + return parser; + } + + /** + * Return the {@link org.apache.jena.ontology.OntModel} instance + * which created from input ontology resources. + * @return a constructed {@link org.apache.jena.ontology.OntModel} + */ + public static OntModel getModel() { + return ontologyModel; + } + + /** + * Return the loaded Ontology resources. + * @return a {@link java.util.List} of resources. + */ + public List<String> getLoadedOntologyResources() { + if (ontArrayList != null) { + return ontArrayList; + } else { + return new ArrayList<>(); + } + } + /** + * Not yet implemented. + */ + @Override + public void merge(Ontology o) { + // not yet implemented + } + + /** + * Retrieve all subclasses of entity(ies) hashed to searchTerm + * @param entitySearchTerm a query (keywords) for which to obtain + * subclasses. + * @return an {@link java.util.Iterator} containing the subclass as Strings. + */ + @Override + public Iterator<String> subclasses(String entitySearchTerm) { + Map<OntResource, String> classMap = retrieve(entitySearchTerm); + Map<String, String> subclasses = new HashMap<>(); + + Iterator<OntResource> iter = classMap.keySet().iterator(); + while (iter.hasNext()) { + OntResource resource = iter.next(); + + if (resource instanceof OntClass) { + //get subclasses N.B. we only get direct sub-classes e.g. direct children + //it is possible for us to navigate the entire class tree if we wish, we simply + //need to pass the .listSubClasses(true) boolean parameter. 
+ for (Iterator<?> i = ((OntClass) resource).listSubClasses(); i.hasNext();) { + OntResource subclass = (OntResource) i.next(); + for (Iterator<?> j = subclass.listLabels(null); j.hasNext();) { + Literal l = (Literal) j.next(); + subclasses.put(l.toString(), "1"); + } + } + //get individuals + for (Iterator<?> i = ((OntClass) resource).listInstances(); i.hasNext(); ) { + OntResource subclass = (OntResource) i.next(); + for (Iterator<?> j = subclass.listLabels(null); j.hasNext();) { + Literal l = (Literal) j.next(); + subclasses.put(l.toString(), "1"); + } + } + } else if (resource instanceof Individual) { + for (Iterator<?> i = resource.listSameAs(); i.hasNext();) { + OntResource subclass = (OntResource) i.next(); + for (Iterator<?> j = subclass.listLabels(null); j.hasNext();) { + Literal l = (Literal) j.next(); + subclasses.put(l.toString(), "1"); + } + } + } + } + return subclasses.keySet().iterator(); + } + + /** + * Retreives synonyms for an given phrase if the phrase + * is present in the ontology + * @param queryKeyPhrase an input string representing a phrase + * for which we wish to obtain synonyms. + * @return an {@link java.util.Iterator} containing synonyms string tokens + * or an empty if no synonyms exist for the given queryKeyPhrase. 
+ */ + @Override + public Iterator synonyms(String queryKeyPhrase) { + + Map<?, ?> classMap = retrieve(queryKeyPhrase); + + Map<Object, Object> synonyms = new HashMap<>(); + + Iterator<?> iter = classMap.keySet().iterator(); + while (iter.hasNext()) { + OntResource resource = (OntResource) iter.next(); + + //listLabels + for (Iterator<?> i = resource.listLabels(null); i.hasNext();) { + Literal l = (Literal) i.next(); + synonyms.put(l.toString(), "1"); + } + + if (resource instanceof Individual) { + //get all individuals same as this one + for (Iterator<?> i = resource.listSameAs(); i.hasNext();) { + Individual individual = (Individual) i.next(); + //add labels + for (Iterator<?> j = individual.listLabels(null); j.hasNext();) { + Literal l = (Literal) i.next(); + synonyms.put(l.toString(), "1"); + } + } + } else if (resource instanceof OntClass) { + //list equivalent classes + for (Iterator<?> i = ((OntClass) resource).listEquivalentClasses(); i.hasNext();) { + OntClass equivClass = (OntClass) i.next(); + //add labels + for (Iterator<?> j = equivClass.listLabels(null); j.hasNext();) { + Literal l = (Literal) j.next(); + synonyms.put(l.toString(), "1"); + } + } + } + } + + return synonyms.keySet().iterator(); + } + + public void addSearchTerm(String label, OntResource resource) { + Map<OntResource, String> m = retrieve(label); + if (m == null) { + m = new HashMap<>(); + } + m.put(resource, "1"); + searchTerms.put(label.toLowerCase(), m); + } + + /** + * A basic lookup function for retrieving keys (phrases or tokens) + * from the ontology search terms map. Right now only exact lookups + * will retrieve a result... this could be improved by using some + * advanced parsing logic... such as Lucene query parser. + * @param label the label (phrases or tokens) to retrieve from the + * ontology search terms map. + * @return an {@link java.util.Map} if there are match(es) + * or an empty {@link java.util.HashMap} if there are no + * matches. 
+ */ + public Map<OntResource, String> retrieve(String label) { + @SuppressWarnings("unchecked") + Map<OntResource, String> m = (Map<OntResource, String>) searchTerms.get(label.toLowerCase()); + if (m == null) { + m = new HashMap<>(); + } + return m; + } + + protected static void renderHierarchy(PrintStream out, OntClass cls, List<Object> occurs, int depth) { + renderClassDescription(out, cls, depth); + out.println(); + + // recurse to the next level down + if (cls.canAs(OntClass.class) && !occurs.contains(cls)) { + for (Iterator<?> i = cls.listSubClasses(true); i.hasNext(); ) { + OntClass sub = (OntClass) i.next(); + + // we push this expression on the occurs list before we recurse + occurs.add(cls); + renderHierarchy(out, sub, occurs, depth + 1); + occurs.remove(cls); + } + for (Iterator<?> i = cls.listInstances(); i.hasNext(); ) { + Individual individual = (Individual) i.next(); + renderURI(out, individual.getModel(), individual.getURI()); + out.print(" ["); + for (Iterator<?> j = individual.listLabels(null); j.hasNext(); ) { + out.print(((Literal) j.next()).getString() + ", "); + } + out.print("] "); + out.println(); + } + } + } + + public static void renderClassDescription(PrintStream out, OntClass c, int depth) { + indent(out, depth); + + if (c.isRestriction()) { + renderRestriction(out, (Restriction) c.as(Restriction.class)); + } else { + if (!c.isAnon()) { + out.print("Class "); + renderURI(out, c.getModel(), c.getURI()); + + out.print(c.getLocalName()); + + out.print(" ["); + for (Iterator<?> i = c.listLabels(null); i.hasNext(); ) { + out.print(((Literal) i.next()).getString() + ", "); + } + out.print("] "); + } else { + renderAnonymous(out, c, "class"); + } + } + } + + protected static void renderRestriction(PrintStream out, Restriction r) { + if (!r.isAnon()) { + out.print("Restriction "); + renderURI(out, r.getModel(), r.getURI()); + } else { + renderAnonymous(out, r, "restriction"); + } + + out.print(" on property "); + renderURI(out, r.getModel(), 
r.getOnProperty().getURI()); + } + + protected static void renderURI(PrintStream out, PrefixMapping prefixes, String uri) { + out.print(prefixes.expandPrefix(uri)); + } + + protected static void renderAnonymous(PrintStream out, Resource anon, String name) { + String anonID = mAnonIDs.get(anon.getId()); + if (anonID == null) { + anonID = "a-" + mAnonCount++; + mAnonIDs.put(anon.getId(), anonID); + } + + out.print("Anonymous "); + out.print(name); + out.print(" with ID "); + out.print(anonID); + } + + protected static void indent(PrintStream out, int depth) { + for (int i = 0; i < depth; i++) { + out.print(" "); + } + } + +}
