http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/main/MudrodEngine.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/main/MudrodEngine.java b/core/src/main/java/org/apache/sdap/mudrod/main/MudrodEngine.java new file mode 100644 index 0000000..341d5fc --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/main/MudrodEngine.java @@ -0,0 +1,456 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.main; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.GnuParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.Options; +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.sdap.mudrod.discoveryengine.DiscoveryEngineAbstract; +import org.apache.sdap.mudrod.discoveryengine.MetadataDiscoveryEngine; +import org.apache.sdap.mudrod.discoveryengine.OntologyDiscoveryEngine; +import org.apache.sdap.mudrod.discoveryengine.RecommendEngine; +import org.apache.sdap.mudrod.discoveryengine.WeblogDiscoveryEngine; +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.integration.LinkageIntegration; +import org.jdom2.Document; +import org.jdom2.Element; +import org.jdom2.JDOMException; +import org.jdom2.input.SAXBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static org.apache.sdap.mudrod.main.MudrodConstants.DATA_DIR; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.nio.file.Files; +import java.util.List; +import java.util.Properties; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +/** + * Main entry point for Running the Mudrod system. Invocation of this class is + * tightly linked to the primary Mudrod configuration which can be located at + * <a href= + * "https://github.com/mudrod/mudrod/blob/master/core/src/main/resources/config.xml">config.xml</a>. 
+ */ +public class MudrodEngine { + + private static final Logger LOG = LoggerFactory.getLogger(MudrodEngine.class); + private Properties props = new Properties(); + private ESDriver es = null; + private SparkDriver spark = null; + private static final String LOG_INGEST = "logIngest"; + private static final String META_INGEST = "metaIngest"; + private static final String FULL_INGEST = "fullIngest"; + private static final String PROCESSING = "processingWithPreResults"; + private static final String ES_HOST = "esHost"; + private static final String ES_TCP_PORT = "esTCPPort"; + private static final String ES_HTTP_PORT = "esPort"; + + /** + * Public constructor for this class. + */ + public MudrodEngine() { + // default constructor + } + + /** + * Start the {@link ESDriver}. Should only be called after call to + * {@link MudrodEngine#loadConfig()} + * + * @return fully provisioned {@link ESDriver} + */ + public ESDriver startESDriver() { + return new ESDriver(props); + } + + /** + * Start the {@link SparkDriver}. Should only be called after call to + * {@link MudrodEngine#loadConfig()} + * + * @return fully provisioned {@link SparkDriver} + */ + public SparkDriver startSparkDriver() { + return new SparkDriver(props); + } + + /** + * Retreive the Mudrod configuration as a Properties Map containing K, V of + * type String. + * + * @return a {@link java.util.Properties} object + */ + public Properties getConfig() { + return props; + } + + /** + * Retreive the Mudrod {@link ESDriver} + * + * @return the {@link ESDriver} instance. + */ + public ESDriver getESDriver() { + return this.es; + } + + /** + * Set the Elasticsearch driver for MUDROD + * + * @param es + * an ES driver instance + */ + public void setESDriver(ESDriver es) { + this.es = es; + } + + private InputStream locateConfig() { + + String configLocation = System.getenv(MudrodConstants.MUDROD_CONFIG) == null ? 
"" : System.getenv(MudrodConstants.MUDROD_CONFIG); + File configFile = new File(configLocation); + + try { + InputStream configStream = new FileInputStream(configFile); + LOG.info("Loaded config file from " + configFile.getAbsolutePath()); + return configStream; + } catch (IOException e) { + LOG.info("File specified by environment variable " + MudrodConstants.MUDROD_CONFIG + "=\'" + configLocation + "\' could not be loaded. " + e.getMessage()); + } + + InputStream configStream = MudrodEngine.class.getClassLoader().getResourceAsStream("config.xml"); + + if (configStream != null) { + LOG.info("Loaded config file from {}", MudrodEngine.class.getClassLoader().getResource("config.xml").getPath()); + } + + return configStream; + } + + /** + * Load the configuration provided at <a href= + * "https://github.com/mudrod/mudrod/blob/master/core/src/main/resources/config.xml">config.xml</a>. + * + * @return a populated {@link java.util.Properties} object. + */ + public Properties loadConfig() { + SAXBuilder saxBuilder = new SAXBuilder(); + + InputStream configStream = locateConfig(); + + Document document; + try { + document = saxBuilder.build(configStream); + Element rootNode = document.getRootElement(); + List<Element> paraList = rootNode.getChildren("para"); + + for (int i = 0; i < paraList.size(); i++) { + Element paraNode = paraList.get(i); + String attributeName = paraNode.getAttributeValue("name"); + if (MudrodConstants.SVM_SGD_MODEL.equals(attributeName)) { + props.put(attributeName, decompressSVMWithSGDModel(paraNode.getTextTrim())); + } else { + props.put(attributeName, paraNode.getTextTrim()); + } + } + } catch (JDOMException | IOException e) { + LOG.error("Exception whilst retrieving or processing XML contained within 'config.xml'!", e); + } + return getConfig(); + + } + + private String decompressSVMWithSGDModel(String archiveName) throws IOException { + + URL scmArchive = getClass().getClassLoader().getResource(archiveName); + if (scmArchive == null) { + throw 
new IOException("Unable to locate " + archiveName + " as a classpath resource."); + } + File tempDir = Files.createTempDirectory("mudrod").toFile(); + assert tempDir.setWritable(true); + File archiveFile = new File(tempDir, archiveName); + FileUtils.copyURLToFile(scmArchive, archiveFile); + + // Decompress archive + int BUFFER_SIZE = 512000; + ZipInputStream zipIn = new ZipInputStream(new FileInputStream(archiveFile)); + ZipEntry entry; + while ((entry = zipIn.getNextEntry()) != null) { + File f = new File(tempDir, entry.getName()); + // If the entry is a directory, create the directory. + if (entry.isDirectory() && !f.exists()) { + boolean created = f.mkdirs(); + if (!created) { + LOG.error("Unable to create directory '{}', during extraction of archive contents.", f.getAbsolutePath()); + } + } else if (!entry.isDirectory()) { + boolean created = f.getParentFile().mkdirs(); + if (!created && !f.getParentFile().exists()) { + LOG.error("Unable to create directory '{}', during extraction of archive contents.", f.getParentFile().getAbsolutePath()); + } + int count; + byte data[] = new byte[BUFFER_SIZE]; + FileOutputStream fos = new FileOutputStream(new File(tempDir, entry.getName()), false); + try (BufferedOutputStream dest = new BufferedOutputStream(fos, BUFFER_SIZE)) { + while ((count = zipIn.read(data, 0, BUFFER_SIZE)) != -1) { + dest.write(data, 0, count); + } + } + } + } + + return new File(tempDir, StringUtils.removeEnd(archiveName, ".zip")).toURI().toString(); + } + + /** + * Preprocess and process logs {@link DiscoveryEngineAbstract} implementations + * for weblog + */ + public void startLogIngest() { + DiscoveryEngineAbstract wd = new WeblogDiscoveryEngine(props, es, spark); + wd.preprocess(); + wd.process(); + LOG.info("*****************logs have been ingested successfully******************"); + } + + /** + * updating and analysing metadata to metadata similarity results + */ + public void startMetaIngest() { + DiscoveryEngineAbstract md = new 
MetadataDiscoveryEngine(props, es, spark); + md.preprocess(); + md.process(); + + DiscoveryEngineAbstract recom = new RecommendEngine(props, es, spark); + recom.preprocess(); + recom.process(); + LOG.info("Metadata has been ingested successfully."); + } + + public void startFullIngest() { + DiscoveryEngineAbstract wd = new WeblogDiscoveryEngine(props, es, spark); + wd.preprocess(); + wd.process(); + + DiscoveryEngineAbstract md = new MetadataDiscoveryEngine(props, es, spark); + md.preprocess(); + md.process(); + + DiscoveryEngineAbstract recom = new RecommendEngine(props, es, spark); + recom.preprocess(); + recom.process(); + LOG.info("Full ingest has finished successfully."); + } + + /** + * Only preprocess various {@link DiscoveryEngineAbstract} implementations for + * weblog, ontology and metadata, linkage discovery and integration. + */ + public void startProcessing() { + DiscoveryEngineAbstract wd = new WeblogDiscoveryEngine(props, es, spark); + wd.process(); + + DiscoveryEngineAbstract od = new OntologyDiscoveryEngine(props, es, spark); + od.preprocess(); + od.process(); + + DiscoveryEngineAbstract md = new MetadataDiscoveryEngine(props, es, spark); + md.preprocess(); + md.process(); + + LinkageIntegration li = new LinkageIntegration(props, es, spark); + li.execute(); + + DiscoveryEngineAbstract recom = new RecommendEngine(props, es, spark); + recom.process(); + } + + /** + * Close the connection to the {@link ESDriver} instance. + */ + public void end() { + if (es != null) { + es.close(); + } + } + + /** + * Main program invocation. Accepts one argument denoting location (on disk) + * to a log file which is to be ingested. Help will be provided if invoked + * with incorrect parameters. + * + * @param args + * {@link java.lang.String} array contaning correct parameters. 
+ */ + public static void main(String[] args) { + // boolean options + Option helpOpt = new Option("h", "help", false, "show this help message"); + + // log ingest (preprocessing + processing) + Option logIngestOpt = new Option("l", LOG_INGEST, false, "begin log ingest"); + // metadata ingest (preprocessing + processing) + Option metaIngestOpt = new Option("m", META_INGEST, false, "begin metadata ingest"); + // ingest both log and metadata + Option fullIngestOpt = new Option("f", FULL_INGEST, false, "begin full ingest Mudrod workflow"); + // processing only, assuming that preprocessing results is in dataDir + Option processingOpt = new Option("p", PROCESSING, false, "begin processing with preprocessing results"); + + // argument options + Option dataDirOpt = OptionBuilder.hasArg(true).withArgName("/path/to/data/directory").hasArgs(1).withDescription("the data directory to be processed by Mudrod").withLongOpt("dataDirectory") + .isRequired().create(DATA_DIR); + + Option esHostOpt = OptionBuilder.hasArg(true).withArgName("host_name").hasArgs(1).withDescription("elasticsearch cluster unicast host").withLongOpt("elasticSearchHost").isRequired(false) + .create(ES_HOST); + + Option esTCPPortOpt = OptionBuilder.hasArg(true).withArgName("port_num").hasArgs(1).withDescription("elasticsearch transport TCP port").withLongOpt("elasticSearchTransportTCPPort") + .isRequired(false).create(ES_TCP_PORT); + + Option esPortOpt = OptionBuilder.hasArg(true).withArgName("port_num").hasArgs(1).withDescription("elasticsearch HTTP/REST port").withLongOpt("elasticSearchHTTPPort").isRequired(false) + .create(ES_HTTP_PORT); + + // create the options + Options options = new Options(); + options.addOption(helpOpt); + options.addOption(logIngestOpt); + options.addOption(metaIngestOpt); + options.addOption(fullIngestOpt); + options.addOption(processingOpt); + options.addOption(dataDirOpt); + options.addOption(esHostOpt); + options.addOption(esTCPPortOpt); + options.addOption(esPortOpt); + + 
CommandLineParser parser = new GnuParser(); + try { + CommandLine line = parser.parse(options, args); + String processingType = null; + + if (line.hasOption(LOG_INGEST)) { + processingType = LOG_INGEST; + } else if (line.hasOption(PROCESSING)) { + processingType = PROCESSING; + } else if (line.hasOption(META_INGEST)) { + processingType = META_INGEST; + } else if (line.hasOption(FULL_INGEST)) { + processingType = FULL_INGEST; + } + + String dataDir = line.getOptionValue(DATA_DIR).replace("\\", "/"); + if (!dataDir.endsWith("/")) { + dataDir += "/"; + } + + MudrodEngine me = new MudrodEngine(); + me.loadConfig(); + me.props.put(DATA_DIR, dataDir); + + if (line.hasOption(ES_HOST)) { + String esHost = line.getOptionValue(ES_HOST); + me.props.put(MudrodConstants.ES_UNICAST_HOSTS, esHost); + } + + if (line.hasOption(ES_TCP_PORT)) { + String esTcpPort = line.getOptionValue(ES_TCP_PORT); + me.props.put(MudrodConstants.ES_TRANSPORT_TCP_PORT, esTcpPort); + } + + if (line.hasOption(ES_HTTP_PORT)) { + String esHttpPort = line.getOptionValue(ES_HTTP_PORT); + me.props.put(MudrodConstants.ES_HTTP_PORT, esHttpPort); + } + + me.es = new ESDriver(me.getConfig()); + me.spark = new SparkDriver(me.getConfig()); + loadFullConfig(me, dataDir); + if (processingType != null) { + switch (processingType) { + case PROCESSING: + me.startProcessing(); + break; + case LOG_INGEST: + me.startLogIngest(); + break; + case META_INGEST: + me.startMetaIngest(); + break; + case FULL_INGEST: + me.startFullIngest(); + break; + default: + break; + } + } + me.end(); + } catch (Exception e) { + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp("MudrodEngine: 'dataDir' argument is mandatory. 
" + "User must also provide an ingest method.", options, true); + LOG.error("Error whilst parsing command line.", e); + } + } + + private static void loadFullConfig(MudrodEngine me, String dataDir) { + //TODO all of the properties defined below, which are determined are + //runtime need to be added to MudrodConstants.java and referenced + //accordingly and consistently from Properties.getProperty(MudrodConstant...); + me.props.put("ontologyInputDir", dataDir + "SWEET_ocean/"); + me.props.put("oceanTriples", dataDir + "Ocean_triples.csv"); + me.props.put("userHistoryMatrix", dataDir + "UserHistoryMatrix.csv"); + me.props.put("clickstreamMatrix", dataDir + "ClickstreamMatrix.csv"); + me.props.put("metadataMatrix", dataDir + "MetadataMatrix.csv"); + me.props.put("clickstreamSVDMatrix_tmp", dataDir + "clickstreamSVDMatrix_tmp.csv"); + me.props.put("metadataSVDMatrix_tmp", dataDir + "metadataSVDMatrix_tmp.csv"); + me.props.put("raw_metadataPath", dataDir + me.props.getProperty(MudrodConstants.RAW_METADATA_TYPE)); + + me.props.put("jtopia", dataDir + "jtopiaModel"); + me.props.put("metadata_term_tfidf_matrix", dataDir + "metadata_term_tfidf.csv"); + me.props.put("metadata_word_tfidf_matrix", dataDir + "metadata_word_tfidf.csv"); + me.props.put("session_metadata_Matrix", dataDir + "metadata_session_coocurrence_matrix.csv"); + + me.props.put("metadataOBCode", dataDir + "MetadataOHCode"); + me.props.put("metadata_topic", dataDir + "metadata_topic"); + me.props.put("metadata_topic_matrix", dataDir + "metadata_topic_matrix.csv"); + } + + /** + * Obtain the spark implementation. + * + * @return the {@link SparkDriver} + */ + public SparkDriver getSparkDriver() { + return this.spark; + } + + /** + * Set the {@link SparkDriver} + * + * @param sparkDriver + * a configured {@link SparkDriver} + */ + public void setSparkDriver(SparkDriver sparkDriver) { + this.spark = sparkDriver; + + } +}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/main/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/main/package-info.java b/core/src/main/java/org/apache/sdap/mudrod/main/package-info.java new file mode 100644 index 0000000..8a3af67 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/main/package-info.java @@ -0,0 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * This package includes Main entry point for Running the Mudrod system. + */ +package org.apache.sdap.mudrod.main; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/metadata/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/metadata/package-info.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/package-info.java new file mode 100644 index 0000000..b7b6258 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/package-info.java @@ -0,0 +1,18 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * This package includes metadata pre-processing, processing, and data structure + * classes. + */ +package org.apache.sdap.mudrod.metadata; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/ApiHarvester.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/ApiHarvester.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/ApiHarvester.java new file mode 100644 index 0000000..7b8b5c1 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/ApiHarvester.java @@ -0,0 +1,178 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.metadata.pre; + +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; + +import org.apache.commons.io.IOUtils; +import org.apache.sdap.mudrod.discoveryengine.DiscoveryStepAbstract; +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.main.MudrodConstants; +import org.apache.sdap.mudrod.utils.HttpRequest; +import org.elasticsearch.action.index.IndexRequest; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.util.Properties; + +/** + * ClassName: ApiHarvester Function: Harvest metadata from PO.DAACweb service. + */ +public class ApiHarvester extends DiscoveryStepAbstract { + + private static final long serialVersionUID = 1L; + private static final Logger LOG = LoggerFactory.getLogger(ApiHarvester.class); + + /** + * Creates a new instance of ApiHarvester. + * + * @param props the Mudrod configuration + * @param es the Elasticsearch drive + * @param spark the spark driver + */ + public ApiHarvester(Properties props, ESDriver es, SparkDriver spark) { + super(props, es, spark); + } + + @Override + public Object execute() { + LOG.info("Starting Metadata harvesting."); + startTime = System.currentTimeMillis(); + //remove old metadata from ES + es.deleteType(props.getProperty(MudrodConstants.ES_INDEX_NAME), props.getProperty(MudrodConstants.RAW_METADATA_TYPE)); + //harvest new metadata using PO.DAAC web services + harvestMetadatafromWeb(); + es.createBulkProcessor(); + addMetadataMapping(); + importToES(); + es.destroyBulkProcessor(); + endTime = System.currentTimeMillis(); + es.refreshIndex(); + LOG.info("Metadata harvesting completed. Time elapsed: {}", (endTime - startTime) / 1000); + return null; + } + + /** + * addMetadataMapping: Add mapping to index metadata in Elasticsearch. 
Please + * invoke this method before import metadata to Elasticsearch. + */ + public void addMetadataMapping() { + String mappingJson = "{\r\n \"dynamic_templates\": " + "[\r\n " + "{\r\n \"strings\": " + "{\r\n \"match_mapping_type\": \"string\"," + + "\r\n \"mapping\": {\r\n \"type\": \"text\"," + "\r\n \"fielddata\": true," + "\r\n \"analyzer\": \"english\"," + + "\r\n \"fields\": {\r\n \"raw\": {" + "\r\n \"type\": \"string\"," + "\r\n \"index\": \"not_analyzed\"" + "\r\n }" + + "\r\n }\r\n " + "\r\n }" + "\r\n }\r\n }\r\n ]\r\n}"; + + es.getClient().admin().indices().preparePutMapping(props.getProperty(MudrodConstants.ES_INDEX_NAME)).setType(props.getProperty(MudrodConstants.RAW_METADATA_TYPE)).setSource(mappingJson).execute() + .actionGet(); + } + + /** + * importToES: Index metadata into elasticsearch from local file directory. + * Please make sure metadata have been harvest from web service before + * invoking this method. + */ + private void importToES() { + File directory = new File(props.getProperty(MudrodConstants.RAW_METADATA_PATH)); + if(!directory.exists()) + directory.mkdir(); + File[] fList = directory.listFiles(); + for (File file : fList) { + InputStream is; + try { + is = new FileInputStream(file); + importSingleFileToES(is); + } catch (FileNotFoundException e) { + LOG.error("Error finding file!", e); + } + + } + } + + private void importSingleFileToES(InputStream is) { + try { + String jsonTxt = IOUtils.toString(is); + JsonParser parser = new JsonParser(); + JsonElement item = parser.parse(jsonTxt); + IndexRequest ir = new IndexRequest(props.getProperty(MudrodConstants.ES_INDEX_NAME), props.getProperty(MudrodConstants.RAW_METADATA_TYPE)).source(item.toString()); + es.getBulkProcessor().add(ir); + } catch (IOException e) { + LOG.error("Error indexing metadata record!", e); + } + } + + /** + * harvestMetadatafromWeb: Harvest metadata from PO.DAAC web service. 
+ */ + private void harvestMetadatafromWeb() { + LOG.info("Metadata download started."); + int startIndex = 0; + int doc_length = 0; + JsonParser parser = new JsonParser(); + do { + String searchAPI = "https://podaac.jpl.nasa.gov/api/dataset?startIndex=" + Integer.toString(startIndex) + "&entries=10&sortField=Dataset-AllTimePopularity&sortOrder=asc&id=&value=&search="; + HttpRequest http = new HttpRequest(); + String response = http.getRequest(searchAPI); + + JsonElement json = parser.parse(response); + JsonObject responseObject = json.getAsJsonObject(); + JsonArray docs = responseObject.getAsJsonObject("response").getAsJsonArray("docs"); + + doc_length = docs.size(); + + File file = new File(props.getProperty(MudrodConstants.RAW_METADATA_PATH)); + if (!file.exists()) { + if (file.mkdir()) { + LOG.info("Directory is created!"); + } else { + LOG.error("Failed to create directory!"); + } + } + for (int i = 0; i < doc_length; i++) { + JsonElement item = docs.get(i); + int docId = startIndex + i; + File itemfile = new File(props.getProperty(MudrodConstants.RAW_METADATA_PATH) + "/" + docId + ".json"); + + try (FileWriter fw = new FileWriter(itemfile.getAbsoluteFile()); BufferedWriter bw = new BufferedWriter(fw);) { + itemfile.createNewFile(); + bw.write(item.toString()); + } catch (IOException e) { + LOG.error("Error writing metadata to local file!", e); + } + } + + startIndex += 10; + + try { + Thread.sleep(100); + } catch (InterruptedException e) { + LOG.error("Error entering Elasticsearch Mappings!", e); + Thread.currentThread().interrupt(); + } + + } while (doc_length != 0); + + LOG.info("Metadata downloading finished"); + } + + @Override + public Object execute(Object o) { + return null; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/MatrixGenerator.java ---------------------------------------------------------------------- diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/MatrixGenerator.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/MatrixGenerator.java new file mode 100644 index 0000000..63565b2 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/MatrixGenerator.java @@ -0,0 +1,85 @@
/*
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.sdap.mudrod.metadata.pre;

import org.apache.sdap.mudrod.discoveryengine.DiscoveryStepAbstract;
import org.apache.sdap.mudrod.driver.ESDriver;
import org.apache.sdap.mudrod.driver.SparkDriver;
import org.apache.sdap.mudrod.main.MudrodConstants;
import org.apache.sdap.mudrod.metadata.structure.MetadataExtractor;
import org.apache.sdap.mudrod.utils.LabeledRowMatrix;
import org.apache.sdap.mudrod.utils.MatrixUtil;
import org.apache.spark.api.java.JavaPairRDD;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;
import java.util.Properties;

/**
 * Generate term-metadata matrix from original metadata. Each row in
 * the matrix is corresponding to a term, and each column is a metadata.
 */
public class MatrixGenerator extends DiscoveryStepAbstract {

  private static final long serialVersionUID = 1L;
  private static final Logger LOG = LoggerFactory.getLogger(MatrixGenerator.class);

  /**
   * Creates a new instance of MatrixGenerator.
   *
   * @param props the Mudrod configuration
   * @param es the Elasticsearch drive
   * @param spark the spark drive
   */
  public MatrixGenerator(Properties props, ESDriver es, SparkDriver spark) {
    super(props, es, spark);
  }

  /**
   * Generate a csv which is a term-metadata matrix genetrated from original
   * metadata.
   *
   * @see DiscoveryStepAbstract#execute()
   */
  @Override
  public Object execute() {
    LOG.info("Metadata matrix started");
    startTime = System.currentTimeMillis();

    String metadataMatrixFile = props.getProperty("metadataMatrix");
    try {
      MetadataExtractor extractor = new MetadataExtractor();
      JavaPairRDD<String, List<String>> metadataTermsRDD = extractor.loadMetadata(this.es, this.spark.sc, props.getProperty(MudrodConstants.ES_INDEX_NAME), props.getProperty(MudrodConstants.RAW_METADATA_TYPE));
      LabeledRowMatrix wordDocMatrix = MatrixUtil.createWordDocMatrix(metadataTermsRDD);
      MatrixUtil.exportToCSV(wordDocMatrix.rowMatrix, wordDocMatrix.rowkeys, wordDocMatrix.colkeys, metadataMatrixFile);

    } catch (Exception e) {
      // Fixed: the old call used a '{}' placeholder with the exception as the
      // only argument ("generaion" typo too), which dropped the stack trace.
      LOG.error("Error during metadata matrix generation.", e);
    }

    endTime = System.currentTimeMillis();
    LOG.info("Metadata matrix finished time elapsed: {}s", (endTime - startTime) / 1000);
    return null;
  }

  @Override
  public Object execute(Object o) {
    // Not used by this step.
    return null;
  }

} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/package-info.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/package-info.java new file mode 100644 index 0000000..ffecbc8 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/pre/package-info.java @@ -0,0 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * This package includes metadata pre-processing functions. + */ +package org.apache.sdap.mudrod.metadata.pre; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/metadata/process/MetadataAnalyzer.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/metadata/process/MetadataAnalyzer.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/process/MetadataAnalyzer.java new file mode 100644 index 0000000..80e23c1 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/process/MetadataAnalyzer.java @@ -0,0 +1,88 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.metadata.process; + +import org.apache.sdap.mudrod.discoveryengine.DiscoveryStepAbstract; +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.apache.sdap.mudrod.semantics.SVDAnalyzer; +import org.apache.sdap.mudrod.utils.LinkageTriple; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Serializable; +import java.util.List; +import java.util.Properties; + +/** + * ClassName: MetadataAnalyzer + * Function: Calculate semantic relationship of vocabularies extracted from + * metadata. + */ +public class MetadataAnalyzer extends DiscoveryStepAbstract implements Serializable { + + /** + * + */ + private static final long serialVersionUID = 1L; + private static final Logger LOG = LoggerFactory.getLogger(MetadataAnalyzer.class); + + /** + * Creates a new instance of MetadataAnalyzer. + * + * @param props the Mudrod configuration + * @param es the Elasticsearch drive + * @param spark the spark drive + */ + public MetadataAnalyzer(Properties props, ESDriver es, SparkDriver spark) { + super(props, es, spark); + } + + @Override + public Object execute(Object o) { + return null; + } + + /** + * Calculate semantic relationship of vocabularies from a csv file which is a + * term-metadata matrix. 
+ * + * @see DiscoveryStepAbstract#execute() + */ + @Override + public Object execute() { + try { + LOG.info("*****************Metadata Analyzer starts******************"); + startTime = System.currentTimeMillis(); + + SVDAnalyzer analyzer = new SVDAnalyzer(props, es, spark); + int svdDimension = Integer.parseInt(props.getProperty("metadataSVDDimension")); + String metadataMatrixFile = props.getProperty("metadataMatrix"); + String svdMatrixFileName = props.getProperty("metadataSVDMatrix_tmp"); + + analyzer.getSVDMatrix(metadataMatrixFile, svdDimension, svdMatrixFileName); + List<LinkageTriple> triples = analyzer.calTermSimfromMatrix(svdMatrixFileName); + + analyzer.saveToES(triples, props.getProperty("indexName"), props.getProperty("metadataLinkageType")); + + } catch (Exception e) { + e.printStackTrace(); + } + + endTime = System.currentTimeMillis(); + es.refreshIndex(); + LOG.info("*****************Metadata Analyzer ends******************Took {}s", (endTime - startTime) / 1000); + return null; + } +} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/metadata/process/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/metadata/process/package-info.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/process/package-info.java new file mode 100644 index 0000000..a0c0799 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/process/package-info.java @@ -0,0 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * This package includes metadata processing classes. + */ +package org.apache.sdap.mudrod.metadata.process; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/MetadataExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/MetadataExtractor.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/MetadataExtractor.java new file mode 100644 index 0000000..379d5b9 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/MetadataExtractor.java @@ -0,0 +1,145 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.metadata.structure; + +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.api.java.function.PairFunction; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.common.unit.TimeValue; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.search.SearchHit; +import scala.Tuple2; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; + +public class MetadataExtractor implements Serializable { + + /** + * + */ + private static final long serialVersionUID = 1L; + + public MetadataExtractor() { + } + + /** + * loadMetadata:Load all metadata from Elasticsearch and convert them to + * pairRDD Please make sure metadata has been already harvested from web + * service and stored in Elasticsearch. + * + * @param es an Elasticsearch client node instance + * @param sc spark context + * @param index index name of log processing application + * @param type metadata type name + * @return PairRDD, in each pair key is metadata short name and value is term + * list extracted from metadata variables. + */ + public JavaPairRDD<String, List<String>> loadMetadata(ESDriver es, JavaSparkContext sc, String index, String type) { + List<PODAACMetadata> metadatas = this.loadMetadataFromES(es, index, type); + JavaPairRDD<String, List<String>> metadataTermsRDD = this.buildMetadataRDD(es, sc, index, metadatas); + return metadataTermsRDD; + } + + /** + * loadMetadataFromES: Load all metadata from Elasticsearch. 
+ * + * @param es an Elasticsearch client node instance + * @param index index name of log processing application + * @param type metadata type name + * @return metadata list + */ + protected List<PODAACMetadata> loadMetadataFromES(ESDriver es, String index, String type) { + + List<PODAACMetadata> metadatas = new ArrayList<PODAACMetadata>(); + SearchResponse scrollResp = es.getClient().prepareSearch(index).setTypes(type).setQuery(QueryBuilders.matchAllQuery()).setScroll(new TimeValue(60000)).setSize(100).execute().actionGet(); + + while (true) { + for (SearchHit hit : scrollResp.getHits().getHits()) { + Map<String, Object> result = hit.getSource(); + String shortname = (String) result.get("Dataset-ShortName"); + List<String> topic = (List<String>) result.get("DatasetParameter-Topic"); + List<String> term = (List<String>) result.get("DatasetParameter-Term"); + List<String> keyword = (List<String>) result.get("Dataset-Metadata"); + List<String> variable = (List<String>) result.get("DatasetParameter-Variable"); + List<String> longname = (List<String>) result.get("DatasetProject-Project-LongName"); + + List<String> region = (List<String>) result.get("DatasetRegion-Region"); + + PODAACMetadata metadata = null; + try { + metadata = new PODAACMetadata(shortname, longname, es.customAnalyzing(index, topic), es.customAnalyzing(index, term), es.customAnalyzing(index, variable), es.customAnalyzing(index, keyword), + es.customAnalyzing(index, region)); + } catch (InterruptedException | ExecutionException e) { + e.printStackTrace(); + + } + metadatas.add(metadata); + } + scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet(); + if (scrollResp.getHits().getHits().length == 0) { + break; + } + } + + return metadatas; + } + + /** + * buildMetadataRDD: Convert metadata list to JavaPairRDD + * + * @param es an Elasticsearch client node instance + * @param sc spark context + * @param index index name of log 
processing application + * @param metadatas metadata list + * @return PairRDD, in each pair key is metadata short name and value is term + * list extracted from metadata variables. + */ + protected JavaPairRDD<String, List<String>> buildMetadataRDD(ESDriver es, JavaSparkContext sc, String index, List<PODAACMetadata> metadatas) { + JavaRDD<PODAACMetadata> metadataRDD = sc.parallelize(metadatas); + JavaPairRDD<String, List<String>> metadataTermsRDD = metadataRDD.mapToPair(new PairFunction<PODAACMetadata, String, List<String>>() { + /** + * + */ + private static final long serialVersionUID = 1L; + + @Override + public Tuple2<String, List<String>> call(PODAACMetadata metadata) throws Exception { + return new Tuple2<String, List<String>>(metadata.getShortName(), metadata.getAllTermList()); + } + }).reduceByKey(new Function2<List<String>, List<String>, List<String>>() { + /** + * + */ + private static final long serialVersionUID = 1L; + + @Override + public List<String> call(List<String> v1, List<String> v2) throws Exception { + List<String> list = new ArrayList<String>(); + list.addAll(v1); + list.addAll(v2); + return list; + } + }); + + return metadataTermsRDD; + } +} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/PODAACMetadata.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/PODAACMetadata.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/PODAACMetadata.java new file mode 100644 index 0000000..de3edf7 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/PODAACMetadata.java @@ -0,0 +1,337 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. 
/**
 * ClassName: PODAACMetadata Function: PODAACMetadata setter and getter methods.
 * Value object holding the vocabulary fields (terms, keywords, topics,
 * variables, regions, ...) of a single PO.DAAC metadata record.
 */
public class PODAACMetadata implements Serializable {

  private static final long serialVersionUID = 1L;
  // shortname: data set short name
  private String shortname;
  // abstractStr: data set abstract
  private String abstractStr;
  // isoTopic: data set topic
  private String isoTopic;
  // sensor: sensor
  private String sensor;
  // source: data source
  private String source;
  // project: data project
  private String project;
  // hasAbstarct: whether data set has abstract
  // (name typo kept for backward compatibility with existing callers)
  boolean hasAbstarct;

  // longnameList: data set long name list
  private List<String> longnameList;
  // keywordList: data set key word list
  private List<String> keywordList;
  // termList: data set term list
  private List<String> termList;
  // topicList: data set topic list
  private List<String> topicList;
  // variableList: data set variable list
  private List<String> variableList;
  // abstractList: data set abstract term list
  private List<String> abstractList;
  // isotopicList: data set iso topic list
  private List<String> isotopicList;
  // sensorList: data set sensor list
  private List<String> sensorList;
  // sourceList: data set source list
  private List<String> sourceList;
  // projectList: data set project list
  private List<String> projectList;
  // regionList: data set region list
  private List<String> regionList;

  public PODAACMetadata() {
    // Default constructor
  }

  /**
   * Creates a new instance of PODAACMetadata.
   *
   * @param shortname data set short name
   * @param longname  data set long name
   * @param topics    data set topics
   * @param terms     data set terms
   * @param variables data set variables
   * @param keywords  data set keywords
   * @param region    list of regions
   */
  public PODAACMetadata(String shortname, List<String> longname, List<String> topics, List<String> terms, List<String> variables, List<String> keywords, List<String> region) {
    this.shortname = shortname;
    this.longnameList = longname;
    this.keywordList = keywords;
    this.termList = terms;
    this.topicList = topics;
    this.variableList = variables;
    this.regionList = region;
  }

  /**
   * setTerms: split a comma-separated term string into the term list.
   *
   * @param termstr data set terms
   */
  public void setTerms(String termstr) {
    this.splitString(termstr, this.termList);
  }

  /**
   * setKeywords: split a comma-separated keyword string into the keyword list.
   *
   * @param keywords data set keywords
   */
  public void setKeywords(String keywords) {
    this.splitString(keywords, this.keywordList);
  }

  /**
   * setTopicList: split a comma-separated topic string into the topic list.
   *
   * @param topicStr data set topics
   */
  public void setTopicList(String topicStr) {
    this.splitString(topicStr, this.topicList);
  }

  /**
   * setVaraliableList: split a comma-separated variable string into the
   * variable list. (Method name typo kept for backward compatibility.)
   *
   * @param varilableStr data set variables
   */
  public void setVaraliableList(String varilableStr) {
    this.splitString(varilableStr, this.variableList);
  }

  /**
   * setProjectList: split a comma-separated project string into the project list.
   *
   * @param project data set projects
   */
  public void setProjectList(String project) {
    this.splitString(project, this.projectList);
  }

  /**
   * setSourceList: split a comma-separated source string into the source list.
   *
   * @param source data set sources
   */
  public void setSourceList(String source) {
    this.splitString(source, this.sourceList);
  }

  /**
   * setSensorList: split a comma-separated sensor string into the sensor list.
   *
   * @param sensor data set sensors
   */
  public void setSensorList(String sensor) {
    this.splitString(sensor, this.sensorList);
  }

  /**
   * setISOTopicList: split a comma-separated ISO topic string into the iso topic list.
   *
   * @param isoTopic data set iso topics
   */
  public void setISOTopicList(String isoTopic) {
    this.splitString(isoTopic, this.isotopicList);
  }

  /**
   * getKeywordList: get key word list of data set
   *
   * @return data set keyword list
   */
  public List<String> getKeywordList() {
    return this.keywordList;
  }

  /**
   * getTermList: get term list of data set
   *
   * @return data set term list
   */
  public List<String> getTermList() {
    return this.termList;
  }

  /**
   * getShortName: get short name of data set
   *
   * @return data set short name
   */
  public String getShortName() {
    return this.shortname;
  }

  /**
   * getKeyword: get keywords of data set as one comma-joined string
   *
   * @return data set keyword string
   */
  public String getKeyword() {
    return String.join(",", this.keywordList);
  }

  /**
   * getTerm: get terms of data set as one comma-joined string
   *
   * @return data set term string
   */
  public String getTerm() {
    return String.join(",", this.termList);
  }

  /**
   * getTopic: get topics of data set as one comma-joined string
   *
   * @return data set topic string
   */
  public String getTopic() {
    return String.join(",", this.topicList);
  }

  /**
   * getVariable: get variables of data set as one comma-joined string
   *
   * @return data set variable string
   */
  public String getVariable() {
    return String.join(",", this.variableList);
  }

  /**
   * getAbstract: get abstract of data set
   *
   * @return data set abstract
   */
  public String getAbstract() {
    return this.abstractStr;
  }

  /**
   * getProject: get project of data set
   *
   * @return data set project string
   */
  public String getProject() {
    return this.project;
  }

  /**
   * getSource: get source of data set
   *
   * @return data set source string
   */
  public String getSource() {
    return this.source;
  }

  /**
   * getSensor: get sensor of data set
   *
   * @return data set sensor string
   */
  public String getSensor() {
    return this.sensor;
  }

  /**
   * getISOTopic: get iso topic of data set
   *
   * @return data set ISO topic string
   */
  public String getISOTopic() {
    return this.isoTopic;
  }

  /**
   * getAllTermList: concatenate term, keyword, topic, variable and region
   * lists into one list. Null or empty member lists are skipped.
   *
   * @return combined term list
   */
  public List<String> getAllTermList() {
    List<String> allterms = new ArrayList<>();

    if (this.termList != null && !this.termList.isEmpty()) {
      allterms.addAll(this.termList);
    }

    if (this.keywordList != null && !this.keywordList.isEmpty()) {
      allterms.addAll(this.keywordList);
    }

    if (this.topicList != null && !this.topicList.isEmpty()) {
      allterms.addAll(this.topicList);
    }

    if (this.variableList != null && !this.variableList.isEmpty()) {
      allterms.addAll(this.variableList);
    }

    if (this.regionList != null && !this.regionList.isEmpty()) {
      allterms.addAll(this.regionList);
    }
    return allterms;
  }

  /**
   * splitString: split a comma-separated field value into individual trimmed
   * entries, stripping surrounding double quotes and skipping empty entries.
   *
   * @param oristr original string (may be null; then nothing is added)
   * @param list   result list the entries are appended to
   */
  private void splitString(String oristr, List<String> list) {
    if (oristr == null) {
      return;
    }

    if (oristr.startsWith("\"")) {
      oristr = oristr.substring(1);
    }
    if (oristr.endsWith("\"")) {
      oristr = oristr.substring(0, oristr.length() - 1);
    }

    String[] strs = oristr.trim().split(",");
    // String.split never returns null, so the entries can be processed directly.
    for (int i = 0; i < strs.length; i++) {
      String str = strs[i].trim();
      if (str.startsWith(",") || str.startsWith("\"")) {
        str = str.substring(1);
      }
      if (str.endsWith(",") || str.endsWith("\"")) {
        str = str.substring(0, str.length() - 1);
      }
      // Bug fix: compare string CONTENT, not references (str == "" only
      // matched interned empty strings, so blank entries slipped through).
      if (str.isEmpty()) {
        continue;
      }
      list.add(str);
    }
  }
}
--git a/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/package-info.java b/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/package-info.java new file mode 100644 index 0000000..938b4ac --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/metadata/structure/package-info.java @@ -0,0 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * This package includes classes needed for metadata analysis + */ +package org.apache.sdap.mudrod.metadata.structure; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/ontology/Ontology.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/ontology/Ontology.java b/core/src/main/java/org/apache/sdap/mudrod/ontology/Ontology.java new file mode 100644 index 0000000..70116de --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/ontology/Ontology.java @@ -0,0 +1,67 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. 
/**
 * Base class for working with ontologies. Methods indicate the ability to
 * load and merge (e.g. merge relevant ontology subgraphs into a new subgraph
 * which can be used within Mudrod), perform subclass retrieval, synonym
 * expansion, etc.
 *
 * @author lewismc
 */
public interface Ontology {

  /**
   * Load an array of URIs which resolve to ontology resources.
   *
   * @param urls a {@link java.lang.String} array containing ontology URIs.
   */
  public void load(String[] urls);

  /**
   * Load a collection of default ontology resources.
   */
  public void load();

  /**
   * Merge relevant ontology subgraphs into a new subgraph which can be used
   * within Mudrod.
   *
   * @param o an ontology to merge with the current ontology held within
   *          Mudrod.
   */
  public void merge(Ontology o);

  /**
   * Retrieve all subclasses for a particular entity provided within the
   * search term, e.g. subclass-based query expansion.
   *
   * @param entitySearchTerm an input search term
   * @return an {@link java.util.Iterator} object containing subClass entries.
   */
  public Iterator<String> subclasses(String entitySearchTerm);

  /**
   * Retrieve all synonyms for a particular entity provided within the search
   * term, e.g. synonym-based query expansion.
   *
   * @param queryKeyPhrase a phrase to undertake synonym expansion on.
   * @return an {@link java.util.Iterator} object containing synonym entries.
   */
  public Iterator<String> synonyms(String queryKeyPhrase);

}
+ * + * @author lewismc + */ +public class OntologyFactory { + + public static final Logger LOG = LoggerFactory.getLogger(OntologyFactory.class); + + private Properties props; + + /** + * The mechanism for creating an {@link Ontology} + * implementation. + * + * @param props a populated Mudrod {@link java.util.Properties} object. + */ + public OntologyFactory(Properties props) { + this.props = props; + } + + /** + * Obtain the {@link Ontology} + * implementation for use within Mudrod. + * + * @return Returns the ontology implementation specified + * in <a href="https://github.com/mudrod/mudrod/blob/master/core/src/main/resources/config.xml"> + * config.xml</a> with configuration key + * <code>mudrod.ontology.implementation</code>. This property can also be accessed via + * {@link MudrodConstants#ONTOLOGY_IMPL}. + */ + public Ontology getOntology() { + + String ontologyImpl = this.props.getProperty(MudrodConstants.ONTOLOGY_IMPL, "Local"); + + LOG.info("Using ontology extension: {}", ontologyImpl); + Ontology ontImpl; + switch (ontologyImpl) { + case "EsipCOR": + ontImpl = new EsipCOROntology(); + break; + case "EsipPortal": + ontImpl = new EsipPortalOntology(); + break; + default: + ontImpl = new LocalOntology(); + break; + } + return ontImpl; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/ontology/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/ontology/package-info.java b/core/src/main/java/org/apache/sdap/mudrod/ontology/package-info.java new file mode 100644 index 0000000..44596e3 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/ontology/package-info.java @@ -0,0 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * This package includes ontology pre-processing and processing classes. + */ +package org.apache.sdap.mudrod.ontology; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/ontology/pre/AggregateTriples.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/ontology/pre/AggregateTriples.java b/core/src/main/java/org/apache/sdap/mudrod/ontology/pre/AggregateTriples.java new file mode 100644 index 0000000..e94d678 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/ontology/pre/AggregateTriples.java @@ -0,0 +1,225 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.ontology.pre; + +import org.apache.commons.io.FilenameUtils; +import org.apache.sdap.mudrod.discoveryengine.DiscoveryStepAbstract; +import org.apache.sdap.mudrod.driver.ESDriver; +import org.apache.sdap.mudrod.driver.SparkDriver; +import org.jdom2.Document; +import org.jdom2.Element; +import org.jdom2.JDOMException; +import org.jdom2.Namespace; +import org.jdom2.filter.ElementFilter; +import org.jdom2.input.SAXBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.Properties; + +/** + * Supports ability to extract triples (subclassOf, equivalent class) from OWL file + */ +public class AggregateTriples extends DiscoveryStepAbstract { + private static final long serialVersionUID = 1L; + private static final Logger LOG = LoggerFactory.getLogger(AggregateTriples.class); + + public AggregateTriples(Properties props, ESDriver es, SparkDriver spark) { + super(props, es, spark); + } + + /** + * Method of executing triple aggregation + */ + @Override + public Object execute() { + File file = new File(this.props.getProperty("oceanTriples")); + if (file.exists()) { + file.delete(); + } + try { + file.createNewFile(); + } catch (IOException e2) { + e2.printStackTrace(); + } + + FileWriter fw; + try { + fw = new FileWriter(file.getAbsoluteFile()); + bw = new BufferedWriter(fw); + } catch (IOException e) { + e.printStackTrace(); + } + + File[] files = new File(this.props.getProperty("ontologyInputDir")).listFiles(); + for (File file_in : files) { + String ext = FilenameUtils.getExtension(file_in.getAbsolutePath()); + if ("owl".equals(ext)) { + try { + loadxml(file_in.getAbsolutePath()); + getAllClass(); + } catch (JDOMException e1) { + e1.printStackTrace(); + } catch (IOException e1) { + e1.printStackTrace(); + } + + } + } + 
+ try { + bw.close(); + } catch (IOException e) { + e.printStackTrace(); + } + return null; + } + + public Document document; + public Element rootNode = null; + final static String owl_namespace = "http://www.w3.org/2002/07/owl#"; + final static String rdf_namespace = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; + final static String rdfs_namespace = "http://www.w3.org/2000/01/rdf-schema#"; + + BufferedWriter bw = null; + + /** + * Load OWL file into memory + * + * @param filePathName local path of OWL file + * @throws JDOMException JDOMException + * @throws IOException IOException + */ + public void loadxml(String filePathName) throws JDOMException, IOException { + SAXBuilder saxBuilder = new SAXBuilder(); + File file = new File(filePathName); + + document = saxBuilder.build(file); + rootNode = document.getRootElement(); + } + + /** + * Method of going through OWL structure + */ + public void loopxml() { + Iterator<?> processDescendants = rootNode.getDescendants(new ElementFilter()); + String text = ""; + + while (processDescendants.hasNext()) { + Element e = (Element) processDescendants.next(); + String currentName = e.getName(); + text = e.getTextTrim(); + if ("".equals(text)) { + LOG.info(currentName); + } else { + LOG.info("{} : {}", currentName, text); + } + } + } + + /** + * Method of identifying a specific child given a element name + * + * @param str element name + * @param ele parent element + * @return the element of child + */ + public Element findChild(String str, Element ele) { + Iterator<?> processDescendants = ele.getDescendants(new ElementFilter()); + String name = ""; + Element result = null; + + while (processDescendants.hasNext()) { + Element e = (Element) processDescendants.next(); + name = e.getName(); + if (name.equals(str)) { + result = e; + return result; + } + } + return result; + + } + + /** + * Method of extract triples (subclassOf, equivalent class) from OWL file + * + * @throws IOException IOException + */ + public void 
getAllClass() throws IOException { + List<?> classElements = rootNode.getChildren("Class", Namespace.getNamespace("owl", owl_namespace)); + + for (int i = 0; i < classElements.size(); i++) { + Element classElement = (Element) classElements.get(i); + String className = classElement.getAttributeValue("about", Namespace.getNamespace("rdf", rdf_namespace)); + + if (className == null) { + className = classElement.getAttributeValue("ID", Namespace.getNamespace("rdf", rdf_namespace)); + } + + List<?> subclassElements = classElement.getChildren("subClassOf", Namespace.getNamespace("rdfs", rdfs_namespace)); + for (int j = 0; j < subclassElements.size(); j++) { + Element subclassElement = (Element) subclassElements.get(j); + String subclassName = subclassElement.getAttributeValue("resource", Namespace.getNamespace("rdf", rdf_namespace)); + if (subclassName == null) { + Element allValuesFromEle = findChild("allValuesFrom", subclassElement); + if (allValuesFromEle != null) { + subclassName = allValuesFromEle.getAttributeValue("resource", Namespace.getNamespace("rdf", rdf_namespace)); + bw.write(cutString(className) + ",SubClassOf," + cutString(subclassName) + "\n"); + } + } else { + bw.write(cutString(className) + ",SubClassOf," + cutString(subclassName) + "\n"); + } + + } + + List equalClassElements = classElement.getChildren("equivalentClass", Namespace.getNamespace("owl", owl_namespace)); + for (int k = 0; k < equalClassElements.size(); k++) { + Element equalClassElement = (Element) equalClassElements.get(k); + String equalClassElementName = equalClassElement.getAttributeValue("resource", Namespace.getNamespace("rdf", rdf_namespace)); + + if (equalClassElementName != null) { + bw.write(cutString(className) + ",equivalentClass," + cutString(equalClassElementName) + "\n"); + } + } + + } + } + + /** + * Method of cleaning up a string + * + * @param str String needed to be processed + * @return the processed string + */ + public String cutString(String str) { + str = 
str.substring(str.indexOf("#") + 1); + String[] strArray = str.split("(?=[A-Z])"); + str = Arrays.toString(strArray); + return str.substring(1, str.length() - 1).replace(",", ""); + } + + @Override + public Object execute(Object o) { + return null; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/ontology/pre/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/ontology/pre/package-info.java b/core/src/main/java/org/apache/sdap/mudrod/ontology/pre/package-info.java new file mode 100644 index 0000000..3f7c87e --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/ontology/pre/package-info.java @@ -0,0 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * This package includes ontology pre-processing classes. 
+ */ +package org.apache.sdap.mudrod.ontology.pre; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/ontology/process/EsipCOROntology.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/ontology/process/EsipCOROntology.java b/core/src/main/java/org/apache/sdap/mudrod/ontology/process/EsipCOROntology.java new file mode 100644 index 0000000..45d04a8 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/ontology/process/EsipCOROntology.java @@ -0,0 +1,69 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.sdap.mudrod.ontology.process; + +import java.util.Iterator; + +import org.apache.sdap.mudrod.ontology.Ontology; + +/** + * @author lewismc + */ +public class EsipCOROntology implements Ontology { + + /** + * + */ + public EsipCOROntology() { + //default constructor + } + + @Override + public void load() { + // to be completed + } + + /* (non-Javadoc) + * @see Ontology#load(java.lang.String[]) + */ + @Override + public void load(String[] urls) { + // to be completed + } + + /* (non-Javadoc) + * @see Ontology#merge(Ontology) + */ + @Override + public void merge(Ontology o) { + // to be completed + } + + /* (non-Javadoc) + * @see Ontology#subclasses(java.lang.String) + */ + @Override + public Iterator<String> subclasses(String entitySearchTerm) { + return null; + } + + /* (non-Javadoc) + * @see Ontology#synonyms(java.lang.String) + */ + @Override + public Iterator<String> synonyms(String queryKeyPhrase) { + return null; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/ontology/process/EsipPortalOntology.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/ontology/process/EsipPortalOntology.java b/core/src/main/java/org/apache/sdap/mudrod/ontology/process/EsipPortalOntology.java new file mode 100644 index 0000000..c989a29 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/ontology/process/EsipPortalOntology.java @@ -0,0 +1,72 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sdap.mudrod.ontology.process; + +import java.util.Iterator; + +import org.apache.sdap.mudrod.ontology.Ontology; + +/** + * @author lewismc + */ +public class EsipPortalOntology implements Ontology { + + /** + * + */ + public EsipPortalOntology() { + //default constructor + } + + /* (non-Javadoc) + * @see Ontology#load(java.lang.String[]) + */ + @Override + public void load(String[] urls) { + // to be completed + } + + /* (non-Javadoc) + * @see Ontology#load() + */ + @Override + public void load() { + // to be completed + } + + /* (non-Javadoc) + * @see Ontology#merge(Ontology) + */ + @Override + public void merge(Ontology o) { + // to be completed + } + + /* (non-Javadoc) + * @see Ontology#subclasses(java.lang.String) + */ + @Override + public Iterator<String> subclasses(String entitySearchTerm) { + return null; + } + + /* (non-Javadoc) + * @see Ontology#synonyms(java.lang.String) + */ + @Override + public Iterator<String> synonyms(String queryKeyPhrase) { + return null; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/org/apache/sdap/mudrod/ontology/process/LocalOntology.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/sdap/mudrod/ontology/process/LocalOntology.java b/core/src/main/java/org/apache/sdap/mudrod/ontology/process/LocalOntology.java new file mode 100644 index 0000000..0380c07 --- /dev/null +++ b/core/src/main/java/org/apache/sdap/mudrod/ontology/process/LocalOntology.java @@ -0,0 +1,391 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.sdap.mudrod.ontology.process; + +import org.apache.jena.ontology.Individual; +import org.apache.jena.ontology.OntClass; +import org.apache.jena.ontology.OntModel; +import org.apache.jena.ontology.OntModelSpec; +import org.apache.jena.ontology.OntResource; +import org.apache.jena.ontology.Restriction; +import org.apache.jena.rdf.model.AnonId; +import org.apache.jena.rdf.model.Literal; +import org.apache.jena.rdf.model.ModelFactory; +import org.apache.jena.rdf.model.Resource; +import org.apache.jena.shared.PrefixMapping; +import org.apache.sdap.mudrod.ontology.Ontology; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.PrintStream; +import java.net.MalformedURLException; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +/** + * The LocalOntology implementation enables us to work with Ontology files + * which are cached locally and available on the runtime classpath e.g. + * in <code>src/main/resource/ontology/...</code>. + * From here we can test and iterate on how use of ontology can enhance search. 
+ */ +public class LocalOntology implements Ontology { + + public static final Logger LOG = LoggerFactory.getLogger(LocalOntology.class); + + public static final String DELIMITER_SEARCHTERM = " "; + + private Map<Object, Object> searchTerms = new HashMap<>(); + private static OntologyParser parser; + private static OntModel ontologyModel; + private Ontology ontology; + private static Map<AnonId, String> mAnonIDs = new HashMap<>(); + private static int mAnonCount = 0; + private List<String> ontArrayList; + + public LocalOntology() { + //only initialize all the static variables + //if first time called to this ontology constructor + if (ontology == null) { + if (LOG.isInfoEnabled()) { + LOG.info("Creating new ontology"); + } + parser = new OwlParser(); + ontology = this; + } + if (ontologyModel == null) + ontologyModel = ModelFactory.createOntologyModel(OntModelSpec.OWL_MEM, null); + load(); + } + + /** + * Static accessor for {@link LocalOntology} + * instance implementation defined within <code>config.xml</code>. 
+ * + * @return a {@link LocalOntology} + */ + public Ontology getInstance() { + if (ontology == null) { + ontology = new LocalOntology(); + } + return ontology; + } + + /** + * Load the default <i>sweetAll.owl</i> ontology + * from <a href="https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/sweetAll.owl"> + * https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/sweetAll.owl</a> + */ + @Override + public void load() { + URL ontURL = null; + try { + ontURL = new URL("https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/sweetAll.owl"); + //ontURL = new URL("https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/reprDataProduct.owl"); + } catch (MalformedURLException e) { + LOG.error("Error when attempting to create URL resource: ", e); + } + ontArrayList = new ArrayList<>(); + try { + ontArrayList.add(ontURL.toURI().toString()); + } catch (URISyntaxException e) { + LOG.error("Error in URL syntax, please check your Ontology resource: ", e); + } + if (!ontArrayList.isEmpty()) { + load(ontArrayList.stream().toArray(String[]::new)); + } + } + + /** + * Load a string array of local URIs which refernece .owl files. + */ + @Override + public void load(String[] urls) { + for (int i = 0; i < urls.length; i++) { + String url = urls[i].trim(); + if (!"".equals(url)) + if (LOG.isInfoEnabled()) { + LOG.info("Reading and processing {}", url); + } + load(ontologyModel, url); + } + parser.parse(ontology, ontologyModel); + } + + private void load(Object m, String url) { + try { + ((OntModel) m).read(url, null, null); + LOG.info("Successfully processed {}", url); + } catch (Exception e) { + LOG.error("Failed whilst attempting to read ontology {}: Error: ", url, e); + } + } + + /** + * Get the {@link org.apache.sdap.mudrod.ontology.process.OntologyParser} + * implementation being used to process the input ontology resources. 
+ * @return an {@link org.apache.sdap.mudrod.ontology.process.OntologyParser} implementation + */ + public OntologyParser getParser() { + if (parser == null) { + parser = new OwlParser(); + } + return parser; + } + + /** + * Return the {@link org.apache.jena.ontology.OntModel} instance + * which created from input ontology resources. + * @return a constructed {@link org.apache.jena.ontology.OntModel} + */ + public static OntModel getModel() { + return ontologyModel; + } + + /** + * Return the loaded Ontology resources. + * @return a {@link java.util.List} of resources. + */ + public List<String> getLoadedOntologyResources() { + if (ontArrayList != null) { + return ontArrayList; + } else { + return new ArrayList<>(); + } + } + /** + * Not yet implemented. + */ + @Override + public void merge(Ontology o) { + // not yet implemented + } + + /** + * Retrieve all subclasses of entity(ies) hashed to searchTerm + * @param entitySearchTerm a query (keywords) for which to obtain + * subclasses. + * @return an {@link java.util.Iterator} containing the subclass as Strings. + */ + @Override + public Iterator<String> subclasses(String entitySearchTerm) { + Map<OntResource, String> classMap = retrieve(entitySearchTerm); + Map<String, String> subclasses = new HashMap<>(); + + Iterator<OntResource> iter = classMap.keySet().iterator(); + while (iter.hasNext()) { + OntResource resource = iter.next(); + + if (resource instanceof OntClass) { + //get subclasses N.B. we only get direct sub-classes e.g. direct children + //it is possible for us to navigate the entire class tree if we wish, we simply + //need to pass the .listSubClasses(true) boolean parameter. 
+ for (Iterator<?> i = ((OntClass) resource).listSubClasses(); i.hasNext();) { + OntResource subclass = (OntResource) i.next(); + for (Iterator<?> j = subclass.listLabels(null); j.hasNext();) { + Literal l = (Literal) j.next(); + subclasses.put(l.toString(), "1"); + } + } + //get individuals + for (Iterator<?> i = ((OntClass) resource).listInstances(); i.hasNext(); ) { + OntResource subclass = (OntResource) i.next(); + for (Iterator<?> j = subclass.listLabels(null); j.hasNext();) { + Literal l = (Literal) j.next(); + subclasses.put(l.toString(), "1"); + } + } + } else if (resource instanceof Individual) { + for (Iterator<?> i = resource.listSameAs(); i.hasNext();) { + OntResource subclass = (OntResource) i.next(); + for (Iterator<?> j = subclass.listLabels(null); j.hasNext();) { + Literal l = (Literal) j.next(); + subclasses.put(l.toString(), "1"); + } + } + } + } + return subclasses.keySet().iterator(); + } + + /** + * Retreives synonyms for an given phrase if the phrase + * is present in the ontology + * @param queryKeyPhrase an input string representing a phrase + * for which we wish to obtain synonyms. + * @return an {@link java.util.Iterator} containing synonyms string tokens + * or an empty if no synonyms exist for the given queryKeyPhrase. 
+ */ + @Override + public Iterator synonyms(String queryKeyPhrase) { + + Map<?, ?> classMap = retrieve(queryKeyPhrase); + + Map<Object, Object> synonyms = new HashMap<>(); + + Iterator<?> iter = classMap.keySet().iterator(); + while (iter.hasNext()) { + OntResource resource = (OntResource) iter.next(); + + //listLabels + for (Iterator<?> i = resource.listLabels(null); i.hasNext();) { + Literal l = (Literal) i.next(); + synonyms.put(l.toString(), "1"); + } + + if (resource instanceof Individual) { + //get all individuals same as this one + for (Iterator<?> i = resource.listSameAs(); i.hasNext();) { + Individual individual = (Individual) i.next(); + //add labels + for (Iterator<?> j = individual.listLabels(null); j.hasNext();) { + Literal l = (Literal) i.next(); + synonyms.put(l.toString(), "1"); + } + } + } else if (resource instanceof OntClass) { + //list equivalent classes + for (Iterator<?> i = ((OntClass) resource).listEquivalentClasses(); i.hasNext();) { + OntClass equivClass = (OntClass) i.next(); + //add labels + for (Iterator<?> j = equivClass.listLabels(null); j.hasNext();) { + Literal l = (Literal) j.next(); + synonyms.put(l.toString(), "1"); + } + } + } + } + + return synonyms.keySet().iterator(); + } + + public void addSearchTerm(String label, OntResource resource) { + Map<OntResource, String> m = retrieve(label); + if (m == null) { + m = new HashMap<>(); + } + m.put(resource, "1"); + searchTerms.put(label.toLowerCase(), m); + } + + /** + * A basic lookup function for retrieving keys (phrases or tokens) + * from the ontology search terms map. Right now only exact lookups + * will retrieve a result... this could be improved by using some + * advanced parsing logic... such as Lucene query parser. + * @param label the label (phrases or tokens) to retrieve from the + * ontology search terms map. + * @return an {@link java.util.Map} if there are match(es) + * or an empty {@link java.util.HashMap} if there are no + * matches. 
+ */ + public Map<OntResource, String> retrieve(String label) { + @SuppressWarnings("unchecked") + Map<OntResource, String> m = (Map<OntResource, String>) searchTerms.get(label.toLowerCase()); + if (m == null) { + m = new HashMap<>(); + } + return m; + } + + protected static void renderHierarchy(PrintStream out, OntClass cls, List<Object> occurs, int depth) { + renderClassDescription(out, cls, depth); + out.println(); + + // recurse to the next level down + if (cls.canAs(OntClass.class) && !occurs.contains(cls)) { + for (Iterator<?> i = cls.listSubClasses(true); i.hasNext(); ) { + OntClass sub = (OntClass) i.next(); + + // we push this expression on the occurs list before we recurse + occurs.add(cls); + renderHierarchy(out, sub, occurs, depth + 1); + occurs.remove(cls); + } + for (Iterator<?> i = cls.listInstances(); i.hasNext(); ) { + Individual individual = (Individual) i.next(); + renderURI(out, individual.getModel(), individual.getURI()); + out.print(" ["); + for (Iterator<?> j = individual.listLabels(null); j.hasNext(); ) { + out.print(((Literal) j.next()).getString() + ", "); + } + out.print("] "); + out.println(); + } + } + } + + public static void renderClassDescription(PrintStream out, OntClass c, int depth) { + indent(out, depth); + + if (c.isRestriction()) { + renderRestriction(out, (Restriction) c.as(Restriction.class)); + } else { + if (!c.isAnon()) { + out.print("Class "); + renderURI(out, c.getModel(), c.getURI()); + + out.print(c.getLocalName()); + + out.print(" ["); + for (Iterator<?> i = c.listLabels(null); i.hasNext(); ) { + out.print(((Literal) i.next()).getString() + ", "); + } + out.print("] "); + } else { + renderAnonymous(out, c, "class"); + } + } + } + + protected static void renderRestriction(PrintStream out, Restriction r) { + if (!r.isAnon()) { + out.print("Restriction "); + renderURI(out, r.getModel(), r.getURI()); + } else { + renderAnonymous(out, r, "restriction"); + } + + out.print(" on property "); + renderURI(out, r.getModel(), 
r.getOnProperty().getURI()); + } + + protected static void renderURI(PrintStream out, PrefixMapping prefixes, String uri) { + out.print(prefixes.expandPrefix(uri)); + } + + protected static void renderAnonymous(PrintStream out, Resource anon, String name) { + String anonID = mAnonIDs.get(anon.getId()); + if (anonID == null) { + anonID = "a-" + mAnonCount++; + mAnonIDs.put(anon.getId(), anonID); + } + + out.print("Anonymous "); + out.print(name); + out.print(" with ID "); + out.print(anonID); + } + + protected static void indent(PrintStream out, int depth) { + for (int i = 0; i < depth; i++) { + out.print(" "); + } + } + +}
