http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/metadata/structure/MetadataExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/metadata/structure/MetadataExtractor.java b/core/src/main/java/gov/nasa/jpl/mudrod/metadata/structure/MetadataExtractor.java deleted file mode 100644 index a79ca87..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/metadata/structure/MetadataExtractor.java +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package gov.nasa.jpl.mudrod.metadata.structure; - -import gov.nasa.jpl.mudrod.driver.ESDriver; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function2; -import org.apache.spark.api.java.function.PairFunction; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.common.unit.TimeValue; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import scala.Tuple2; - -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ExecutionException; - -public class MetadataExtractor implements Serializable { - - /** - * - */ - private static final long serialVersionUID = 1L; - - public MetadataExtractor() { - } - - /** - * loadMetadata:Load all metadata from Elasticsearch and convert them to - * pairRDD Please make sure metadata has been already harvested from web - * service and stored in Elasticsearch. - * - * @param es an Elasticsearch client node instance - * @param sc spark context - * @param index index name of log processing application - * @param type metadata type name - * @return PairRDD, in each pair key is metadata short name and value is term - * list extracted from metadata variables. - */ - public JavaPairRDD<String, List<String>> loadMetadata(ESDriver es, JavaSparkContext sc, String index, String type) { - List<PODAACMetadata> metadatas = this.loadMetadataFromES(es, index, type); - JavaPairRDD<String, List<String>> metadataTermsRDD = this.buildMetadataRDD(es, sc, index, metadatas); - return metadataTermsRDD; - } - - /** - * loadMetadataFromES: Load all metadata from Elasticsearch. 
- * - * @param es an Elasticsearch client node instance - * @param index index name of log processing application - * @param type metadata type name - * @return metadata list - */ - protected List<PODAACMetadata> loadMetadataFromES(ESDriver es, String index, String type) { - - List<PODAACMetadata> metadatas = new ArrayList<PODAACMetadata>(); - SearchResponse scrollResp = es.getClient().prepareSearch(index).setTypes(type).setQuery(QueryBuilders.matchAllQuery()).setScroll(new TimeValue(60000)).setSize(100).execute().actionGet(); - - while (true) { - for (SearchHit hit : scrollResp.getHits().getHits()) { - Map<String, Object> result = hit.getSource(); - String shortname = (String) result.get("Dataset-ShortName"); - List<String> topic = (List<String>) result.get("DatasetParameter-Topic"); - List<String> term = (List<String>) result.get("DatasetParameter-Term"); - List<String> keyword = (List<String>) result.get("Dataset-Metadata"); - List<String> variable = (List<String>) result.get("DatasetParameter-Variable"); - List<String> longname = (List<String>) result.get("DatasetProject-Project-LongName"); - - List<String> region = (List<String>) result.get("DatasetRegion-Region"); - - PODAACMetadata metadata = null; - try { - metadata = new PODAACMetadata(shortname, longname, es.customAnalyzing(index, topic), es.customAnalyzing(index, term), es.customAnalyzing(index, variable), es.customAnalyzing(index, keyword), - es.customAnalyzing(index, region)); - } catch (InterruptedException | ExecutionException e) { - e.printStackTrace(); - - } - metadatas.add(metadata); - } - scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet(); - if (scrollResp.getHits().getHits().length == 0) { - break; - } - } - - return metadatas; - } - - /** - * buildMetadataRDD: Convert metadata list to JavaPairRDD - * - * @param es an Elasticsearch client node instance - * @param sc spark context - * @param index index name of log 
processing application - * @param metadatas metadata list - * @return PairRDD, in each pair key is metadata short name and value is term - * list extracted from metadata variables. - */ - protected JavaPairRDD<String, List<String>> buildMetadataRDD(ESDriver es, JavaSparkContext sc, String index, List<PODAACMetadata> metadatas) { - JavaRDD<PODAACMetadata> metadataRDD = sc.parallelize(metadatas); - JavaPairRDD<String, List<String>> metadataTermsRDD = metadataRDD.mapToPair(new PairFunction<PODAACMetadata, String, List<String>>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Tuple2<String, List<String>> call(PODAACMetadata metadata) throws Exception { - return new Tuple2<String, List<String>>(metadata.getShortName(), metadata.getAllTermList()); - } - }).reduceByKey(new Function2<List<String>, List<String>, List<String>>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public List<String> call(List<String> v1, List<String> v2) throws Exception { - List<String> list = new ArrayList<String>(); - list.addAll(v1); - list.addAll(v2); - return list; - } - }); - - return metadataTermsRDD; - } -}
/*
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

/**
 * ClassName: PODAACMetadata. Holds the fields of one PO.DAAC metadata record
 * (short name, terms, topics, variables, keywords, regions, ...) with setter
 * and getter methods.
 */
public class PODAACMetadata implements Serializable {

  private static final long serialVersionUID = 1L;
  // shortname: data set short name
  private String shortname;
  // abstractStr: data set abstract
  private String abstractStr;
  // isoTopic: data set topic
  private String isoTopic;
  // sensor: sensor
  private String sensor;
  // source: data source
  private String source;
  // project: data project
  private String project;
  // hasAbstarct: whether data set has abstract (name kept for compatibility;
  // "Abstarct" is a historical typo)
  boolean hasAbstarct;

  // longnameList: data set long name list
  private List<String> longnameList;
  // keywordList: data set keyword list
  private List<String> keywordList;
  // termList: data set term list
  private List<String> termList;
  // topicList: data set topic list
  private List<String> topicList;
  // variableList: data set variable list
  private List<String> variableList;
  // abstractList: data set abstract term list
  private List<String> abstractList;
  // isotopicList: data set iso topic list
  private List<String> isotopicList;
  // sensorList: data set sensor list
  private List<String> sensorList;
  // sourceList: data set source list
  private List<String> sourceList;
  // projectList: data set project list
  private List<String> projectList;
  // regionList: data set region list
  private List<String> regionList;

  public PODAACMetadata() {
    // Default constructor; list fields remain null until populated.
  }

  /**
   * Creates a new instance of PODAACMetadata. The supplied lists are stored
   * directly (not copied) and may be null.
   *
   * @param shortname data set short name
   * @param longname  data set long name
   * @param topics    data set topics
   * @param terms     data set terms
   * @param variables data set variables
   * @param keywords  data set keywords
   * @param region    list of regions
   */
  public PODAACMetadata(String shortname, List<String> longname, List<String> topics, List<String> terms, List<String> variables, List<String> keywords, List<String> region) {
    this.shortname = shortname;
    this.longnameList = longname;
    this.keywordList = keywords;
    this.termList = terms;
    this.topicList = topics;
    this.variableList = variables;
    this.regionList = region;
  }

  /**
   * setTerms: split a comma-separated term string and append the tokens to the
   * term list.
   *
   * @param termstr data set terms
   */
  public void setTerms(String termstr) {
    this.splitString(termstr, this.termList);
  }

  /**
   * setKeywords: split a comma-separated keyword string and append the tokens
   * to the keyword list.
   *
   * @param keywords data set keywords
   */
  public void setKeywords(String keywords) {
    this.splitString(keywords, this.keywordList);
  }

  /**
   * setTopicList: split a comma-separated topic string and append the tokens
   * to the topic list.
   *
   * @param topicStr data set topics
   */
  public void setTopicList(String topicStr) {
    this.splitString(topicStr, this.topicList);
  }

  /**
   * setVaraliableList: split a comma-separated variable string and append the
   * tokens to the variable list. (Method name kept for compatibility;
   * "Varaliable" is a historical typo.)
   *
   * @param varilableStr data set variables
   */
  public void setVaraliableList(String varilableStr) {
    this.splitString(varilableStr, this.variableList);
  }

  /**
   * setProjectList: split a comma-separated project string and append the
   * tokens to the project list.
   *
   * @param project data set projects
   */
  public void setProjectList(String project) {
    this.splitString(project, this.projectList);
  }

  /**
   * setSourceList: split a comma-separated source string and append the tokens
   * to the source list.
   *
   * @param source data set sources
   */
  public void setSourceList(String source) {
    this.splitString(source, this.sourceList);
  }

  /**
   * setSensorList: split a comma-separated sensor string and append the tokens
   * to the sensor list.
   *
   * @param sensor data set sensors
   */
  public void setSensorList(String sensor) {
    this.splitString(sensor, this.sensorList);
  }

  /**
   * setISOTopicList: split a comma-separated ISO topic string and append the
   * tokens to the ISO topic list.
   *
   * @param isoTopic data set iso topics
   */
  public void setISOTopicList(String isoTopic) {
    this.splitString(isoTopic, this.isotopicList);
  }

  /**
   * getKeywordList: get keyword list of data set
   *
   * @return data set keyword list (may be null)
   */
  public List<String> getKeywordList() {
    return this.keywordList;
  }

  /**
   * getTermList: get term list of data set
   *
   * @return data set term list (may be null)
   */
  public List<String> getTermList() {
    return this.termList;
  }

  /**
   * getShortName: get short name of data set
   *
   * @return data set short name
   */
  public String getShortName() {
    return this.shortname;
  }

  /**
   * getKeyword: get keywords of data set as one comma-joined string
   *
   * @return data set keyword string
   */
  public String getKeyword() {
    return String.join(",", this.keywordList);
  }

  /**
   * getTerm: get terms of data set as one comma-joined string
   *
   * @return data set term string
   */
  public String getTerm() {
    return String.join(",", this.termList);
  }

  /**
   * getTopic: get topics of data set as one comma-joined string
   *
   * @return data set topic string
   */
  public String getTopic() {
    return String.join(",", this.topicList);
  }

  /**
   * getVariable: get variables of data set as one comma-joined string
   *
   * @return data set variable string
   */
  public String getVariable() {
    return String.join(",", this.variableList);
  }

  /**
   * getAbstract: get abstract of data set
   *
   * @return data set abstract
   */
  public String getAbstract() {
    return this.abstractStr;
  }

  /**
   * getProject: get project of data set
   *
   * @return data set project string
   */
  public String getProject() {
    return this.project;
  }

  /**
   * getSource: get source of data set
   *
   * @return data set source string
   */
  public String getSource() {
    return this.source;
  }

  /**
   * getSensor: get sensor of data set
   *
   * @return data set sensor string
   */
  public String getSensor() {
    return this.sensor;
  }

  /**
   * getISOTopic: get iso topic of data set
   *
   * @return data set ISO topic string
   */
  public String getISOTopic() {
    return this.isoTopic;
  }

  /**
   * getAllTermList: concatenate the term, keyword, topic, variable and region
   * lists of this data set, skipping any that are null or empty.
   *
   * @return combined term list (never null)
   */
  public List<String> getAllTermList() {
    List<String> allterms = new ArrayList<>();

    if (this.termList != null && !this.termList.isEmpty()) {
      allterms.addAll(this.termList);
    }

    if (this.keywordList != null && !this.keywordList.isEmpty()) {
      allterms.addAll(this.keywordList);
    }

    if (this.topicList != null && !this.topicList.isEmpty()) {
      allterms.addAll(this.topicList);
    }

    if (this.variableList != null && !this.variableList.isEmpty()) {
      allterms.addAll(this.variableList);
    }

    if (this.regionList != null && !this.regionList.isEmpty()) {
      allterms.addAll(this.regionList);
    }
    return allterms;
  }

  /**
   * splitString: split a comma-separated field value into trimmed tokens and
   * append them to the given list. Surrounding double quotes and stray
   * commas/quotes on individual tokens are stripped; blank tokens are skipped.
   *
   * @param oristr original string (null is ignored)
   * @param list   result list appended to (null is ignored; previously a null
   *               list caused an NPE on default-constructed instances)
   */
  private void splitString(String oristr, List<String> list) {
    if (oristr == null || list == null) {
      return;
    }

    // Strip one pair of surrounding quotes, if present.
    if (oristr.startsWith("\"")) {
      oristr = oristr.substring(1);
    }
    if (oristr.endsWith("\"")) {
      oristr = oristr.substring(0, oristr.length() - 1);
    }

    for (String token : oristr.trim().split(",")) {
      String str = token.trim();
      if (str.startsWith(",") || str.startsWith("\"")) {
        str = str.substring(1);
      }
      if (str.endsWith(",") || str.endsWith("\"")) {
        str = str.substring(0, str.length() - 1);
      }
      // Was `str == ""` — an identity comparison that never matched, so blank
      // tokens were silently added to the list.
      if (str.isEmpty()) {
        continue;
      }
      list.add(str);
    }
  }
}
/*
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Iterator;

/**
 * Base contract for working with ontologies. Methods indicate the ability to
 * load, merge (e.g. merge relevant ontology subgraphs into a new subgraph
 * which can be used within Mudrod), perform subclass retrieval, synonym
 * expansion, etc.
 *
 * @author lewismc
 */
public interface Ontology {

  /**
   * Load an array of URIs which resolve to ontology resources.
   *
   * @param urls a {@link java.lang.String} array containing ontology URIs.
   */
  void load(String[] urls);

  /**
   * Load a collection of default ontology resources.
   */
  void load();

  /**
   * Merge relevant ontology subgraphs into a new subgraph which can be used
   * within Mudrod.
   *
   * @param o an ontology to merge with the current ontology held within
   *          Mudrod.
   */
  void merge(Ontology o);

  /**
   * Retrieve all subclasses for a particular entity provided within the
   * search term, e.g. subclass-based query expansion.
   *
   * @param entitySearchTerm an input search term
   * @return an {@link java.util.Iterator} object containing subClass entries.
   */
  Iterator<String> subclasses(String entitySearchTerm);

  /**
   * Retrieve all synonyms for a particular entity provided within the search
   * term, e.g. synonym-based query expansion.
   *
   * @param queryKeyPhrase a phrase to undertake synonym expansion on.
   * @return an {@link java.util.Iterator} object containing synonym entries.
   */
  Iterator<String> synonyms(String queryKeyPhrase);

}
- * - * @author lewismc - */ -public class OntologyFactory { - - public static final Logger LOG = LoggerFactory.getLogger(OntologyFactory.class); - - private Properties props; - - /** - * The mechanism for creating an {@link Ontology} - * implementation. - * - * @param props a populated Mudrod {@link java.util.Properties} object. - */ - public OntologyFactory(Properties props) { - this.props = props; - } - - /** - * Obtain the {@link Ontology} - * implementation for use within Mudrod. - * - * @return Returns the ontology implementation specified - * in <a href="https://github.com/mudrod/mudrod/blob/master/core/src/main/resources/config.xml"> - * config.xml</a> with configuration key - * <code>mudrod.ontology.implementation</code>. This property can also be accessed via - * {@link MudrodConstants#ONTOLOGY_IMPL}. - */ - public Ontology getOntology() { - - String ontologyImpl = this.props.getProperty(MudrodConstants.ONTOLOGY_IMPL, "Local"); - - LOG.info("Using ontology extension: {}", ontologyImpl); - Ontology ontImpl; - switch (ontologyImpl) { - case "EsipCOR": - ontImpl = new EsipCOROntology(); - break; - case "EsipPortal": - ontImpl = new EsipPortalOntology(); - break; - default: - ontImpl = new LocalOntology(); - break; - } - return ontImpl; - } - -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/package-info.java deleted file mode 100644 index 3763634..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/package-info.java +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This package includes ontology pre-processing and processing classes. - */ -package gov.nasa.jpl.mudrod.ontology; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/pre/AggregateTriples.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/pre/AggregateTriples.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/pre/AggregateTriples.java deleted file mode 100644 index 99de87d..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/pre/AggregateTriples.java +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package gov.nasa.jpl.mudrod.ontology.pre; - -import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import org.apache.commons.io.FilenameUtils; -import org.jdom2.Document; -import org.jdom2.Element; -import org.jdom2.JDOMException; -import org.jdom2.Namespace; -import org.jdom2.filter.ElementFilter; -import org.jdom2.input.SAXBuilder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; -import java.util.Properties; - -/** - * Supports ability to extract triples (subclassOf, equivalent class) from OWL file - */ -public class AggregateTriples extends DiscoveryStepAbstract { - private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(AggregateTriples.class); - - public AggregateTriples(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - } - - /** - * Method of executing triple aggregation - */ - @Override - public Object execute() { - File file = new File(this.props.getProperty("oceanTriples")); - if (file.exists()) { - file.delete(); - } - try { - file.createNewFile(); - } catch (IOException e2) { - e2.printStackTrace(); - } - - FileWriter fw; - try { - fw = new FileWriter(file.getAbsoluteFile()); - bw = new BufferedWriter(fw); - } catch (IOException e) { - e.printStackTrace(); - } - - File[] files = new File(this.props.getProperty("ontologyInputDir")).listFiles(); - for (File file_in : files) { - String ext = FilenameUtils.getExtension(file_in.getAbsolutePath()); - if ("owl".equals(ext)) { - try { - loadxml(file_in.getAbsolutePath()); - getAllClass(); - } catch (JDOMException e1) { - e1.printStackTrace(); - } catch (IOException e1) { - e1.printStackTrace(); - } - - } - } - - try { - 
bw.close(); - } catch (IOException e) { - e.printStackTrace(); - } - return null; - } - - public Document document; - public Element rootNode = null; - final static String owl_namespace = "http://www.w3.org/2002/07/owl#"; - final static String rdf_namespace = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; - final static String rdfs_namespace = "http://www.w3.org/2000/01/rdf-schema#"; - - BufferedWriter bw = null; - - /** - * Load OWL file into memory - * - * @param filePathName local path of OWL file - * @throws JDOMException JDOMException - * @throws IOException IOException - */ - public void loadxml(String filePathName) throws JDOMException, IOException { - SAXBuilder saxBuilder = new SAXBuilder(); - File file = new File(filePathName); - - document = saxBuilder.build(file); - rootNode = document.getRootElement(); - } - - /** - * Method of going through OWL structure - */ - public void loopxml() { - Iterator<?> processDescendants = rootNode.getDescendants(new ElementFilter()); - String text = ""; - - while (processDescendants.hasNext()) { - Element e = (Element) processDescendants.next(); - String currentName = e.getName(); - text = e.getTextTrim(); - if ("".equals(text)) { - LOG.info(currentName); - } else { - LOG.info("{} : {}", currentName, text); - } - } - } - - /** - * Method of identifying a specific child given a element name - * - * @param str element name - * @param ele parent element - * @return the element of child - */ - public Element findChild(String str, Element ele) { - Iterator<?> processDescendants = ele.getDescendants(new ElementFilter()); - String name = ""; - Element result = null; - - while (processDescendants.hasNext()) { - Element e = (Element) processDescendants.next(); - name = e.getName(); - if (name.equals(str)) { - result = e; - return result; - } - } - return result; - - } - - /** - * Method of extract triples (subclassOf, equivalent class) from OWL file - * - * @throws IOException IOException - */ - public void getAllClass() throws 
IOException { - List<?> classElements = rootNode.getChildren("Class", Namespace.getNamespace("owl", owl_namespace)); - - for (int i = 0; i < classElements.size(); i++) { - Element classElement = (Element) classElements.get(i); - String className = classElement.getAttributeValue("about", Namespace.getNamespace("rdf", rdf_namespace)); - - if (className == null) { - className = classElement.getAttributeValue("ID", Namespace.getNamespace("rdf", rdf_namespace)); - } - - List<?> subclassElements = classElement.getChildren("subClassOf", Namespace.getNamespace("rdfs", rdfs_namespace)); - for (int j = 0; j < subclassElements.size(); j++) { - Element subclassElement = (Element) subclassElements.get(j); - String subclassName = subclassElement.getAttributeValue("resource", Namespace.getNamespace("rdf", rdf_namespace)); - if (subclassName == null) { - Element allValuesFromEle = findChild("allValuesFrom", subclassElement); - if (allValuesFromEle != null) { - subclassName = allValuesFromEle.getAttributeValue("resource", Namespace.getNamespace("rdf", rdf_namespace)); - bw.write(cutString(className) + ",SubClassOf," + cutString(subclassName) + "\n"); - } - } else { - bw.write(cutString(className) + ",SubClassOf," + cutString(subclassName) + "\n"); - } - - } - - List equalClassElements = classElement.getChildren("equivalentClass", Namespace.getNamespace("owl", owl_namespace)); - for (int k = 0; k < equalClassElements.size(); k++) { - Element equalClassElement = (Element) equalClassElements.get(k); - String equalClassElementName = equalClassElement.getAttributeValue("resource", Namespace.getNamespace("rdf", rdf_namespace)); - - if (equalClassElementName != null) { - bw.write(cutString(className) + ",equivalentClass," + cutString(equalClassElementName) + "\n"); - } - } - - } - } - - /** - * Method of cleaning up a string - * - * @param str String needed to be processed - * @return the processed string - */ - public String cutString(String str) { - str = str.substring(str.indexOf("#") 
+ 1); - String[] strArray = str.split("(?=[A-Z])"); - str = Arrays.toString(strArray); - return str.substring(1, str.length() - 1).replace(",", ""); - } - - @Override - public Object execute(Object o) { - return null; - } - -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/pre/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/pre/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/pre/package-info.java deleted file mode 100644 index 0570bc7..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/pre/package-info.java +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This package includes ontology pre-processing classes. 
- */ -package gov.nasa.jpl.mudrod.ontology.pre; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/EsipCOROntology.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/EsipCOROntology.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/EsipCOROntology.java deleted file mode 100644 index 6194197..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/EsipCOROntology.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package gov.nasa.jpl.mudrod.ontology.process; - -import gov.nasa.jpl.mudrod.ontology.Ontology; - -import java.util.Iterator; - -/** - * @author lewismc - */ -public class EsipCOROntology implements Ontology { - - /** - * - */ - public EsipCOROntology() { - //default constructor - } - - @Override - public void load() { - // to be completed - } - - /* (non-Javadoc) - * @see Ontology#load(java.lang.String[]) - */ - @Override - public void load(String[] urls) { - // to be completed - } - - /* (non-Javadoc) - * @see Ontology#merge(Ontology) - */ - @Override - public void merge(Ontology o) { - // to be completed - } - - /* (non-Javadoc) - * @see Ontology#subclasses(java.lang.String) - */ - @Override - public Iterator<String> subclasses(String entitySearchTerm) { - return null; - } - - /* (non-Javadoc) - * @see Ontology#synonyms(java.lang.String) - */ - @Override - public Iterator<String> synonyms(String queryKeyPhrase) { - return null; - } - -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/EsipPortalOntology.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/EsipPortalOntology.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/EsipPortalOntology.java deleted file mode 100644 index 9c4888b..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/EsipPortalOntology.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.ontology.process; - -import gov.nasa.jpl.mudrod.ontology.Ontology; - -import java.util.Iterator; - -/** - * @author lewismc - */ -public class EsipPortalOntology implements Ontology { - - /** - * - */ - public EsipPortalOntology() { - //default constructor - } - - /* (non-Javadoc) - * @see Ontology#load(java.lang.String[]) - */ - @Override - public void load(String[] urls) { - // to be completed - } - - /* (non-Javadoc) - * @see Ontology#load() - */ - @Override - public void load() { - // to be completed - } - - /* (non-Javadoc) - * @see Ontology#merge(Ontology) - */ - @Override - public void merge(Ontology o) { - // to be completed - } - - /* (non-Javadoc) - * @see Ontology#subclasses(java.lang.String) - */ - @Override - public Iterator<String> subclasses(String entitySearchTerm) { - return null; - } - - /* (non-Javadoc) - * @see Ontology#synonyms(java.lang.String) - */ - @Override - public Iterator<String> synonyms(String queryKeyPhrase) { - return null; - } - -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/LocalOntology.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/LocalOntology.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/LocalOntology.java deleted file mode 100644 index 55ca51d..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/LocalOntology.java +++ /dev/null @@ -1,392 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.ontology.process; - -import gov.nasa.jpl.mudrod.ontology.Ontology; - -import org.apache.jena.ontology.Individual; -import org.apache.jena.ontology.OntClass; -import org.apache.jena.ontology.OntModel; -import org.apache.jena.ontology.OntModelSpec; -import org.apache.jena.ontology.OntResource; -import org.apache.jena.ontology.Restriction; -import org.apache.jena.rdf.model.AnonId; -import org.apache.jena.rdf.model.Literal; -import org.apache.jena.rdf.model.ModelFactory; -import org.apache.jena.rdf.model.Resource; -import org.apache.jena.shared.PrefixMapping; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.PrintStream; -import java.net.MalformedURLException; -import java.net.URISyntaxException; -import java.net.URL; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -/** - * The LocalOntology implementation enables us to work with Ontology files - * whcih are cached locally and available on the runtime classpath e.g. - * in <code>src/main/resource/ontology/...</code>. - * From here we can test and iterate on how use of ontology can enhance search. 
- */ -public class LocalOntology implements Ontology { - - public static final Logger LOG = LoggerFactory.getLogger(LocalOntology.class); - - public static final String DELIMITER_SEARCHTERM = " "; - - private Map<Object, Object> searchTerms = new HashMap<>(); - private static OntologyParser parser; - private static OntModel ontologyModel; - private Ontology ontology; - private static Map<AnonId, String> mAnonIDs = new HashMap<>(); - private static int mAnonCount = 0; - private List<String> ontArrayList; - - public LocalOntology() { - //only initialize all the static variables - //if first time called to this ontology constructor - if (ontology == null) { - if (LOG.isInfoEnabled()) { - LOG.info("Creating new ontology"); - } - parser = new OwlParser(); - ontology = this; - } - if (ontologyModel == null) - ontologyModel = ModelFactory.createOntologyModel(OntModelSpec.OWL_MEM, null); - load(); - } - - /** - * Static accessor for {@link LocalOntology} - * instance implementation defined within <code>config.xml</code>. 
- * - * @return a {@link LocalOntology} - */ - public Ontology getInstance() { - if (ontology == null) { - ontology = new LocalOntology(); - } - return ontology; - } - - /** - * Load the default <i>sweetAll.owl</i> ontology - * from <a href="https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/sweetAll.owl"> - * https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/sweetAll.owl</a> - */ - @Override - public void load() { - URL ontURL = null; - try { - ontURL = new URL("https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/sweetAll.owl"); - //ontURL = new URL("https://raw.githubusercontent.com/ESIPFed/sweet/master/2.4/reprDataProduct.owl"); - } catch (MalformedURLException e) { - LOG.error("Error when attempting to create URL resource: ", e); - } - ontArrayList = new ArrayList<>(); - try { - ontArrayList.add(ontURL.toURI().toString()); - } catch (URISyntaxException e) { - LOG.error("Error in URL syntax, please check your Ontology resource: ", e); - } - if (!ontArrayList.isEmpty()) { - load(ontArrayList.stream().toArray(String[]::new)); - } - } - - /** - * Load a string array of local URIs which refernece .owl files. - */ - @Override - public void load(String[] urls) { - for (int i = 0; i < urls.length; i++) { - String url = urls[i].trim(); - if (!"".equals(url)) - if (LOG.isInfoEnabled()) { - LOG.info("Reading and processing {}", url); - } - load(ontologyModel, url); - } - parser.parse(ontology, ontologyModel); - } - - private void load(Object m, String url) { - try { - ((OntModel) m).read(url, null, null); - LOG.info("Successfully processed {}", url); - } catch (Exception e) { - LOG.error("Failed whilst attempting to read ontology {}: Error: ", url, e); - } - } - - /** - * Get the {@link gov.nasa.jpl.mudrod.ontology.process.OntologyParser} - * implementation being used to process the input ontology resources. 
- * @return an {@link gov.nasa.jpl.mudrod.ontology.process.OntologyParser} implementation - */ - public OntologyParser getParser() { - if (parser == null) { - parser = new OwlParser(); - } - return parser; - } - - /** - * Return the {@link org.apache.jena.ontology.OntModel} instance - * which created from input ontology resources. - * @return a constructed {@link org.apache.jena.ontology.OntModel} - */ - public static OntModel getModel() { - return ontologyModel; - } - - /** - * Return the loaded Ontology resources. - * @return a {@link java.util.List} of resources. - */ - public List<String> getLoadedOntologyResources() { - if (ontArrayList != null) { - return ontArrayList; - } else { - return new ArrayList<>(); - } - } - /** - * Not yet implemented. - */ - @Override - public void merge(Ontology o) { - // not yet implemented - } - - /** - * Retrieve all subclasses of entity(ies) hashed to searchTerm - * @param entitySearchTerm a query (keywords) for which to obtain - * subclasses. - * @return an {@link java.util.Iterator} containing the subclass as Strings. - */ - @Override - public Iterator<String> subclasses(String entitySearchTerm) { - Map<OntResource, String> classMap = retrieve(entitySearchTerm); - Map<String, String> subclasses = new HashMap<>(); - - Iterator<OntResource> iter = classMap.keySet().iterator(); - while (iter.hasNext()) { - OntResource resource = iter.next(); - - if (resource instanceof OntClass) { - //get subclasses N.B. we only get direct sub-classes e.g. direct children - //it is possible for us to navigate the entire class tree if we wish, we simply - //need to pass the .listSubClasses(true) boolean parameter. 
- for (Iterator<?> i = ((OntClass) resource).listSubClasses(); i.hasNext();) { - OntResource subclass = (OntResource) i.next(); - for (Iterator<?> j = subclass.listLabels(null); j.hasNext();) { - Literal l = (Literal) j.next(); - subclasses.put(l.toString(), "1"); - } - } - //get individuals - for (Iterator<?> i = ((OntClass) resource).listInstances(); i.hasNext(); ) { - OntResource subclass = (OntResource) i.next(); - for (Iterator<?> j = subclass.listLabels(null); j.hasNext();) { - Literal l = (Literal) j.next(); - subclasses.put(l.toString(), "1"); - } - } - } else if (resource instanceof Individual) { - for (Iterator<?> i = resource.listSameAs(); i.hasNext();) { - OntResource subclass = (OntResource) i.next(); - for (Iterator<?> j = subclass.listLabels(null); j.hasNext();) { - Literal l = (Literal) j.next(); - subclasses.put(l.toString(), "1"); - } - } - } - } - return subclasses.keySet().iterator(); - } - - /** - * Retreives synonyms for an given phrase if the phrase - * is present in the ontology - * @param queryKeyPhrase an input string representing a phrase - * for which we wish to obtain synonyms. - * @return an {@link java.util.Iterator} containing synonyms string tokens - * or an empty if no synonyms exist for the given queryKeyPhrase. 
- */ - @Override - public Iterator synonyms(String queryKeyPhrase) { - - Map<?, ?> classMap = retrieve(queryKeyPhrase); - - Map<Object, Object> synonyms = new HashMap<>(); - - Iterator<?> iter = classMap.keySet().iterator(); - while (iter.hasNext()) { - OntResource resource = (OntResource) iter.next(); - - //listLabels - for (Iterator<?> i = resource.listLabels(null); i.hasNext();) { - Literal l = (Literal) i.next(); - synonyms.put(l.toString(), "1"); - } - - if (resource instanceof Individual) { - //get all individuals same as this one - for (Iterator<?> i = resource.listSameAs(); i.hasNext();) { - Individual individual = (Individual) i.next(); - //add labels - for (Iterator<?> j = individual.listLabels(null); j.hasNext();) { - Literal l = (Literal) i.next(); - synonyms.put(l.toString(), "1"); - } - } - } else if (resource instanceof OntClass) { - //list equivalent classes - for (Iterator<?> i = ((OntClass) resource).listEquivalentClasses(); i.hasNext();) { - OntClass equivClass = (OntClass) i.next(); - //add labels - for (Iterator<?> j = equivClass.listLabels(null); j.hasNext();) { - Literal l = (Literal) j.next(); - synonyms.put(l.toString(), "1"); - } - } - } - } - - return synonyms.keySet().iterator(); - } - - public void addSearchTerm(String label, OntResource resource) { - Map<OntResource, String> m = retrieve(label); - if (m == null) { - m = new HashMap<>(); - } - m.put(resource, "1"); - searchTerms.put(label.toLowerCase(), m); - } - - /** - * A basic lookup function for retrieving keys (phrases or tokens) - * from the ontology search terms map. Right now only exact lookups - * will retrieve a result... this could be improved by using some - * advanced parsing logic... such as Lucene query parser. - * @param label the label (phrases or tokens) to retrieve from the - * ontology search terms map. - * @return an {@link java.util.Map} if there are match(es) - * or an empty {@link java.util.HashMap} if there are no - * matches. 
- */ - public Map<OntResource, String> retrieve(String label) { - @SuppressWarnings("unchecked") - Map<OntResource, String> m = (Map<OntResource, String>) searchTerms.get(label.toLowerCase()); - if (m == null) { - m = new HashMap<>(); - } - return m; - } - - protected static void renderHierarchy(PrintStream out, OntClass cls, List<Object> occurs, int depth) { - renderClassDescription(out, cls, depth); - out.println(); - - // recurse to the next level down - if (cls.canAs(OntClass.class) && !occurs.contains(cls)) { - for (Iterator<?> i = cls.listSubClasses(true); i.hasNext(); ) { - OntClass sub = (OntClass) i.next(); - - // we push this expression on the occurs list before we recurse - occurs.add(cls); - renderHierarchy(out, sub, occurs, depth + 1); - occurs.remove(cls); - } - for (Iterator<?> i = cls.listInstances(); i.hasNext(); ) { - Individual individual = (Individual) i.next(); - renderURI(out, individual.getModel(), individual.getURI()); - out.print(" ["); - for (Iterator<?> j = individual.listLabels(null); j.hasNext(); ) { - out.print(((Literal) j.next()).getString() + ", "); - } - out.print("] "); - out.println(); - } - } - } - - public static void renderClassDescription(PrintStream out, OntClass c, int depth) { - indent(out, depth); - - if (c.isRestriction()) { - renderRestriction(out, (Restriction) c.as(Restriction.class)); - } else { - if (!c.isAnon()) { - out.print("Class "); - renderURI(out, c.getModel(), c.getURI()); - - out.print(c.getLocalName()); - - out.print(" ["); - for (Iterator<?> i = c.listLabels(null); i.hasNext(); ) { - out.print(((Literal) i.next()).getString() + ", "); - } - out.print("] "); - } else { - renderAnonymous(out, c, "class"); - } - } - } - - protected static void renderRestriction(PrintStream out, Restriction r) { - if (!r.isAnon()) { - out.print("Restriction "); - renderURI(out, r.getModel(), r.getURI()); - } else { - renderAnonymous(out, r, "restriction"); - } - - out.print(" on property "); - renderURI(out, r.getModel(), 
r.getOnProperty().getURI()); - } - - protected static void renderURI(PrintStream out, PrefixMapping prefixes, String uri) { - out.print(prefixes.expandPrefix(uri)); - } - - protected static void renderAnonymous(PrintStream out, Resource anon, String name) { - String anonID = mAnonIDs.get(anon.getId()); - if (anonID == null) { - anonID = "a-" + mAnonCount++; - mAnonIDs.put(anon.getId(), anonID); - } - - out.print("Anonymous "); - out.print(name); - out.print(" with ID "); - out.print(anonID); - } - - protected static void indent(PrintStream out, int depth) { - for (int i = 0; i < depth; i++) { - out.print(" "); - } - } - -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyLinkCal.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyLinkCal.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyLinkCal.java deleted file mode 100644 index a68a0cb..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyLinkCal.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package gov.nasa.jpl.mudrod.ontology.process; - -import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import org.elasticsearch.action.index.IndexRequest; -import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.index.query.QueryBuilders; - -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.IOException; -import java.util.Properties; -import java.util.concurrent.ExecutionException; - -import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; - -/** - * Supports ability to parse and process FTP and HTTP log files - */ -public class OntologyLinkCal extends DiscoveryStepAbstract { - - public OntologyLinkCal(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - es.deleteAllByQuery(props.getProperty("indexName"), props.getProperty("ontologyLinkageType"), QueryBuilders.matchAllQuery()); - addSWEETMapping(); - } - - /** - * Method of adding mapping for triples extracted from SWEET - */ - public void addSWEETMapping() { - XContentBuilder Mapping; - try { - Mapping = jsonBuilder().startObject().startObject(props.getProperty("ontologyLinkageType")).startObject("properties").startObject("concept_A").field("type", "string") - .field("index", "not_analyzed").endObject().startObject("concept_B").field("type", "string").field("index", "not_analyzed").endObject() - - .endObject().endObject().endObject(); - - es.getClient().admin().indices().preparePutMapping(props.getProperty("indexName")).setType(props.getProperty("ontologyLinkageType")).setSource(Mapping).execute().actionGet(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - /** - * Method of calculating and importing SWEET triples into Elasticsearch - */ - @Override - public Object execute() { - es.deleteType(props.getProperty("indexName"), props.getProperty("ontologyLinkageType")); - 
es.createBulkProcessor(); - - BufferedReader br = null; - String line = ""; - double weight = 0; - - try { - br = new BufferedReader(new FileReader(props.getProperty("oceanTriples"))); - while ((line = br.readLine()) != null) { - String[] strList = line.toLowerCase().split(","); - if (strList[1].equals("subclassof")) { - weight = 0.75; - } else { - weight = 0.9; - } - - IndexRequest ir = new IndexRequest(props.getProperty("indexName"), props.getProperty("ontologyLinkageType")).source( - jsonBuilder().startObject().field("concept_A", es.customAnalyzing(props.getProperty("indexName"), strList[2])) - .field("concept_B", es.customAnalyzing(props.getProperty("indexName"), strList[0])).field("weight", weight).endObject()); - es.getBulkProcessor().add(ir); - - } - - } catch (IOException e) { - e.printStackTrace(); - } catch (InterruptedException e) { - e.printStackTrace(); - } catch (ExecutionException e) { - e.printStackTrace(); - } finally { - if (br != null) { - try { - br.close(); - es.destroyBulkProcessor(); - es.refreshIndex(); - } catch (IOException e) { - e.printStackTrace(); - } - } - } - return null; - } - - @Override - public Object execute(Object o) { - return null; - } - -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyParser.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyParser.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyParser.java deleted file mode 100644 index eca6252..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OntologyParser.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.ontology.process; - -import org.apache.jena.ontology.OntClass; -import org.apache.jena.ontology.OntModel; - -import gov.nasa.jpl.mudrod.ontology.Ontology; - -import java.util.Iterator; - -/** - * Interface for specific ontology parsers e.g. .ttl, RDFXML, - * etc. - */ -public interface OntologyParser { - - /** - * An ontology model (RDF graph) to parse for literals. - * - * @param ont the associated {@link gov.nasa.jpl.mudrod.ontology.Ontology} - * implementation processing the ontology operation(s). - * @param ontModel the {@link org.apache.jena.ontology.OntModel} - */ - public void parse(Ontology ont, OntModel ontModel); - - /** - * An ontology model (RDF graph) for which to obtain an - * {@link java.util.Iterator} instance of all root classes. - * - * @param ontModel the {@link org.apache.jena.ontology.OntModel} - * @return an {@link java.util.Iterator} instance containing all root classes. 
- */ - public Iterator<OntClass> rootClasses(OntModel ontModel); - -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OwlParser.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OwlParser.java b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OwlParser.java deleted file mode 100644 index e43f04d..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/OwlParser.java +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.ontology.process; - -import org.apache.jena.ontology.Individual; -import org.apache.jena.ontology.OntClass; -import org.apache.jena.ontology.OntModel; -import org.apache.jena.rdf.model.Literal; - -import com.esotericsoftware.minlog.Log; - -import gov.nasa.jpl.mudrod.ontology.Ontology; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * {@link gov.nasa.jpl.mudrod.ontology.process.OntologyParser} - * implementation for <a href="http://www.w3.org/TR/owl-features/">W3C OWL</a> - * files. 
- */ -public class OwlParser implements OntologyParser { - - private Ontology ont; - private List<OntClass> roots = new ArrayList<>(); - - public OwlParser() { - //default constructor - } - - /** - * Parse OWL ontology files using Apache Jena - */ - @Override - public void parse(Ontology ont, OntModel m) { - this.ont = ont; - for (Iterator<OntClass> i = rootClasses(m); i.hasNext(); ) { - OntClass c = i.next(); - - //dont deal with anonymous classes - if (c.isAnon()) { - continue; - } - - parseClass(c, new ArrayList<>(), 0); - } - } - - protected void parseClass(OntClass cls, List<Object> occurs, int depth) { - //dont deal with anonymous classes - if (cls.isAnon()) { - return; - } - - //add cls to Ontology searchterms - //list labels - Iterator<?> labelIter = cls.listLabels(null); - //if has no labels - if (!labelIter.hasNext()) { - //add rdf:ID as a label - cls.addLabel(rdfidToLabel(cls.getLocalName()), null); - } - //reset the label iterator - labelIter = cls.listLabels(null); - - while (labelIter.hasNext()) { - Literal l = (Literal) labelIter.next(); - ((LocalOntology) ont).addSearchTerm(l.toString(), cls); - } - - // recurse to the next level down - if (cls.canAs(OntClass.class) && !occurs.contains(cls)) { - //list subclasses - for (Iterator<?> i = cls.listSubClasses(true); i.hasNext(); ) { - OntClass sub = (OntClass) i.next(); - - // we push this expression on the occurs list before we recurse - occurs.add(cls); - parseClass(sub, occurs, depth + 1); - occurs.remove(cls); - } - - //list instances - for (Iterator<?> i = cls.listInstances(); i.hasNext(); ) { - //add search terms for each instance - - //list labels - Individual individual = (Individual) i.next(); - for (Iterator<?> j = individual.listLabels(null); j.hasNext(); ) { - Literal l = (Literal) j.next(); - ((LocalOntology) ont).addSearchTerm(l.toString(), individual); - } - } - } - } - - /** - * Parses out all root classes of the given - * {@link org.apache.jena.ontology.OntModel} - * @param m the {@link 
org.apache.jena.ontology.OntModel} we wish to obtain - * all root classes for. - * @return an {@link java.util.Iterator} of {@link org.apache.jena.ontology.OntClass} - * elements representing all root classes. - */ - @Override - public Iterator<OntClass> rootClasses(OntModel m) { - Iterator<?> i = m.listClasses(); - if (i.hasNext() && i.next() instanceof OntClass) { - //assume ontology has root classes - processSingle(m); - } else { - //check for presence of aggregate/collection ontologies such as sweetAll.owl - processCollection(m); - } - - return roots.iterator(); - } - - private void processSingle(OntModel m) { - for (Iterator<?> i = m.listClasses(); i.hasNext(); ) { - OntClass c = (OntClass) i.next(); - try { - // too confusing to list all the restrictions as root classes - if (c.isAnon()) { - continue; - } - - if (c.hasSuperClass(m.getProfile().THING(), true) || c.getCardinality(m.getProfile().SUB_CLASS_OF()) == 0) { - // this class is directly descended from Thing - roots.add(c); - } - } catch (Exception e) { - Log.error("Error during extraction or root Classes from Ontology Model: ", e); - } - } - } - - private void processCollection(OntModel m) { - for (Iterator<?> i = m.listSubModels(true); i.hasNext(); ) { - OntModel ontModel = (OntModel) i.next(); - processSingle(ontModel); - } - } - - public String rdfidToLabel(String idString) { - Pattern p = Pattern.compile("([a-z0-9])([A-Z])"); - Matcher m = p.matcher(idString); - - String labelString = idString; - while (m.find()) { - labelString = labelString.replaceAll(m.group(1) + m.group(2), m.group(1) + " " + m.group(2)); - } - return labelString; - } - -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/package-info.java 
b/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/package-info.java deleted file mode 100644 index 3447426..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ontology/process/package-info.java +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This package includes ontology processing classes. - */ -package gov.nasa.jpl.mudrod.ontology.process; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/package-info.java deleted file mode 100644 index 1e5d8bf..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/package-info.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This package includes the preprocessing, processing, and data structure used - * by recommendation module. - */ -package gov.nasa.jpl.mudrod.recommendation; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/ImportMetadata.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/ImportMetadata.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/ImportMetadata.java deleted file mode 100644 index c174f31..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/ImportMetadata.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package gov.nasa.jpl.mudrod.recommendation.pre; - -import com.google.gson.JsonElement; -import com.google.gson.JsonParser; -import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import gov.nasa.jpl.mudrod.main.MudrodConstants; -import gov.nasa.jpl.mudrod.metadata.pre.ApiHarvester; -import org.apache.commons.io.IOUtils; -import org.elasticsearch.action.index.IndexRequest; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.*; -import java.util.Properties; - -/** - * ClassName: Import Metadata to elasticsearch - */ - -public class ImportMetadata extends DiscoveryStepAbstract { - - /** - * - */ - private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(ApiHarvester.class); - - public ImportMetadata(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - } - - @Override - public Object execute() { - LOG.info("Starting Metadata Harvesting"); - startTime = System.currentTimeMillis(); - addMetadataMapping(); - importToES(); - endTime = System.currentTimeMillis(); - es.refreshIndex(); - LOG.info("Finished Metadata Harvesting time elapsed: {}s", (endTime - startTime) / 1000); - return null; - } - - /** - * addMetadataMapping: Add mapping to index metadata in Elasticsearch. Please - * invoke this method before import metadata to Elasticsearch. 
- */ - public void addMetadataMapping() { - String mappingJson = "{\r\n \"dynamic_templates\": " + "[\r\n " + "{\r\n \"strings\": " + "{\r\n \"match_mapping_type\": \"string\"," - + "\r\n \"mapping\": {\r\n \"type\": \"string\"," + "\r\n \"analyzer\": \"csv\"\r\n }" + "\r\n }\r\n }\r\n ]\r\n}"; - - es.getClient().admin().indices().preparePutMapping(props.getProperty(MudrodConstants.ES_INDEX_NAME)).setType(props.getProperty("recom_metadataType")).setSource(mappingJson).execute().actionGet(); - - } - - /** - * importToES: Index metadata into elasticsearch from local file directory. - * Please make sure metadata have been harvest from web service before - * invoking this method. - */ - private void importToES() { - es.deleteType(props.getProperty("indexName"), props.getProperty("recom_metadataType")); - - es.createBulkProcessor(); - File directory = new File(props.getProperty(MudrodConstants.RAW_METADATA_PATH)); - File[] fList = directory.listFiles(); - for (File file : fList) { - InputStream is; - try { - is = new FileInputStream(file); - try { - String jsonTxt = IOUtils.toString(is); - JsonParser parser = new JsonParser(); - JsonElement item = parser.parse(jsonTxt); - IndexRequest ir = new IndexRequest(props.getProperty(MudrodConstants.ES_INDEX_NAME), props.getProperty("recom_metadataType")).source(item.toString()); - - // preprocessdata - - es.getBulkProcessor().add(ir); - } catch (IOException e) { - e.printStackTrace(); - } - } catch (FileNotFoundException e) { - e.printStackTrace(); - } - - } - - es.destroyBulkProcessor(); - } - - @Override - public Object execute(Object o) { - return null; - } -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/MetadataTFIDFGenerator.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/MetadataTFIDFGenerator.java 
b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/MetadataTFIDFGenerator.java deleted file mode 100644 index 02c74f0..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/MetadataTFIDFGenerator.java +++ /dev/null @@ -1,100 +0,0 @@ -/** - * Project Name:mudrod-core - * File Name:TFIDFGenerator.java - * Package Name:gov.nasa.jpl.mudrod.recommendation.pre - * Date:Aug 22, 201612:39:52 PM - * Copyright (c) 2016, [email protected] All Rights Reserved. - */ - -package gov.nasa.jpl.mudrod.recommendation.pre; - -import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import gov.nasa.jpl.mudrod.recommendation.structure.MetadataOpt; -import gov.nasa.jpl.mudrod.utils.LabeledRowMatrix; -import gov.nasa.jpl.mudrod.utils.MatrixUtil; -import org.apache.spark.api.java.JavaPairRDD; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; - -/** - * ClassName: Generate TFIDF information of all metadata - */ -public class MetadataTFIDFGenerator extends DiscoveryStepAbstract { - - private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(MetadataTFIDFGenerator.class); - - /** - * Creates a new instance of MatrixGenerator. 
- * - * @param props the Mudrod configuration - * @param es the Elasticsearch drive - * @param spark the spark drive - */ - public MetadataTFIDFGenerator(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - } - - @Override - public Object execute() { - - LOG.info("Starting Dataset TF_IDF Matrix Generator"); - startTime = System.currentTimeMillis(); - try { - generateWordBasedTFIDF(); - } catch (Exception e) { - LOG.error("Error during Dataset TF_IDF Matrix Generation: {}", e); - } - endTime = System.currentTimeMillis(); - - LOG.info("Dataset TF_IDF Matrix Generation complete, time elaspsed: {}s", (endTime - startTime) / 1000); - - return null; - } - - @Override - public Object execute(Object o) { - return null; - } - - public LabeledRowMatrix generateWordBasedTFIDF() throws Exception { - - MetadataOpt opt = new MetadataOpt(props); - - JavaPairRDD<String, String> metadataContents = opt.loadAll(es, spark); - - JavaPairRDD<String, List<String>> metadataWords = opt.tokenizeData(metadataContents, " "); - - LabeledRowMatrix wordtfidfMatrix = opt.tFIDFTokens(metadataWords, spark); - - MatrixUtil.exportToCSV(wordtfidfMatrix.rowMatrix, wordtfidfMatrix.rowkeys, wordtfidfMatrix.colkeys, props.getProperty("metadata_word_tfidf_matrix")); - - return wordtfidfMatrix; - } - - public LabeledRowMatrix generateTermBasedTFIDF() throws Exception { - - MetadataOpt opt = new MetadataOpt(props); - - List<String> variables = new ArrayList<>(); - variables.add("DatasetParameter-Term"); - variables.add("DatasetParameter-Variable"); - variables.add("Dataset-ExtractTerm"); - - JavaPairRDD<String, String> metadataContents = opt.loadAll(es, spark, variables); - - JavaPairRDD<String, List<String>> metadataTokens = opt.tokenizeData(metadataContents, ","); - - LabeledRowMatrix tokentfidfMatrix = opt.tFIDFTokens(metadataTokens, spark); - - MatrixUtil.exportToCSV(tokentfidfMatrix.rowMatrix, tokentfidfMatrix.rowkeys, tokentfidfMatrix.colkeys, 
props.getProperty("metadata_term_tfidf_matrix")); - - return tokentfidfMatrix; - } -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/NormalizeVariables.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/NormalizeVariables.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/NormalizeVariables.java deleted file mode 100644 index f5eaa9c..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/NormalizeVariables.java +++ /dev/null @@ -1,223 +0,0 @@ -package gov.nasa.jpl.mudrod.recommendation.pre; - -import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.action.update.UpdateRequest; -import org.elasticsearch.common.unit.TimeValue; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.HashMap; -import java.util.Map; -import java.util.Properties; -import java.util.regex.Pattern; - -public class NormalizeVariables extends DiscoveryStepAbstract { - - /** - * - */ - private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(NormalizeVariables.class); - // index name - private String indexName; - // type name of metadata in ES - private String metadataType; - - /** - * Creates a new instance of OHEncoder. 
- * - * @param props the Mudrod configuration - * @param es an instantiated {@link ESDriver} - * @param spark an instantiated {@link SparkDriver} - */ - public NormalizeVariables(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - indexName = props.getProperty("indexName"); - metadataType = props.getProperty("recom_metadataType"); - } - - @Override - public Object execute() { - LOG.info("*****************processing metadata variables starts******************"); - startTime = System.currentTimeMillis(); - - normalizeMetadataVariables(es); - - endTime = System.currentTimeMillis(); - LOG.info("*****************processing metadata variables ends******************Took {}s", (endTime - startTime) / 1000); - - return null; - } - - @Override - public Object execute(Object o) { - return null; - } - - public void normalizeMetadataVariables(ESDriver es) { - - es.createBulkProcessor(); - - SearchResponse scrollResp = es.getClient().prepareSearch(indexName).setTypes(metadataType).setScroll(new TimeValue(60000)).setQuery(QueryBuilders.matchAllQuery()).setSize(100).execute() - .actionGet(); - while (true) { - for (SearchHit hit : scrollResp.getHits().getHits()) { - Map<String, Object> metadata = hit.getSource(); - Map<String, Object> updatedValues = new HashMap<>(); - - this.normalizeSpatialVariables(metadata, updatedValues); - this.normalizeTemporalVariables(metadata, updatedValues); - this.normalizeOtherVariables(metadata, updatedValues); - - UpdateRequest ur = es.generateUpdateRequest(indexName, metadataType, hit.getId(), updatedValues); - es.getBulkProcessor().add(ur); - } - - scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet(); - if (scrollResp.getHits().getHits().length == 0) { - break; - } - } - - es.destroyBulkProcessor(); - } - - private void normalizeOtherVariables(Map<String, Object> metadata, Map<String, Object> updatedValues) { - String shortname = (String) 
metadata.get("Dataset-ShortName"); - double versionNUm = getVersionNum(shortname); - updatedValues.put("Dataset-Derivative-VersionNum", versionNUm); - - } - - private Double getVersionNum(String version) { - if (version == null) { - return 0.0; - } - Double versionNum = 0.0; - Pattern p = Pattern.compile(".*[a-zA-Z].*"); - if ("Operational/Near-Real-Time".equals(version)) { - versionNum = 2.0; - } else if (version.matches("[0-9]{1}[a-zA-Z]{1}")) { - versionNum = Double.parseDouble(version.substring(0, 1)); - } else if (p.matcher(version).find()) { - versionNum = 0.0; - } else { - versionNum = Double.parseDouble(version); - if (versionNum >= 5) { - versionNum = 20.0; - } - } - return versionNum; - } - - private void normalizeSpatialVariables(Map<String, Object> metadata, Map<String, Object> updatedValues) { - - // get spatial resolution - Double spatialR; - if (metadata.get("Dataset-SatelliteSpatialResolution") != null) { - spatialR = (Double) metadata.get("Dataset-SatelliteSpatialResolution"); - } else { - Double gridR = (Double) metadata.get("Dataset-GridSpatialResolution"); - if (gridR != null) { - spatialR = 111 * gridR; - } else { - spatialR = 25.0; - } - } - updatedValues.put("Dataset-Derivative-SpatialResolution", spatialR); - - // Transform Longitude and calculate coverage area - double top = parseDouble((String) metadata.get("DatasetCoverage-NorthLat")); - double bottom = parseDouble((String) metadata.get("DatasetCoverage-SouthLat")); - double left = parseDouble((String) metadata.get("DatasetCoverage-WestLon")); - double right = parseDouble((String) metadata.get("DatasetCoverage-EastLon")); - - if (left > 180) { - left = left - 360; - } - - if (right > 180) { - right = right - 360; - } - - if (left == right) { - left = -180; - right = 180; - } - - double area = (top - bottom) * (right - left); - - updatedValues.put("DatasetCoverage-Derivative-EastLon", right); - updatedValues.put("DatasetCoverage-Derivative-WestLon", left); - 
updatedValues.put("DatasetCoverage-Derivative-NorthLat", top); - updatedValues.put("DatasetCoverage-Derivative-SouthLat", bottom); - updatedValues.put("DatasetCoverage-Derivative-Area", area); - - // get processing level - String processingLevel = (String) metadata.get("Dataset-ProcessingLevel"); - double dProLevel = this.getProLevelNum(processingLevel); - updatedValues.put("Dataset-Derivative-ProcessingLevel", dProLevel); - } - - private void normalizeTemporalVariables(Map<String, Object> metadata, Map<String, Object> updatedValues) { - - String trStr = (String) metadata.get("Dataset-TemporalResolution"); - if ("".equals(trStr)) { - trStr = (String) metadata.get("Dataset-TemporalRepeat"); - } - - updatedValues.put("Dataset-Derivative-TemporalResolution", covertTimeUnit(trStr)); - } - - private Double covertTimeUnit(String str) { - Double timeInHour; - if (str.contains("Hour")) { - timeInHour = Double.parseDouble(str.split(" ")[0]); - } else if (str.contains("Day")) { - timeInHour = Double.parseDouble(str.split(" ")[0]) * 24; - } else if (str.contains("Week")) { - timeInHour = Double.parseDouble(str.split(" ")[0]) * 24 * 7; - } else if (str.contains("Month")) { - timeInHour = Double.parseDouble(str.split(" ")[0]) * 24 * 7 * 30; - } else if (str.contains("Year")) { - timeInHour = Double.parseDouble(str.split(" ")[0]) * 24 * 7 * 30 * 365; - } else { - timeInHour = 0.0; - } - - return timeInHour; - } - - public Double getProLevelNum(String pro) { - if (pro == null) { - return 1.0; - } - Double proNum = 0.0; - Pattern p = Pattern.compile(".*[a-zA-Z].*"); - if (pro.matches("[0-9]{1}[a-zA-Z]{1}")) { - proNum = Double.parseDouble(pro.substring(0, 1)); - } else if (p.matcher(pro).find()) { - proNum = 1.0; - } else { - proNum = Double.parseDouble(pro); - } - - return proNum; - } - - private double parseDouble(String strNumber) { - if (strNumber != null && strNumber.length() > 0) { - try { - return Double.parseDouble(strNumber); - } catch (Exception e) { - return -1; - } - } 
else - return 0; - } -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/SessionCooccurence.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/SessionCooccurence.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/SessionCooccurence.java deleted file mode 100644 index 2aecce3..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/SessionCooccurence.java +++ /dev/null @@ -1,152 +0,0 @@ -/** - * Project Name:mudrod-core - * File Name:SessionCooccurenceMatrix.java - * Package Name:gov.nasa.jpl.mudrod.recommendation.pre - * Date:Aug 19, 20163:06:33 PM - * Copyright (c) 2016, [email protected] All Rights Reserved. - */ - -package gov.nasa.jpl.mudrod.recommendation.pre; - -import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import gov.nasa.jpl.mudrod.main.MudrodConstants; -import gov.nasa.jpl.mudrod.utils.LabeledRowMatrix; -import gov.nasa.jpl.mudrod.utils.MatrixUtil; -import gov.nasa.jpl.mudrod.weblog.structure.SessionExtractor; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.function.PairFunction; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.common.unit.TimeValue; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import scala.Tuple2; - -import java.util.*; - -/** - * ClassName: SessionCooccurenceMatrix Function: Generate metadata session - * coocucurence matrix from web logs. Each row in the matrix is corresponding to - * a metadata, and each column is a session. 
- */ -public class SessionCooccurence extends DiscoveryStepAbstract { - - private static final long serialVersionUID = 1L; - private static final Logger LOG = LoggerFactory.getLogger(SessionCooccurence.class); - - /** - * Creates a new instance of SessionCooccurence. - * - * @param props - * the Mudrod configuration - * @param es - * the Elasticsearch drive - * @param spark - * the spark driver - */ - public SessionCooccurence(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - } - - @Override - public Object execute() { - - LOG.info("Starting dataset session-based similarity generation..."); - - startTime = System.currentTimeMillis(); - - // get all metadata session cooccurance data - SessionExtractor extractor = new SessionExtractor(); - JavaPairRDD<String, List<String>> sessionDatasetRDD = extractor.bulidSessionDatasetRDD(props, es, spark); - - // remove retired datasets - JavaPairRDD<String, List<String>> sessionFiltedDatasetsRDD = removeRetiredDataset(es, sessionDatasetRDD); - LabeledRowMatrix datasetSessionMatrix = MatrixUtil.createWordDocMatrix(sessionFiltedDatasetsRDD); - - // export - MatrixUtil.exportToCSV(datasetSessionMatrix.rowMatrix, datasetSessionMatrix.rowkeys, datasetSessionMatrix.colkeys, props.getProperty("session_metadata_Matrix")); - - endTime = System.currentTimeMillis(); - - LOG.info("Completed dataset session-based similarity generation. 
Time elapsed: {}s", (endTime - startTime) / 1000); - - return null; - } - - @Override - public Object execute(Object o) { - return null; - } - - /** - * filter out-of-data metadata - * - * @param es - * the Elasticsearch drive - * @param userDatasetsRDD - * dataset extracted from session - * @return filtered session datasets - */ - public JavaPairRDD<String, List<String>> removeRetiredDataset(ESDriver es, JavaPairRDD<String, List<String>> userDatasetsRDD) { - - Map<String, String> nameMap = this.getOnServiceMetadata(es); - - return userDatasetsRDD.mapToPair(new PairFunction<Tuple2<String, List<String>>, String, List<String>>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Tuple2<String, List<String>> call(Tuple2<String, List<String>> arg0) throws Exception { - List<String> oriDatasets = arg0._2; - List<String> newDatasets = new ArrayList<>(); - int size = oriDatasets.size(); - for (int i = 0; i < size; i++) { - String name = oriDatasets.get(i); - if (nameMap.containsKey(name)) { - newDatasets.add(nameMap.get(name)); - } - } - return new Tuple2<>(arg0._1, newDatasets); - } - }); - - } - - /** - * getMetadataNameMap: Get on service metadata names, key is lowcase of short - * name and value is the original short name - * - * @param es - * the elasticsearch client - * @return a map from lower case metadata name to original metadata name - */ - private Map<String, String> getOnServiceMetadata(ESDriver es) { - - String indexName = props.getProperty(MudrodConstants.ES_INDEX_NAME); - String metadataType = props.getProperty("recom_metadataType"); - - Map<String, String> shortnameMap = new HashMap<>(); - SearchResponse scrollResp = es.getClient().prepareSearch(indexName).setTypes(metadataType).setScroll(new TimeValue(60000)).setQuery(QueryBuilders.matchAllQuery()).setSize(100).execute() - .actionGet(); - while (true) { - for (SearchHit hit : scrollResp.getHits().getHits()) { - Map<String, Object> metadata = hit.getSource(); - 
String shortName = (String) metadata.get("Dataset-ShortName"); - shortnameMap.put(shortName.toLowerCase(), shortName); - } - - scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet(); - if (scrollResp.getHits().getHits().length == 0) { - break; - } - } - - return shortnameMap; - } - -}
