http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/package-info.java deleted file mode 100644 index 2febf96..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/pre/package-info.java +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This package includes the preprocessing required by recommendation module. - */ -package gov.nasa.jpl.mudrod.recommendation.pre; \ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/AbstractBasedSimilarity.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/AbstractBasedSimilarity.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/AbstractBasedSimilarity.java deleted file mode 100644 index b0e93fc..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/AbstractBasedSimilarity.java +++ /dev/null @@ -1,74 +0,0 @@ -/** - * Project Name:mudrod-core - * File Name:TopicBasedCF.java - * Package Name:gov.nasa.jpl.mudrod.recommendation.process - * Date:Aug 22, 201610:45:55 AM - * Copyright (c) 2016, [email protected] All Rights Reserved. - */ - -package gov.nasa.jpl.mudrod.recommendation.process; - -import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import gov.nasa.jpl.mudrod.semantics.SVDAnalyzer; -import gov.nasa.jpl.mudrod.utils.LinkageTriple; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.List; -import java.util.Properties; - -/** - * ClassName: Recommend metedata based on data content semantic similarity - */ -public class AbstractBasedSimilarity extends DiscoveryStepAbstract { - - private static final Logger LOG = LoggerFactory.getLogger(AbstractBasedSimilarity.class); - - /** - * Creates a new instance of TopicBasedCF. - * - * @param props the Mudrod configuration - * @param es the Elasticsearch client - * @param spark the spark drive - */ - public AbstractBasedSimilarity(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - } - - @Override - public Object execute() { - - LOG.info("*****************abstract similarity calculation starts******************"); - startTime = System.currentTimeMillis(); - - try { - /*String topicMatrixFile = props.getProperty("metadata_term_tfidf_matrix"); - SemanticAnalyzer analyzer = new SemanticAnalyzer(props, es, spark); - List<LinkageTriple> triples = analyzer - .calTermSimfromMatrix(topicMatrixFile); - analyzer.saveToES(triples, props.getProperty("indexName"), - props.getProperty("metadataTermTFIDFSimType"), true, true);*/ - - // for comparison - SVDAnalyzer svd = new SVDAnalyzer(props, es, spark); - svd.getSVDMatrix(props.getProperty("metadata_word_tfidf_matrix"), 150, props.getProperty("metadata_word_tfidf_matrix")); - List<LinkageTriple> tripleList = svd.calTermSimfromMatrix(props.getProperty("metadata_word_tfidf_matrix")); - svd.saveToES(tripleList, props.getProperty("indexName"), props.getProperty("metadataWordTFIDFSimType"), true, true); - - } catch (Exception e) { - e.printStackTrace(); - } - - endTime = System.currentTimeMillis(); - LOG.info("*****************abstract similarity calculation ends******************Took {}s", (endTime - startTime) / 1000); - - return null; - } - - @Override - public Object execute(Object o) { - return null; - } -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/VariableBasedSimilarity.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/VariableBasedSimilarity.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/VariableBasedSimilarity.java deleted file mode 100644 index 67aeeb8..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/VariableBasedSimilarity.java +++ /dev/null @@ -1,380 +0,0 @@ -package gov.nasa.jpl.mudrod.recommendation.process; - -import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import org.elasticsearch.action.index.IndexRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.action.update.UpdateRequest; -import org.elasticsearch.common.unit.TimeValue; -import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.io.Serializable; -import java.text.DecimalFormat; -import java.util.*; - -import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; - -public class VariableBasedSimilarity extends DiscoveryStepAbstract implements Serializable { - - /** - * - */ - private static final long serialVersionUID = 1L; - - private static final Logger LOG = LoggerFactory.getLogger(VariableBasedSimilarity.class); - - private DecimalFormat df = new DecimalFormat("#.000"); - // a map from variable to its type - public Map<String, Integer> variableTypes; - public Map<String, Integer> variableWeights; - - private static final Integer VAR_SPATIAL = 1; - private static final Integer VAR_TEMPORAL = 2; - private static final Integer VAR_CATEGORICAL = 3; - private static final Integer VAR_ORDINAL = 4; - - // index name - private String indexName; - // type name of metadata in ES - private String metadataType; - private String variableSimType; - - /** - * Creates a new instance of OHEncoder. - * - * @param props the Mudrod configuration - * @param es an instantiated {@link ESDriver} - * @param spark an instantiated {@link SparkDriver} - */ - public VariableBasedSimilarity(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - - indexName = props.getProperty("indexName"); - metadataType = props.getProperty("recom_metadataType"); - variableSimType = props.getProperty("metadataCodeSimType"); - this.inital(); - } - - @Override - public Object execute() { - LOG.info("*****************calculating metadata variables based similarity starts******************"); - startTime = System.currentTimeMillis(); - es.deleteType(indexName, variableSimType); - addMapping(es, indexName, variableSimType); - - VariableBasedSimilarity(es); - es.refreshIndex(); - normalizeVariableWeight(es); - es.refreshIndex(); - endTime = System.currentTimeMillis(); - LOG.info("*****************calculating metadata variables based similarity ends******************Took {}s", (endTime - startTime) / 1000); - return null; - } - - @Override - public Object execute(Object o) { - return null; - } - - public void inital() { - this.initVariableType(); - this.initVariableWeight(); - } - - private void initVariableType() { - variableTypes = new HashMap<>(); - - variableTypes.put("DatasetParameter-Variable", VAR_CATEGORICAL); - variableTypes.put("DatasetRegion-Region", VAR_CATEGORICAL); - variableTypes.put("Dataset-ProjectionType", VAR_CATEGORICAL); - variableTypes.put("Dataset-ProcessingLevel", VAR_CATEGORICAL); - variableTypes.put("DatasetParameter-Topic", VAR_CATEGORICAL); - variableTypes.put("DatasetParameter-Term", VAR_CATEGORICAL); - variableTypes.put("DatasetParameter-Category", VAR_CATEGORICAL); - variableTypes.put("DatasetPolicy-DataFormat", VAR_CATEGORICAL); - variableTypes.put("Collection-ShortName", VAR_CATEGORICAL); - variableTypes.put("DatasetSource-Source-Type", VAR_CATEGORICAL); - variableTypes.put("DatasetSource-Source-ShortName", VAR_CATEGORICAL); - variableTypes.put("DatasetSource-Sensor-ShortName", VAR_CATEGORICAL); - variableTypes.put("DatasetPolicy-Availability", VAR_CATEGORICAL); - variableTypes.put("Dataset-Provider-ShortName", VAR_CATEGORICAL); - - variableTypes.put("Dataset-Derivative-ProcessingLevel", VAR_ORDINAL); - variableTypes.put("Dataset-Derivative-TemporalResolution", VAR_ORDINAL); - variableTypes.put("Dataset-Derivative-SpatialResolution", VAR_ORDINAL); - } - - private void initVariableWeight() { - variableWeights = new HashMap<>(); - - variableWeights.put("Dataset-Derivative-ProcessingLevel", 5); - variableWeights.put("DatasetParameter-Category", 5); - variableWeights.put("DatasetParameter-Variable", 5); - variableWeights.put("DatasetSource-Sensor-ShortName", 5); - - variableWeights.put("DatasetPolicy-Availability", 4); - variableWeights.put("DatasetRegion-Region", 4); - variableWeights.put("DatasetSource-Source-Type", 4); - variableWeights.put("DatasetSource-Source-ShortName", 4); - variableWeights.put("DatasetParameter-Term", 4); - variableWeights.put("DatasetPolicy-DataFormat", 4); - variableWeights.put("Dataset-Derivative-SpatialResolution", 4); - variableWeights.put("Temporal_Covergae", 4); - - variableWeights.put("DatasetParameter-Topic", 3); - variableWeights.put("Collection-ShortName", 3); - variableWeights.put("Dataset-Derivative-TemporalResolution", 3); - variableWeights.put("Spatial_Covergae", 3); - - variableWeights.put("Dataset-ProjectionType", 1); - variableWeights.put("Dataset-Provider-ShortName", 1); - } - - public void VariableBasedSimilarity(ESDriver es) { - - es.createBulkProcessor(); - - List<Map<String, Object>> metadatas = new ArrayList<>(); - SearchResponse scrollResp = es.getClient().prepareSearch(indexName).setTypes(metadataType).setScroll(new TimeValue(60000)).setQuery(QueryBuilders.matchAllQuery()).setSize(100).execute() - .actionGet(); - while (true) { - for (SearchHit hit : scrollResp.getHits().getHits()) { - Map<String, Object> metadataA = hit.getSource(); - metadatas.add(metadataA); - } - - scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet(); - if (scrollResp.getHits().getHits().length == 0) { - break; - } - } - - int size = metadatas.size(); - for (int i = 0; i < size; i++) { - Map<String, Object> metadataA = metadatas.get(i); - String shortNameA = (String) metadataA.get("Dataset-ShortName"); - - for (int j = 0; j < size; j++) { - metadataA = metadatas.get(i); - Map<String, Object> metadataB = metadatas.get(j); - String shortNameB = (String) metadataB.get("Dataset-ShortName"); - - try { - XContentBuilder contentBuilder = jsonBuilder().startObject(); - contentBuilder.field("concept_A", shortNameA); - contentBuilder.field("concept_B", shortNameB); - - // spatial similarity - this.spatialSimilarity(metadataA, metadataB, contentBuilder); - // temporal similarity - this.temporalSimilarity(metadataA, metadataB, contentBuilder); - // categorical variables similarity - this.categoricalVariablesSimilarity(metadataA, metadataB, contentBuilder); - // ordinal variables similarity - this.ordinalVariablesSimilarity(metadataA, metadataB, contentBuilder); - - contentBuilder.endObject(); - - IndexRequest ir = new IndexRequest(indexName, variableSimType).source(contentBuilder); - es.getBulkProcessor().add(ir); - - } catch (IOException e1) { - e1.printStackTrace(); - } - - } - } - - es.destroyBulkProcessor(); - } - - /* - * refer to P. Frontiera, R. Larson, and J. Radke (2008) A comparison of - geometric approaches to assessing spatial similarity for GIR. - International Journal of Geographical Information Science, - 22(3) - */ - public void spatialSimilarity(Map<String, Object> metadataA, Map<String, Object> metadataB, XContentBuilder contentBuilder) throws IOException { - - double topA = (double) metadataA.get("DatasetCoverage-Derivative-NorthLat"); - double bottomA = (double) metadataA.get("DatasetCoverage-Derivative-SouthLat"); - double leftA = (double) metadataA.get("DatasetCoverage-Derivative-WestLon"); - double rightA = (double) metadataA.get("DatasetCoverage-Derivative-EastLon"); - double areaA = (double) metadataA.get("DatasetCoverage-Derivative-Area"); - - double topB = (double) metadataB.get("DatasetCoverage-Derivative-NorthLat"); - double bottomB = (double) metadataB.get("DatasetCoverage-Derivative-SouthLat"); - double leftB = (double) metadataB.get("DatasetCoverage-Derivative-WestLon"); - double rightB = (double) metadataB.get("DatasetCoverage-Derivative-EastLon"); - double areaB = (double) metadataB.get("DatasetCoverage-Derivative-Area"); - - // Intersect area - double xOverlap = Math.max(0, Math.min(rightA, rightB) - Math.max(leftA, leftB)); - double yOverlap = Math.max(0, Math.min(topA, topB) - Math.max(bottomA, bottomB)); - double overlapArea = xOverlap * yOverlap; - - // Calculate coverage similarity - double similarity = 0.0; - if (areaA > 0 && areaB > 0) { - similarity = (overlapArea / areaA + overlapArea / areaB) * 0.5; - } - - contentBuilder.field("Spatial_Covergae_Sim", similarity); - } - - public void temporalSimilarity(Map<String, Object> metadataA, Map<String, Object> metadataB, XContentBuilder contentBuilder) throws IOException { - - double similarity = 0.0; - double startTimeA = Double.parseDouble((String) metadataA.get("Dataset-DatasetCoverage-StartTimeLong")); - String endTimeAStr = (String) metadataA.get("Dataset-DatasetCoverage-StopTimeLong"); - double endTimeA = 0.0; - if ("".equals(endTimeAStr)) { - endTimeA = System.currentTimeMillis(); - } else { - endTimeA = Double.parseDouble(endTimeAStr); - } - double timespanA = endTimeA - startTimeA; - - double startTimeB = Double.parseDouble((String) metadataB.get("Dataset-DatasetCoverage-StartTimeLong")); - String endTimeBStr = (String) metadataB.get("Dataset-DatasetCoverage-StopTimeLong"); - double endTimeB = 0.0; - if ("".equals(endTimeBStr)) { - endTimeB = System.currentTimeMillis(); - } else { - endTimeB = Double.parseDouble(endTimeBStr); - } - double timespanB = endTimeB - startTimeB; - - double intersect = 0.0; - if (startTimeB >= endTimeA || endTimeB <= startTimeA) { - intersect = 0.0; - } else if (startTimeB >= startTimeA && endTimeB <= endTimeA) { - intersect = timespanB; - } else if (startTimeA >= startTimeB && endTimeA <= endTimeB) { - intersect = timespanA; - } else { - intersect = (startTimeA > startTimeB) ? (endTimeB - startTimeA) : (endTimeA - startTimeB); - } - - similarity = intersect / (Math.sqrt(timespanA) * Math.sqrt(timespanB)); - contentBuilder.field("Temporal_Covergae_Sim", similarity); - } - - public void categoricalVariablesSimilarity(Map<String, Object> metadataA, Map<String, Object> metadataB, XContentBuilder contentBuilder) throws IOException { - - for (String variable : variableTypes.keySet()) { - Integer type = variableTypes.get(variable); - if (type != VAR_CATEGORICAL) { - continue; - } - - double similarity = 0.0; - Object valueA = metadataA.get(variable); - Object valueB = metadataB.get(variable); - if (valueA instanceof ArrayList) { - ArrayList<String> aList = (ArrayList<String>) valueA; - ArrayList<String> bList = (ArrayList<String>) valueB; - if (aList != null && bList != null) { - - int lengthA = aList.size(); - int lengthB = bList.size(); - List<String> newAList = new ArrayList<>(aList); - List<String> newBList = new ArrayList<>(bList); - newAList.retainAll(newBList); - similarity = newAList.size() / lengthA; - } - - } else if (valueA instanceof String) { - if (valueA.equals(valueB)) { - similarity = 1.0; - } - } - - contentBuilder.field(variable + "_Sim", similarity); - } - } - - public void ordinalVariablesSimilarity(Map<String, Object> metadataA, Map<String, Object> metadataB, XContentBuilder contentBuilder) throws IOException { - for (String variable : variableTypes.keySet()) { - Integer type = variableTypes.get(variable); - if (type != VAR_ORDINAL) { - continue; - } - - double similarity = 0.0; - Object valueA = metadataA.get(variable); - Object valueB = metadataB.get(variable); - if (valueA != null && valueB != null) { - - double a = (double) valueA; - double b = (double) valueB; - if (a != 0.0) { - similarity = 1 - Math.abs(b - a) / a; - if (similarity < 0) { - similarity = 0.0; - } - } - } - - contentBuilder.field(variable + "_Sim", similarity); - } - } - - public static void addMapping(ESDriver es, String index, String type) { - XContentBuilder Mapping; - try { - Mapping = jsonBuilder().startObject().startObject(type).startObject("properties").startObject("concept_A").field("type", "string").field("index", "not_analyzed").endObject() - .startObject("concept_B").field("type", "string").field("index", "not_analyzed").endObject() - - .endObject().endObject().endObject(); - - es.getClient().admin().indices().preparePutMapping(index).setType(type).setSource(Mapping).execute().actionGet(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - public void normalizeVariableWeight(ESDriver es) { - - es.createBulkProcessor(); - - double totalWeight = 0.0; - for (String variable : variableWeights.keySet()) { - totalWeight += variableWeights.get(variable); - } - - SearchResponse scrollResp = es.getClient().prepareSearch(indexName).setTypes(variableSimType).setScroll(new TimeValue(60000)).setQuery(QueryBuilders.matchAllQuery()).setSize(100).execute() - .actionGet(); - while (true) { - for (SearchHit hit : scrollResp.getHits().getHits()) { - Map<String, Object> similarities = hit.getSource(); - - double totalSim = 0.0; - for (String variable : variableWeights.keySet()) { - if (similarities.containsKey(variable + "_Sim")) { - double value = (double) similarities.get(variable + "_Sim"); - double weight = variableWeights.get(variable); - totalSim += weight * value; - } - } - - double weight = totalSim / totalWeight; - UpdateRequest ur = es.generateUpdateRequest(indexName, variableSimType, hit.getId(), "weight", weight); - es.getBulkProcessor().add(ur); - } - - scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet(); - if (scrollResp.getHits().getHits().length == 0) { - break; - } - } - - es.destroyBulkProcessor(); - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/package-info.java deleted file mode 100644 index 84231f7..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/package-info.java +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This package includes the processing required by recommendation module. - */ -package gov.nasa.jpl.mudrod.recommendation.process; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/sessionBasedCF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/sessionBasedCF.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/sessionBasedCF.java deleted file mode 100644 index ae55769..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/process/sessionBasedCF.java +++ /dev/null @@ -1,74 +0,0 @@ -/** - * Project Name:mudrod-core - * File Name:sessionBasedCF.java - * Package Name:gov.nasa.jpl.mudrod.recommendation.process - * Date:Aug 19, 20163:17:00 PM - * Copyright (c) 2016, [email protected] All Rights Reserved. - */ - -package gov.nasa.jpl.mudrod.recommendation.process; - -import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import gov.nasa.jpl.mudrod.semantics.SemanticAnalyzer; -import gov.nasa.jpl.mudrod.utils.LinkageTriple; -import gov.nasa.jpl.mudrod.utils.SimilarityUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.util.List; -import java.util.Properties; - -/** - * ClassName: Recommend metedata based on session level co-occurrence - */ -public class sessionBasedCF extends DiscoveryStepAbstract { - - private static final Logger LOG = LoggerFactory.getLogger(sessionBasedCF.class); - - /** - * Creates a new instance of sessionBasedCF. - * - * @param props - * the Mudrod configuration - * @param es - * the Elasticsearch drive - * @param spark - * the spark drive - */ - public sessionBasedCF(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - } - - @Override - public Object execute() { - LOG.info("*****************Session based metadata similarity starts******************"); - startTime = System.currentTimeMillis(); - - try { - String session_metadatFile = props.getProperty("session_metadata_Matrix"); - File f = new File(session_metadatFile); - if (f.exists()) { - SemanticAnalyzer analyzer = new SemanticAnalyzer(props, es, spark); - List<LinkageTriple> triples = analyzer.calTermSimfromMatrix(session_metadatFile, SimilarityUtil.SIM_PEARSON, 1); - analyzer.saveToES(triples, props.getProperty("indexName"), props.getProperty("metadataSessionBasedSimType"), true, false); - } - - } catch (Exception e) { - e.printStackTrace(); - } - - endTime = System.currentTimeMillis(); - LOG.info("*****************Session based metadata similarity ends******************Took {}s", (endTime - startTime) / 1000); - - return null; - } - - @Override - public Object execute(Object o) { - return null; - } - -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/HybridRecommendation.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/HybridRecommendation.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/HybridRecommendation.java deleted file mode 100644 index 4163fda..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/HybridRecommendation.java +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.recommendation.structure; - -import com.google.gson.Gson; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import gov.nasa.jpl.mudrod.main.MudrodEngine; -import org.elasticsearch.action.search.SearchRequestBuilder; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.sort.SortOrder; - -import java.io.IOException; -import java.text.DecimalFormat; -import java.util.*; - -/** - * Recommend metadata using combination all two methods, including content-based - * similarity and session-level similarity - */ -public class HybridRecommendation extends DiscoveryStepAbstract { - /** - * - */ - private static final long serialVersionUID = 1L; - // recommended metadata list - protected transient List<LinkedTerm> termList = new ArrayList<>(); - // format decimal - DecimalFormat df = new DecimalFormat("#.00"); - // index name - protected static final String INDEX_NAME = "indexName"; - private static final String WEIGHT = "weight"; - - /** - * recommended data class Date: Sep 12, 2016 2:25:28 AM - */ - class LinkedTerm { - public String term = null; - public double weight = 0; - public String model = null; - - public LinkedTerm(String str, double w, String m) { - term = str; - weight = w; - model = m; - } - } - - public HybridRecommendation(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - } - - @Override - public Object execute() { - return null; - } - - @Override - public Object execute(Object o) { - return null; - } - - /** - * Get recommended data for a giving dataset - * - * @param input: a giving dataset - * @param num: the number of recommended dataset - * @return recommended dataset in json format - */ - public JsonObject getRecomDataInJson(String input, int num) { - JsonObject resultJson = new JsonObject(); - - String type = props.getProperty("metadataCodeSimType"); - Map<String, Double> sortedVariableSimMap = getRelatedData(type, input, num + 10); - - type = props.getProperty("metadataWordTFIDFSimType"); - Map<String, Double> sortedAbstractSimMap = getRelatedData(type, input, num + 10); - - type = props.getProperty("metadataSessionBasedSimType"); - Map<String, Double> sortedSessionSimMap = getRelatedData(type, input, num + 10); - - JsonElement variableSimJson = mapToJson(sortedVariableSimMap, num); - resultJson.add("variableSim", variableSimJson); - JsonElement abstractSimJson = mapToJson(sortedAbstractSimMap, num); - resultJson.add("abstractSim", abstractSimJson); - JsonElement sessionSimJson = mapToJson(sortedSessionSimMap, num); - resultJson.add("sessionSim", sessionSimJson); - - Map<String, Double> hybirdSimMap = new HashMap<String, Double>(); - - for (String name : sortedAbstractSimMap.keySet()) { - hybirdSimMap.put(name, sortedAbstractSimMap.get(name) /** 0.4 */); - } - - for (String name : sortedVariableSimMap.keySet()) { - if (hybirdSimMap.get(name) != null) { - double sim = hybirdSimMap.get(name) + sortedVariableSimMap.get(name) /** 0.3 */; - hybirdSimMap.put(name, Double.parseDouble(df.format(sim))); - } else { - double sim = sortedVariableSimMap.get(name); - hybirdSimMap.put(name, Double.parseDouble(df.format(sim))); - } - } - - for (String name : sortedSessionSimMap.keySet()) { - if (hybirdSimMap.get(name) != null) { - double sim = hybirdSimMap.get(name) + sortedSessionSimMap.get(name) /** 0.1 */; - hybirdSimMap.put(name, Double.parseDouble(df.format(sim))); - } else { - double sim = sortedSessionSimMap.get(name); - hybirdSimMap.put(name, Double.parseDouble(df.format(sim))); - } - } - - Map<String, Double> sortedHybirdSimMap = this.sortMapByValue(hybirdSimMap); - - JsonElement linkedJson = mapToJson(sortedHybirdSimMap, num); - resultJson.add("linked", linkedJson); - - return resultJson; - } - - /** - * Method of converting hashmap to JSON - * - * @param wordweights a map from related metadata to weights - * @param num the number of converted elements - * @return converted JSON object - */ - protected JsonElement mapToJson(Map<String, Double> wordweights, int num) { - Gson gson = new Gson(); - - List<JsonObject> nodes = new ArrayList<>(); - Set<String> words = wordweights.keySet(); - int i = 0; - for (String wordB : words) { - JsonObject node = new JsonObject(); - node.addProperty("name", wordB); - node.addProperty("weight", wordweights.get(wordB)); - nodes.add(node); - - i += 1; - if (i >= num) { - break; - } - } - - String nodesJson = gson.toJson(nodes); - JsonElement nodesElement = gson.fromJson(nodesJson, JsonElement.class); - - return nodesElement; - } - - /** - * Get recommend dataset for a giving dataset - * - * @param type recommend method - * @param input a giving dataset - * @param num the number of recommended dataset - * @return recommended dataset map, key is dataset name, value is similarity - * value - */ - public Map<String, Double> getRelatedData(String type, String input, int num) { - termList = new ArrayList<>(); - Map<String, Double> termsMap = new HashMap<>(); - Map<String, Double> sortedMap = new HashMap<>(); - try { - List<LinkedTerm> links = getRelatedDataFromES(type, input, num); - int size = links.size(); - for (int i = 0; i < size; i++) { - termsMap.put(links.get(i).term, links.get(i).weight); - } - - sortedMap = sortMapByValue(termsMap); // terms_map will be empty - } catch (Exception e) { - e.printStackTrace(); - } - - return sortedMap; - } - - /** - * Get recommend dataset for a giving dataset - * - * @param type recommend method - * @param input a giving dataset - * @param num the number of recommended dataset - * @return recommended dataset list - */ - public List<LinkedTerm> getRelatedDataFromES(String type, String input, int num) { - - SearchRequestBuilder builder = es.getClient().prepareSearch(props.getProperty(INDEX_NAME)).setTypes(type).setQuery(QueryBuilders.termQuery("concept_A", input)).addSort(WEIGHT, SortOrder.DESC) - .setSize(num); - - SearchResponse usrhis = builder.execute().actionGet(); - - for (SearchHit hit : usrhis.getHits().getHits()) { - Map<String, Object> result = hit.getSource(); - String conceptB = (String) result.get("concept_B"); - - if (!conceptB.equals(input)) { - LinkedTerm lTerm = new LinkedTerm(conceptB, (double) result.get(WEIGHT), type); - termList.add(lTerm); - } - } - - return termList; - } - - /** - * Method of sorting a map by value - * - * @param passedMap input map - * @return sorted map - */ - public Map<String, Double> sortMapByValue(Map<String, Double> passedMap) { - List<String> mapKeys = new ArrayList<>(passedMap.keySet()); - List<Double> mapValues = new ArrayList<>(passedMap.values()); - Collections.sort(mapValues, Collections.reverseOrder()); - Collections.sort(mapKeys, Collections.reverseOrder()); - - LinkedHashMap<String, Double> sortedMap = new LinkedHashMap<>(); - - Iterator<Double> valueIt = mapValues.iterator(); - while (valueIt.hasNext()) { - Object val = valueIt.next(); - Iterator<String> keyIt = mapKeys.iterator(); - - while (keyIt.hasNext()) { - Object key = keyIt.next(); - String comp1 = passedMap.get(key).toString(); - String comp2 = val.toString(); - - if (comp1.equals(comp2)) { - passedMap.remove(key); - mapKeys.remove(key); - sortedMap.put((String) key, (Double) val); - break; - } - } - } - return sortedMap; - } - - public static void main(String[] args) throws IOException { - - MudrodEngine me = new MudrodEngine(); - Properties props = me.loadConfig(); - ESDriver es = new ESDriver(me.getConfig()); - HybridRecommendation test = new HybridRecommendation(props, es, null); - - // String input = "NSCAT_LEVEL_1.7_V2"; - String input = "AQUARIUS_L3_SSS_SMIA_MONTHLY-CLIMATOLOGY_V4"; - JsonObject json = test.getRecomDataInJson(input, 10); - - System.out.println(json.toString()); - } -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/MetadataOpt.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/MetadataOpt.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/MetadataOpt.java deleted file mode 100644 index 69dc878..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/MetadataOpt.java +++ /dev/null @@ -1,150 +0,0 @@ -package gov.nasa.jpl.mudrod.recommendation.structure; - -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import gov.nasa.jpl.mudrod.main.MudrodConstants; -import gov.nasa.jpl.mudrod.utils.LabeledRowMatrix; -import gov.nasa.jpl.mudrod.utils.MatrixUtil; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.mllib.linalg.distributed.RowMatrix; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.common.unit.TimeValue; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import scala.Tuple2; - -import java.io.Serializable; -import java.util.*; - -public class MetadataOpt implements Serializable { - - /** - * - */ - private static final long serialVersionUID = 1L; - private String indexName; - private String metadataType; - private List<String> variables; - - public static final String SPLIT_BLANK = " "; - public static final String SPLIT_COMMA = ","; - - public MetadataOpt(Properties props) { - indexName = props.getProperty(MudrodConstants.ES_INDEX_NAME); - metadataType = props.getProperty("recom_metadataType"); - - variables = new ArrayList<>(); - variables.add("DatasetParameter-Term"); - variables.add("DatasetParameter-Variable"); - variables.add("Dataset-Description"); - variables.add("Dataset-LongName"); - } - - public JavaPairRDD<String, String> loadAll(ESDriver es, SparkDriver spark) throws Exception { - List<Tuple2<String, String>> datasetsTokens = this.loadMetadataFromES(es, variables); - return this.parallizeData(spark, datasetsTokens); - } - - public JavaPairRDD<String, String> loadAll(ESDriver es, SparkDriver spark, List<String> variables) throws Exception { - List<Tuple2<String, String>> datasetsTokens = this.loadMetadataFromES(es, variables); - return this.parallizeData(spark, datasetsTokens); - } - - private JavaPairRDD<String, String> parallizeData(SparkDriver spark, List<Tuple2<String, String>> datasetContent) { - - JavaRDD<Tuple2<String, String>> datasetContentRDD = spark.sc.parallelize(datasetContent); - - return datasetContentRDD.mapToPair(new PairFunction<Tuple2<String, String>, String, String>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Tuple2<String, String> call(Tuple2<String, String> term) throws Exception { - return term; - } - }); - - } - - public JavaPairRDD<String, List<String>> tokenizeData(JavaPairRDD<String, String> datasetsContentRDD, String splitter) throws Exception { - - return datasetsContentRDD.mapToPair(new PairFunction<Tuple2<String, String>, String, List<String>>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Tuple2<String, List<String>> call(Tuple2<String, String> arg) throws Exception { - String content = arg._2; - List<String> tokens = getTokens(content, splitter); - - return new Tuple2<>(arg._1, tokens); - } - }); - - } - - public List<String> getTokens(String str, String splitter) throws Exception { - String[] tokens = null; - if (splitter.equals(SPLIT_BLANK)) { - tokens = str.split(" "); - } else if (splitter.equals(SPLIT_COMMA)) { - tokens = str.split(","); - } - return java.util.Arrays.asList(tokens); - } - - public List<Tuple2<String, String>> loadMetadataFromES(ESDriver es, List<String> variables) throws Exception { - - SearchResponse scrollResp = es.getClient().prepareSearch(indexName).setTypes(metadataType).setQuery(QueryBuilders.matchAllQuery()).setScroll(new TimeValue(60000)).setSize(100).execute() - .actionGet(); - - List<Tuple2<String, String>> datasetsTokens = new ArrayList<>(); - while (true) { - - for (SearchHit hit : scrollResp.getHits().getHits()) { - Map<String, Object> result = hit.getSource(); - String shortName = (String) result.get("Dataset-ShortName"); - - String filedStr = ""; - int size = variables.size(); - for (int i = 0; i < size; i++) { - String filed = variables.get(i); - Object filedValue = result.get(filed); - - if (filedValue != null) { - filedStr = es.customAnalyzing(indexName, filedValue.toString()); - } - } - - datasetsTokens.add(new Tuple2<String, String>(shortName, filedStr)); - } - - scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet(); - if (scrollResp.getHits().getHits().length == 0) { - break; - } - } - - return datasetsTokens; - } - - public LabeledRowMatrix tFIDFTokens(JavaPairRDD<String, List<String>> datasetTokensRDD, SparkDriver spark) { - - LabeledRowMatrix labelMatrix = MatrixUtil.createDocWordMatrix(datasetTokensRDD, spark.sc); - - RowMatrix docwordMatrix = labelMatrix.rowMatrix; - - RowMatrix docwordTFIDFMatrix = MatrixUtil.createTFIDFMatrix(docwordMatrix); - - labelMatrix.rowMatrix = docwordTFIDFMatrix; - - return labelMatrix; - } - -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/RecomData.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/RecomData.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/RecomData.java deleted file mode 100644 index 9025156..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/RecomData.java +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.recommendation.structure; - -import com.google.gson.Gson; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import gov.nasa.jpl.mudrod.discoveryengine.DiscoveryStepAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import gov.nasa.jpl.mudrod.main.MudrodEngine; -import org.elasticsearch.action.search.SearchRequestBuilder; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.sort.SortOrder; - -import java.io.IOException; -import java.text.DecimalFormat; -import java.util.*; - -/** - * This class is used to test recommendation result similarity and session-level - * similarity - */ -public class RecomData extends DiscoveryStepAbstract { - - /** - * - */ - private static final long serialVersionUID = 1L; - protected transient List<LinkedTerm> termList = new ArrayList<>(); - DecimalFormat df = new DecimalFormat("#.00"); - protected static final String INDEX_NAME = "indexName"; - private static final String WEIGHT = "weight"; - - class LinkedTerm { - public String term = null; - public double weight = 0; - public String model = null; - - public LinkedTerm(String str, double w, String m) { - term = str; - weight = w; - model = m; - } - } - - public RecomData(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - } - - @Override - public Object execute() { - return null; - } - - @Override - public Object execute(Object o) { - return null; - } - - public JsonObject getRecomDataInJson(String input, int num) { - String type = props.getProperty("metadataTermTFIDFSimType"); - Map<String, Double> sortedOBSimMap = getRelatedData(type, input, num + 5); - JsonElement linkedJson = mapToJson(sortedOBSimMap, num); - - // type = props.getProperty("metadataTermTFIDFSimType"); - type = props.getProperty("metadataCodeSimType"); - - Map<String, Double> sortedMBSimMap = getRelatedData(type, input, num + 5); - JsonElement relatedJson = mapToJson(sortedMBSimMap, num); - - JsonObject json = new JsonObject(); - - json.add("TFIDFSim", linkedJson); - json.add("TopicSim", relatedJson); - - return json; - } - - protected JsonElement mapToJson(Map<String, Double> wordweights, int num) { - Gson gson = new Gson(); - - List<JsonObject> nodes = new ArrayList<>(); - Set<String> words = wordweights.keySet(); - int i = 0; - for (String wordB : words) { - JsonObject node = new JsonObject(); - node.addProperty("name", wordB); - node.addProperty("weight", wordweights.get(wordB)); - nodes.add(node); - - i += 1; - if (i >= num) { - break; - } - } - - String nodesJson = gson.toJson(nodes); - JsonElement nodesElement = gson.fromJson(nodesJson, JsonElement.class); - - return nodesElement; - } - - public Map<String, Double> getRelatedData(String type, String input, int num) { - termList = new ArrayList<>(); - Map<String, Double> termsMap = new HashMap<>(); - Map<String, Double> sortedMap = new HashMap<>(); - try { - List<LinkedTerm> links = getRelatedDataFromES(type, input, num); - int size = links.size(); - for (int i = 0; i < size; i++) { - termsMap.put(links.get(i).term, links.get(i).weight); - } - - sortedMap = sortMapByValue(termsMap); // terms_map will be empty - } catch (Exception e) { - e.printStackTrace(); - } - - return sortedMap; - } - - public List<LinkedTerm> getRelatedDataFromES(String type, String input, int num) { - SearchRequestBuilder builder = es.getClient().prepareSearch(props.getProperty(INDEX_NAME)).setTypes(type).setQuery(QueryBuilders.termQuery("concept_A", input)).addSort(WEIGHT, SortOrder.DESC) - .setSize(num); - - SearchResponse usrhis = builder.execute().actionGet(); - - for (SearchHit hit : usrhis.getHits().getHits()) { - Map<String, Object> result = hit.getSource(); - String conceptB = (String) result.get("concept_B"); - - if (!conceptB.equals(input)) { - LinkedTerm lTerm = new LinkedTerm(conceptB, (double) result.get(WEIGHT), type); - termList.add(lTerm); - } - } - - return termList; - } - - public Map<String, Double> sortMapByValue(Map<String, Double> passedMap) { - List<String> mapKeys = new ArrayList<>(passedMap.keySet()); - List<Double> mapValues = new ArrayList<>(passedMap.values()); - Collections.sort(mapValues, Collections.reverseOrder()); - Collections.sort(mapKeys, Collections.reverseOrder()); - - LinkedHashMap<String, Double> sortedMap = new LinkedHashMap<>(); - - Iterator<Double> valueIt = mapValues.iterator(); - while (valueIt.hasNext()) { - Object val = valueIt.next(); - Iterator<String> keyIt = mapKeys.iterator(); - - while (keyIt.hasNext()) { - Object key = keyIt.next(); - String comp1 = passedMap.get(key).toString(); - String comp2 = val.toString(); - - if (comp1.equals(comp2)) { - passedMap.remove(key); - mapKeys.remove(key); - sortedMap.put((String) key, (Double) val); - break; - } - } - } - return sortedMap; - } - - public static void main(String[] args) throws IOException { - - MudrodEngine me = new MudrodEngine(); - Properties props = me.loadConfig(); - ESDriver es = new ESDriver(me.getConfig()); - RecomData test = new RecomData(props, es, null); - - String input = "AQUARIUS_L3_SSS_SMIA_MONTHLY-CLIMATOLOGY_V4"; - JsonObject json = test.getRecomDataInJson(input, 10); - - System.out.println(json.toString()); - } -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/package-info.java deleted file mode 100644 index 99199ca..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/recommendation/structure/package-info.java +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This package includes the data structure required by recommendation module. - */ -package gov.nasa.jpl.mudrod.recommendation.structure; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/semantics/SVDAnalyzer.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/semantics/SVDAnalyzer.java b/core/src/main/java/gov/nasa/jpl/mudrod/semantics/SVDAnalyzer.java deleted file mode 100644 index 3e63b04..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/semantics/SVDAnalyzer.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.semantics; - -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import gov.nasa.jpl.mudrod.utils.MatrixUtil; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.distributed.RowMatrix; - -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; - -/** - * ClassName: SVDAnalyzer Function: Analyze semantic relationship through SVD - * method - */ -public class SVDAnalyzer extends SemanticAnalyzer { - - /** - * - */ - private static final long serialVersionUID = 1L; - - /** - * Creates a new instance of SVDAnalyzer. - * - * @param props the Mudrod configuration - * @param es the Elasticsearch drive - * @param spark the spark drive - */ - public SVDAnalyzer(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - } - - /** - * GetSVDMatrix: Create SVD matrix csv file from original csv file. - * - * @param csvFileName each row is a term, and each column is a document. - * @param svdDimention Dimension of SVD matrix - * @param svdMatrixFileName CSV file name of SVD matrix - */ - public void getSVDMatrix(String csvFileName, int svdDimention, String svdMatrixFileName) { - - JavaPairRDD<String, Vector> importRDD = MatrixUtil.loadVectorFromCSV(spark, csvFileName, 1); - JavaRDD<Vector> vectorRDD = importRDD.values(); - RowMatrix wordDocMatrix = new RowMatrix(vectorRDD.rdd()); - RowMatrix tfidfMatrix = MatrixUtil.createTFIDFMatrix(wordDocMatrix); - RowMatrix svdMatrix = MatrixUtil.buildSVDMatrix(tfidfMatrix, svdDimention); - - List<String> rowKeys = importRDD.keys().collect(); - List<String> colKeys = new ArrayList<>(); - for (int i = 0; i < svdDimention; i++) { - colKeys.add("dimension" + i); - } - MatrixUtil.exportToCSV(svdMatrix, rowKeys, colKeys, svdMatrixFileName); - } -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/semantics/SemanticAnalyzer.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/semantics/SemanticAnalyzer.java b/core/src/main/java/gov/nasa/jpl/mudrod/semantics/SemanticAnalyzer.java deleted file mode 100644 index be8b2b3..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/semantics/SemanticAnalyzer.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.semantics; - -import gov.nasa.jpl.mudrod.discoveryengine.MudrodAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import gov.nasa.jpl.mudrod.utils.LinkageTriple; -import gov.nasa.jpl.mudrod.utils.MatrixUtil; -import gov.nasa.jpl.mudrod.utils.SimilarityUtil; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.Properties; - -/** - * ClassName: SemanticAnalyzer Function: Semantic analyzer - */ -public class SemanticAnalyzer extends MudrodAbstract { - - /** - * - */ - private static final long serialVersionUID = 1L; - - /** - * Creates a new instance of SemanticAnalyzer. - * - * @param props - * the Mudrod configuration - * @param es - * the Elasticsearch drive - * @param spark - * the spark drive - */ - public SemanticAnalyzer(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - } - - /** - * Calculate term similarity from CSV matrix. - * - * @param csvFileName - * csv file of matrix, each row is a term, and each column is a - * dimension in feature space - * @return Linkage triple list - */ - public List<LinkageTriple> calTermSimfromMatrix(String csvFileName) { - File f = new File(csvFileName); - if (!f.exists()) { - return null; - } - return this.calTermSimfromMatrix(csvFileName, 1); - } - - /** - * Calculate term similarity from CSV matrix. - * - * @param csvFileName csv file of matrix, each row is a term, and each column is a - * dimension in feature space - * @param skipRow number of rows to skip in input CSV file e.g. header - * @return Linkage triple list - */ - public List<LinkageTriple> calTermSimfromMatrix(String csvFileName, int skipRow) { - - JavaPairRDD<String, Vector> importRDD = MatrixUtil.loadVectorFromCSV(spark, csvFileName, skipRow); - if (importRDD == null || importRDD.values().first().size() == 0) { - return null; - } - - CoordinateMatrix simMatrix = SimilarityUtil.calculateSimilarityFromVector(importRDD.values()); - JavaRDD<String> rowKeyRDD = importRDD.keys(); - return SimilarityUtil.matrixToTriples(rowKeyRDD, simMatrix); - } - - /** - * Calculate term similarity from CSV matrix. - * - * @param csvFileName csv file of matrix, each row is a term, and each column is a - * dimension in feature space - * @param simType the type of similary calculation to execute e.g. - * <ul> - * <li>{@link gov.nasa.jpl.mudrod.utils.SimilarityUtil#SIM_COSINE} - 3,</li> - * <li>{@link gov.nasa.jpl.mudrod.utils.SimilarityUtil#SIM_HELLINGER} - 2,</li> - * <li>{@link gov.nasa.jpl.mudrod.utils.SimilarityUtil#SIM_PEARSON} - 1</li> - * </ul> - * @param skipRow number of rows to skip in input CSV file e.g. header - * @return Linkage triple list - */ - public List<LinkageTriple> calTermSimfromMatrix(String csvFileName, int simType, int skipRow) { - - JavaPairRDD<String, Vector> importRDD = MatrixUtil.loadVectorFromCSV(spark, csvFileName, skipRow); - if (importRDD.values().first().size() == 0) { - return null; - } - - JavaRDD<LinkageTriple> triples = SimilarityUtil.calculateSimilarityFromVector(importRDD, simType); - - return triples.collect(); - } - - public void saveToES(List<LinkageTriple> tripleList, String index, String type) { - try { - LinkageTriple.insertTriples(es, tripleList, index, type); - } catch (IOException e) { - e.printStackTrace(); - } - } - - /** - * Method of saving linkage triples to Elasticsearch. - * - * @param tripleList - * linkage triple list - * @param index - * index name - * @param type - * type name - * @param bTriple - * bTriple - * @param bSymmetry - * bSymmetry - */ - public void saveToES(List<LinkageTriple> tripleList, String index, String type, boolean bTriple, boolean bSymmetry) { - try { - LinkageTriple.insertTriples(es, tripleList, index, type, bTriple, bSymmetry); - } catch (IOException e) { - e.printStackTrace(); - - } - } -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/semantics/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/semantics/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/semantics/package-info.java deleted file mode 100644 index 9c2e8ac..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/semantics/package-info.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This package includes SVD transformation function, methods of calculating - * similarity from CSV, and saving triples into Elasticsearch - */ -package gov.nasa.jpl.mudrod.semantics; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ClickstreamImporter.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ClickstreamImporter.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ClickstreamImporter.java deleted file mode 100644 index 5cb130c..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ClickstreamImporter.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.ssearch; - -import gov.nasa.jpl.mudrod.discoveryengine.MudrodAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import gov.nasa.jpl.mudrod.main.MudrodConstants; - -import org.elasticsearch.action.index.IndexRequest; -import org.elasticsearch.common.xcontent.XContentBuilder; - -import java.io.BufferedReader; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.IOException; -import java.util.Properties; - -import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; - -/** - * Supports ability to import click stream data into Elasticsearch - * through .csv file - */ -public class ClickstreamImporter extends MudrodAbstract { - /** - * - */ - private static final long serialVersionUID = 1L; - - public ClickstreamImporter(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - addClickStreamMapping(); - } - - /** - * Method to add Elasticsearch mapping for click stream data - */ - public void addClickStreamMapping() { - XContentBuilder Mapping; - try { - Mapping = jsonBuilder().startObject().startObject( - props.getProperty(MudrodConstants.CLICK_STREAM_MATRIX_TYPE)).startObject( - "properties").startObject("query").field("type", "string").field( - "index", "not_analyzed").endObject().startObject("dataID").field( - "type", "string").field("index", "not_analyzed").endObject() - - .endObject().endObject().endObject(); - - es.getClient().admin().indices().preparePutMapping( - props.getProperty(MudrodConstants.ES_INDEX_NAME)).setType( - props.getProperty(MudrodConstants.CLICK_STREAM_MATRIX_TYPE)).setSource( - Mapping).execute().actionGet(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - /** - * Method to import click stream CSV into Elasticsearch - */ - public void importfromCSVtoES() { - es.deleteType(props.getProperty(MudrodConstants.ES_INDEX_NAME), - props.getProperty(MudrodConstants.CLICK_STREAM_MATRIX_TYPE)); - es.createBulkProcessor(); - - BufferedReader br = null; - String cvsSplitBy = ","; - - try { - br = new BufferedReader(new FileReader(props.getProperty("clickstreamMatrix"))); - String line = br.readLine(); - // first item needs to be skipped - String[] dataList = line.split(cvsSplitBy); - while ((line = br.readLine()) != null) { - String[] clicks = line.split(cvsSplitBy); - for (int i = 1; i < clicks.length; i++) { - if (!"0.0".equals(clicks[i])) { - IndexRequest ir = new IndexRequest(props.getProperty(MudrodConstants.ES_INDEX_NAME), - props.getProperty(MudrodConstants.CLICK_STREAM_MATRIX_TYPE)) - .source(jsonBuilder().startObject().field("query", clicks[0]).field( - "dataID", dataList[i]).field("clicks", clicks[i]).endObject()); - es.getBulkProcessor().add(ir); - } - } - } - } catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } finally { - if (br != null) { - try { - br.close(); - es.destroyBulkProcessor(); - } catch (IOException e) { - e.printStackTrace(); - } - } - } - } - -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/Dispatcher.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/Dispatcher.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/Dispatcher.java deleted file mode 100644 index a0f3a2c..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/Dispatcher.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.ssearch; - -import gov.nasa.jpl.mudrod.discoveryengine.MudrodAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import gov.nasa.jpl.mudrod.integration.LinkageIntegration; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.MatchQueryBuilder; -import org.elasticsearch.index.query.MultiMatchQueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.HashMap; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Properties; - -/** - * Supports ability to transform regular user query into a semantic query - */ -public class Dispatcher extends MudrodAbstract { - private static final Logger LOG = LoggerFactory.getLogger(Dispatcher.class); - - public Dispatcher(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - } - - /** - * Method of getting semantically most related terms by number - * - * @param input regular input query - * @param num the number of most related terms - * @return a map from term to similarity - */ - public Map<String, Double> getRelatedTerms(String input, int num) { - LinkageIntegration li = new LinkageIntegration(props, this.es, null); - Map<String, Double> sortedMap = li.appyMajorRule(input); - Map<String, Double> selected_Map = new HashMap<>(); - int count = 0; - for (Entry<String, Double> entry : sortedMap.entrySet()) { - if (count < num) { - selected_Map.put(entry.getKey(), entry.getValue()); - } - count++; - } - return selected_Map; - } - - /** - * Method of getting semantically most related terms by similarity threshold - * - * @param input regular input query - * @param T value of threshold, raning from 0 to 1 - * @return a map from term to similarity - */ - public Map<String, Double> getRelatedTermsByT(String input, double T) { - LinkageIntegration li = new LinkageIntegration(this.props, this.es, null); - Map<String, Double> sortedMap = li.appyMajorRule(input); - Map<String, Double> selected_Map = new HashMap<>(); - - for (Entry<String, Double> entry : sortedMap.entrySet()) { - if (entry.getValue() >= T) { - selected_Map.put(entry.getKey(), entry.getValue()); - } - } - return selected_Map; - } - - /** - * Method of creating semantic query based on Threshold - * - * @param input regular query - * @param T threshold raning from 0 to 1 - * @param query_operator query mode - * @return a multiMatch query builder - */ - public BoolQueryBuilder createSemQuery(String input, double T, String query_operator) { - Map<String, Double> selected_Map = getRelatedTermsByT(input, T); - selected_Map.put(input, (double) 1); - - String fieldsList[] = { "Dataset-Metadata", "Dataset-ShortName", "Dataset-LongName", - "DatasetParameter-Topic", "DatasetParameter-VariableDetail", "DatasetParameter-Category", - "DatasetParameter-Variable", "DatasetParameter-Term", - "DatasetSource-Source-LongName", "DatasetSource-Source-LongName-Full", - "DatasetSource-Source-ShortName", "DatasetSource-Source-ShortName-Full", - "DatasetSource-Sensor-LongName", "DatasetSource-Sensor-LongName-Full", "DatasetSource-Sensor-ShortName", - "DatasetSource-Sensor-ShortName-Full" }; - BoolQueryBuilder qb = new BoolQueryBuilder(); - for (Entry<String, Double> entry : selected_Map.entrySet()) { - if (query_operator.toLowerCase().trim().equals("phrase")) { - qb.should(QueryBuilders.multiMatchQuery(entry.getKey(), fieldsList).boost(entry.getValue().floatValue()).type(MultiMatchQueryBuilder.Type.PHRASE).tieBreaker((float) 0.5)); // when - // set - // to - // 1.0, - // it - // would - // be - // equal - // to - // "most - // fields" - // query - } else if (query_operator.toLowerCase().trim().equals("and")) { - qb.should(QueryBuilders.multiMatchQuery(entry.getKey(), fieldsList).boost(entry.getValue().floatValue()).operator(MatchQueryBuilder.DEFAULT_OPERATOR.AND).tieBreaker((float) 0.5)); - } else { - qb.should(QueryBuilders.multiMatchQuery(entry.getKey(), fieldsList).boost(entry.getValue().floatValue()).operator(MatchQueryBuilder.DEFAULT_OPERATOR.OR).tieBreaker((float) 0.5)); - } - } - - // LOG.info(qb.toString()); - return qb; - } - -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/Ranker.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/Ranker.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/Ranker.java deleted file mode 100644 index 32830d5..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/Ranker.java +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.ssearch; - -import gov.nasa.jpl.mudrod.discoveryengine.MudrodAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import gov.nasa.jpl.mudrod.main.MudrodConstants; -import gov.nasa.jpl.mudrod.ssearch.ranking.Learner; -import gov.nasa.jpl.mudrod.ssearch.structure.SResult; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; - -import java.io.Serializable; -import java.text.DecimalFormat; -import java.util.*; - -/** - * Supports the ability to calculating ranking score - */ -public class Ranker extends MudrodAbstract implements Serializable { - /** - * - */ - private static final long serialVersionUID = 1L; - transient List<SResult> resultList = new ArrayList<>(); - - String learnerType = null; - Learner le = null; - - public Ranker(Properties props, ESDriver es, SparkDriver spark, String learnerType) { - super(props, es, spark); - this.learnerType = learnerType; - le = new Learner(learnerType, spark, props.getProperty(MudrodConstants.SVM_SGD_MODEL)); - } - - /** - * Method of comparing results based on final score - */ - public class ResultComparator implements Comparator<SResult> { - @Override - public int compare(SResult o1, SResult o2) { - return o2.below.compareTo(o1.below); - } - } - - /** - * Method of calculating mean value - * - * @param attribute the attribute name that need to be calculated on - * @param resultList an array list of result - * @return mean value - */ - private double getMean(String attribute, List<SResult> resultList) { - double sum = 0.0; - for (SResult a : resultList) { - sum += (double) SResult.get(a, attribute); - } - return getNDForm(sum / resultList.size()); - } - - /** - * Method of calculating variance value - * - * @param attribute the attribute name that need to be calculated on - * @param resultList an array list of result - * @return variance value - */ - private double getVariance(String attribute, List<SResult> resultList) { - double mean = getMean(attribute, resultList); - double temp = 0.0; - double val; - for (SResult a : resultList) { - val = (Double) SResult.get(a, attribute); - temp += (mean - val) * (mean - val); - } - - return getNDForm(temp / resultList.size()); - } - - /** - * Method of calculating standard variance - * - * @param attribute the attribute name that need to be calculated on - * @param resultList an array list of result - * @return standard variance - */ - private double getStdDev(String attribute, List<SResult> resultList) { - return getNDForm(Math.sqrt(getVariance(attribute, resultList))); - } - - /** - * Method of calculating Z score - * - * @param val the value of an attribute - * @param mean the mean value of an attribute - * @param std the standard deviation of an attribute - * @return Z score - */ - private double getZscore(double val, double mean, double std) { - if (!equalComp(std, 0)) { - return getNDForm((val - mean) / std); - } else { - return 0; - } - } - - private boolean equalComp(double a, double b) { - return Math.abs(a - b) < 0.0001; - } - - /** - * Get the first N decimals of a double value - * - * @param d double value that needs to be processed - * @return processed double value - */ - private double getNDForm(double d) { - DecimalFormat ndForm = new DecimalFormat("#.###"); - return Double.valueOf(ndForm.format(d)); - } - - /** - * Method of ranking a list of result - * - * @param resultList result list - * @return ranked result list - */ - public List<SResult> rank(List<SResult> resultList) { - for (int i = 0; i < resultList.size(); i++) { - for (int m = 0; m < SResult.rlist.length; m++) { - String att = SResult.rlist[m].split("_")[0]; - double val = SResult.get(resultList.get(i), att); - double mean = getMean(att, resultList); - double std = getStdDev(att, resultList); - double score = getZscore(val, mean, std); - String scoreId = SResult.rlist[m]; - SResult.set(resultList.get(i), scoreId, score); - } - } - - // using collection.sort directly would cause an "not transitive" error - // this is because the training model is not a overfitting model - for (int j = 0; j < resultList.size(); j++) { - for (int k = 0; k < resultList.size(); k++) { - if (k != j) { - resultList.get(j).below += comp(resultList.get(j), resultList.get(k)); - } - } - } - - Collections.sort(resultList, new ResultComparator()); - return resultList; - } - - /** - * Method of compare two search resutls - * - * @param o1 search result 1 - * @param o2 search result 2 - * @return 1 if o1 is greater than o2, 0 otherwise - */ - public int comp(SResult o1, SResult o2) { - List<Double> instList = new ArrayList<>(); - for (int i = 0; i < SResult.rlist.length; i++) { - double o2Score = SResult.get(o2, SResult.rlist[i]); - double o1Score = SResult.get(o1, SResult.rlist[i]); - instList.add(o2Score - o1Score); - } - - double[] ins = instList.stream().mapToDouble(i -> i).toArray(); - LabeledPoint insPoint = new LabeledPoint(99.0, Vectors.dense(ins)); - double prediction = le.classify(insPoint); - if (equalComp(prediction, 1)) { //different from weka where the return value is 1 or 2 - return 0; - } else { - return 1; - } - } - -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/Searcher.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/Searcher.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/Searcher.java deleted file mode 100644 index f407f92..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/Searcher.java +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.ssearch; - -import com.google.gson.Gson; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import gov.nasa.jpl.mudrod.discoveryengine.MudrodAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import gov.nasa.jpl.mudrod.ssearch.structure.SResult; - -import org.elasticsearch.action.search.SearchRequestBuilder; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.sort.SortBuilder; -import org.elasticsearch.search.sort.SortOrder; - -import java.io.Serializable; -import java.text.DecimalFormat; -import java.text.SimpleDateFormat; -import java.util.*; -import java.util.regex.Pattern; - -/** - * Supports ability to performance semantic search with a given query - */ -public class Searcher extends MudrodAbstract implements Serializable { - /** - * - */ - private static final long serialVersionUID = 1L; - DecimalFormat NDForm = new DecimalFormat("#.##"); - final Integer MAX_CHAR = 700; - - public Searcher(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - } - - /** - * Method of converting processing level string into a number - * - * @param pro processing level string - * @return processing level number - */ - public Double getProLevelNum(String pro) { - if (pro == null) { - return 1.0; - } - Double proNum; - Pattern p = Pattern.compile(".*[a-zA-Z].*"); - if (pro.matches("[0-9]{1}[a-zA-Z]{1}")) { - proNum = Double.parseDouble(pro.substring(0, 1)); - } else if (p.matcher(pro).find()) { - proNum = 1.0; - } else { - proNum = Double.parseDouble(pro); - } - - return proNum; - } - - public Double getPop(Double pop) { - if (pop > 1000) { - pop = 1000.0; - } - return pop; - } - - /** - * Method of checking if query exists in a certain attribute - * - * @param strList attribute value in the form of ArrayList - * @param query query string - * @return 1 means query exists, 0 otherwise - */ - public Double exists(ArrayList<String> strList, String query) { - Double val = 0.0; - if (strList != null) { - String str = String.join(", ", strList); - if (str != null && str.length() != 0 && str.toLowerCase().trim().contains(query)) { - val = 1.0; - } - } - return val; - } - - /** - * Main method of semantic search - * - * @param index index name in Elasticsearch - * @param type type name in Elasticsearch - * @param query regular query string - * @param queryOperator query mode- query, or, and - * @return a list of search result - */ - @SuppressWarnings("unchecked") - public List<SResult> searchByQuery(String index, String type, String query, String queryOperator, String rankOption) { - boolean exists = es.getClient().admin().indices().prepareExists(index).execute().actionGet().isExists(); - if (!exists) { - return new ArrayList<>(); - } - - SortOrder order = null; - String sortFiled = ""; - switch (rankOption) { - case "Rank-AllTimePopularity": - sortFiled = "Dataset-AllTimePopularity"; - order = SortOrder.DESC; - break; - case "Rank-MonthlyPopularity": - sortFiled = "Dataset-MonthlyPopularity"; - order = SortOrder.DESC; - break; - case "Rank-UserPopularity": - sortFiled = "Dataset-UserPopularity"; - order = SortOrder.DESC; - break; - case "Rank-LongName-Full": - sortFiled = "Dataset-LongName.raw"; - order = SortOrder.ASC; - break; - case "Rank-ShortName-Full": - sortFiled = "Dataset-ShortName.raw"; - order = SortOrder.ASC; - break; - case "Rank-GridSpatialResolution": - sortFiled = "Dataset-GridSpatialResolution"; - order = SortOrder.DESC; - break; - case "Rank-SatelliteSpatialResolution": - sortFiled = "Dataset-SatelliteSpatialResolution"; - order = SortOrder.DESC; - break; - case "Rank-StartTimeLong-Long": - sortFiled = "DatasetCoverage-StartTimeLong-Long"; - order = SortOrder.ASC; - break; - case "Rank-StopTimeLong-Long": - sortFiled = "DatasetCoverage-StopTimeLong-Long"; - order = SortOrder.DESC; - break; - default: - sortFiled = "Dataset-ShortName.raw"; - order = SortOrder.ASC; - break; - } - - Dispatcher dp = new Dispatcher(this.getConfig(), this.getES(), null); - BoolQueryBuilder qb = dp.createSemQuery(query, 1.0, queryOperator); - List<SResult> resultList = new ArrayList<>(); - - SearchRequestBuilder builder = es.getClient().prepareSearch(index).setTypes(type).setQuery(qb).addSort(sortFiled, order).setSize(500).setTrackScores(true); - SearchResponse response = builder.execute().actionGet(); - - for (SearchHit hit : response.getHits().getHits()) { - Map<String, Object> result = hit.getSource(); - Double relevance = Double.valueOf(NDForm.format(hit.getScore())); - String shortName = (String) result.get("Dataset-ShortName"); - String longName = (String) result.get("Dataset-LongName"); - - ArrayList<String> topicList = (ArrayList<String>) result.get("DatasetParameter-Variable"); - String topic = ""; - if (null != topicList) { - topic = String.join(", ", topicList); - } - String content = (String) result.get("Dataset-Description"); - - if (!"".equals(content)) { - int maxLength = (content.length() < MAX_CHAR) ? content.length() : MAX_CHAR; - content = content.trim().substring(0, maxLength - 1) + "..."; - } - - ArrayList<String> longdate = (ArrayList<String>) result.get("DatasetCitation-ReleaseDateLong"); - Date date = new Date(Long.valueOf(longdate.get(0)).longValue()); - SimpleDateFormat df2 = new SimpleDateFormat("MM/dd/yyyy"); - String dateText = df2.format(date); - - // start date - Long start = (Long) result.get("DatasetCoverage-StartTimeLong-Long"); - Date startDate = new Date(start); - String startDateTxt = df2.format(startDate); - - // end date - String end = (String) result.get("Dataset-DatasetCoverage-StopTimeLong"); - String endDateTxt = ""; - if ("".equals(end)) { - endDateTxt = "Present"; - } else { - Date endDate = new Date(Long.valueOf(end)); - endDateTxt = df2.format(endDate); - } - - String processingLevel = (String) result.get("Dataset-ProcessingLevel"); - Double proNum = getProLevelNum(processingLevel); - - Double userPop = getPop(((Integer) result.get("Dataset-UserPopularity")).doubleValue()); - Double allPop = getPop(((Integer) result.get("Dataset-AllTimePopularity")).doubleValue()); - Double monthPop = getPop(((Integer) result.get("Dataset-MonthlyPopularity")).doubleValue()); - - List<String> sensors = (List<String>) result.get("DatasetSource-Sensor-ShortName"); - - SResult re = new SResult(shortName, longName, topic, content, dateText); - - SResult.set(re, "term", relevance); - SResult.set(re, "releaseDate", Long.valueOf(longdate.get(0)).doubleValue()); - SResult.set(re, "processingLevel", processingLevel); - SResult.set(re, "processingL", proNum); - SResult.set(re, "userPop", userPop); - SResult.set(re, "allPop", allPop); - SResult.set(re, "monthPop", monthPop); - SResult.set(re, "startDate", startDateTxt); - SResult.set(re, "endDate", endDateTxt); - SResult.set(re, "sensors", String.join(", ", sensors)); - - QueryBuilder queryLabelSearch = QueryBuilders.boolQuery().must(QueryBuilders.termQuery("query", query)).must(QueryBuilders.termQuery("dataID", shortName)); - SearchResponse labelRes = es.getClient().prepareSearch(index).setTypes("trainingranking").setQuery(queryLabelSearch).setSize(5).execute().actionGet(); - String labelString = null; - for (SearchHit label : labelRes.getHits().getHits()) { - Map<String, Object> labelItem = label.getSource(); - labelString = (String) labelItem.get("label"); - } - SResult.set(re, "label", labelString); - resultList.add(re); - } - - return resultList; - } - - /** - * Method of semantic search to generate JSON string - * - * @param index index name in Elasticsearch - * @param type type name in Elasticsearch - * @param query regular query string - * @param queryOperator query mode- query, or, and - * @param rr selected ranking method - * @return search results - */ - public String ssearch(String index, String type, String query, String queryOperator, String rankOption, Ranker rr) { - List<SResult> li = searchByQuery(index, type, query, queryOperator, rankOption); - if ("Rank-SVM".equals(rankOption)) { - li = rr.rank(li); - } - Gson gson = new Gson(); - List<JsonObject> fileList = new ArrayList<>(); - - for (int i = 0; i < li.size(); i++) { - JsonObject file = new JsonObject(); - file.addProperty("Short Name", (String) SResult.get(li.get(i), "shortName")); - file.addProperty("Long Name", (String) SResult.get(li.get(i), "longName")); - file.addProperty("Topic", (String) SResult.get(li.get(i), "topic")); - file.addProperty("Description", (String) SResult.get(li.get(i), "description")); - file.addProperty("Release Date", (String) SResult.get(li.get(i), "relase_date")); - fileList.add(file); - - file.addProperty("Start/End Date", (String) SResult.get(li.get(i), "startDate") + " - " + (String) SResult.get(li.get(i), "endDate")); - file.addProperty("Processing Level", (String) SResult.get(li.get(i), "processingLevel")); - - file.addProperty("Sensor", (String) SResult.get(li.get(i), "sensors")); - } - JsonElement fileListElement = gson.toJsonTree(fileList); - - JsonObject pDResults = new JsonObject(); - pDResults.add("PDResults", fileListElement); - return pDResults.toString(); - } -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/package-info.java deleted file mode 100644 index da6bea3..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/package-info.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This package includes classes for semantic search, such as click stream importer, - * query dispatcher, semantic searcher, and ranker (ranksvm, ordinal/linear regression) - */ -package gov.nasa.jpl.mudrod.ssearch; \ No newline at end of file
