http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/DataGenerator.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/DataGenerator.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/DataGenerator.java deleted file mode 100644 index 598ad61..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/DataGenerator.java +++ /dev/null @@ -1,313 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.ssearch.ranking; - -import au.com.bytecode.opencsv.CSVReader; -import au.com.bytecode.opencsv.CSVWriter; - -import java.io.*; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; - -/** - * SVMData is a program designed to create appropriate input data for the RankSVM - * algorithm that involves Pairwise Classification. Specifically, instead of working in - * the space of query-document vectors, e.g. x1, x2, x3, we transform them into a new space - * in which a pair of documents is represented as the difference between their feature vectors. 
- */ -public class DataGenerator { - private static String mySourceDir; - private static String myResultDir; - private static boolean isMultFiles; - - private static String[] myHeader; - private static List<List<String>> myMasterList = new ArrayList<List<String>>(); - - // HashMap used for comparing evaluation classes - public static final HashMap<String, Integer> map1 = new HashMap<String, Integer>(); - - static { - map1.put("Excellent", 7); - map1.put("Very good", 6); - map1.put("Good", 5); - map1.put("OK", 4); - map1.put("Bad", 3); - map1.put("Very bad", 2); - map1.put("Terrible", 1); - } - - /** - * Constructor which takes in path containing one or multiple files to process. - * Also takes in argument specifying whether or not a single file needs to be processed, - * or multiple files need to be processed. - * - * @param sourceDir directory containing single file or multiple files to be processed - * @param resultDir output folder - * @param multFiles true if multiple files in directory need to be processed and false if - * only a single file needs to be processed - */ - public DataGenerator(String sourceDir, String resultDir, boolean multFiles) { - mySourceDir = sourceDir; - myResultDir = resultDir; - isMultFiles = multFiles; - } - - /** - * Responsible for invoking the processing of data file(s) and their subsequent storage - * into a user specified directory. - */ - public void process() { - parseFile(); - writeCSVfile(myMasterList); - } - - /** - * Parses the original user-specified CSV file, storing the contents for future calculations - * and formatting. 
- */ - public static void parseFile() { - String[][] dataArr = null; - try { - String sourceDir = mySourceDir; - - if (isMultFiles == true) // Case where multiple files have to be processed - { - // Iterate over files in directory - File directory = new File(sourceDir); - File[] directoryListing = directory.listFiles(); - - if (directoryListing != null) { - for (File child : directoryListing) { - CSVReader csvReader = new CSVReader(new FileReader(child)); - List<String[]> list = csvReader.readAll(); - - // Store into 2D array by transforming array list to normal array - dataArr = new String[list.size()][]; - dataArr = list.toArray(dataArr); - - calculateVec(dataArr); - - csvReader.close(); - } - storeHead(dataArr); // Store the header - } - } else // Process only one file - { - File file = new File(sourceDir); - - if (file != null) { - CSVReader csvReader = new CSVReader(new FileReader(file)); - List<String[]> list = csvReader.readAll(); - - // Store into 2D array by transforming array list to normal array - dataArr = new String[list.size()][]; - dataArr = list.toArray(dataArr); - - storeHead(dataArr); // Store the header - calculateVec(dataArr); - - csvReader.close(); - } - } - } catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - /** - * Performs the necessary vector calculations on each possible combination of vectors, - * also storing a value that indicates the evaluation. 
- * - * @param arr the parsed contents of the original CSV file - */ - public static void calculateVec(String[][] arr) { - List<List<String>> listofLists = new ArrayList<List<String>>(); // Holds calculations - - int rowStart = 1; - for (int row = rowStart; row < arr.length; row++) // Start at row 1 because row 0 is heading lol - { - for (int i = 1; i < arr.length - row; i++) { - List<String> colList = new ArrayList<String>(); // create vector to store all values inside of a column, which is stored inside 2D vector - for (int col = 0; col < arr[0].length - 1; col++) // Columns go until the next to last column - { - //System.out.println(col + " " + arr[row][col]); - // Extract double value from each cell - double x1 = Double.parseDouble(arr[row][col]); - double x2 = Double.parseDouble(arr[row + i][col]); - - // Perform calculation for each cell - double result = x1 - x2; - - // Convert this double value into string, and store inside array list - String strResult = Double.toString(result); - colList.add(strResult); - } - - // Finally, add either 1, -1, or do not add row at all when encountering evaluation value - int addEvalNum = compareEvaluation(arr[row][arr[0].length - 1], arr[row + i][arr[0].length - 1]); - if (addEvalNum == 1) { - colList.add("1"); - listofLists.add(colList); // Add this list to 2D list - row is finished now, move on - } else if (addEvalNum == -1) { - colList.add("-1"); - listofLists.add(colList); // Add this list to 2D list - row is finished now, move on - } - // Else, they are equal, do not even add this row to 2D vector - } - } - - // After all processing takes place, send to method that recreates data with equal # of 1's and -1's - List<List<String>> equalizedList = equalizeList(listofLists); - myMasterList.addAll(equalizedList); - } - - /** - * Taking in two vector evaluation parameters, compares these two evaluations, returning a 1 - * if the first evaluation is greater than the second, a -1 in the case the first evaluation is - * less 
than the second, and a 10 in the case that the two are equal, meaning this vector will - * not be used. - * - * @param eval1 evaluation from first vector - * @param eval2 evaluation from second vector - * @return 1 if first evaluation is greater than the second, -1 if first evaluation is less than the second, and - * 10 in the case that the two are equal - */ - public static int compareEvaluation(String eval1, String eval2) { - int evalNum1 = map1.get(eval1); - int evalNum2 = map1.get(eval2); - - if (evalNum1 > evalNum2) // ">" means it is more relevant - assign a 1 - { - return 1; - } else if (evalNum1 < evalNum2) { - return -1; - } else { - return 10; // Return 10 if they are equal - signifies you cannot use the row - } - } - - /** - * After vector calculations and new evaluation values are set, produces refined output data such that - * there is an equal or close to equal number of rows containing both "1" and "-1" as the new evaluation value. - * - * @param rawList originally calculated data from the input CSV file - * @return data that has an equal distribution of evaluation values - */ - public static List<List<String>> equalizeList(List<List<String>> rawList) { - // Create two sets - one containing row index for +1 and the other for -1 - List<Integer> pos1List = new ArrayList<Integer>(); - List<Integer> neg1List = new ArrayList<Integer>(); - - for (int i = 0; i < rawList.size(); i++) // Iterate through all rows to get indexes - { - int evalNum = Integer.parseInt(rawList.get(i).get(rawList.get(0).size() - 1)); // Get 1 or -1 from original array list - if (evalNum == 1) { - pos1List.add(i); // Add row index that has 1 - } else if (evalNum == -1) { - neg1List.add(i); // Add row index that has -1 - } - } - - int totPosCount = pos1List.size(); // Total # of 1's - int totNegCount = neg1List.size(); // Total # of -1's - - if ((totPosCount - totNegCount) >= 1) // There are more 1's than -1's, equalize them - { - int indexOfPosList = 0; // Start getting indexes from 
the first index of positive index location list - while ((totPosCount - totNegCount) >= 1) // Keep going until we have acceptable amount of both +1 and -1 - { - int pos1IndexVal = pos1List.get(indexOfPosList); // Get index from previously made list of indexes - for (int col = 0; col < rawList.get(0).size(); col++) // Go through elements of indexed row, negating it to transform to -1 row - { - double d = Double.parseDouble(rawList.get(pos1IndexVal).get(col)); // Transform to double first - d = d * -1; // Negate it - String negatedValue = Double.toString(d); // Change back to String - rawList.get(pos1IndexVal).set(col, negatedValue);// Put this value back into dat row - } - - totPosCount--; // We changed a +1 row to a -1 row, decrement count of positives - totNegCount++; // Increment count of negatives - indexOfPosList++; // Get next +1 location in raw data - } - } else if ((totNegCount - totPosCount) > 1) // There are more -1's than 1's, equalize them - { - int indexOfNegList = 0; - while ((totNegCount - totPosCount) > 1) // Keep going until we have acceptable amount of both +1 and -1 - { - int neg1IndexVal = neg1List.get(indexOfNegList); // Get index from previously made list of indexes - for (int col = 0; col < rawList.get(0).size(); col++) // Go through elements of indexed row, negating it to transform to +1 row - { - double d = Double.parseDouble(rawList.get(neg1IndexVal).get(col)); // Transform to double first - d = d * -1; // Negate it - String negatedValue = Double.toString(d); // Change back to String - rawList.get(neg1IndexVal).set(col, negatedValue);// Put this value back into dat row - } - - totNegCount--; // We changed a -1 row to a +1 row, decrement count of negatives now - totPosCount++; // Increment count of positives - indexOfNegList++; // Get next -1 location in raw data - } - } else { - // Do nothing - rows are within acceptable equality bounds of plus or minus 1 - } - - return rawList; - } - - /** - * Retrieves the heading from a file to be 
processed so it can be written to the output file later. - * - * @param arr 2D array containing the parsed information from input file - */ - public static void storeHead(String[][] arr) { - myHeader = new String[arr[0].length]; // Reside private variable - - for (int col = 0; col < arr[0].length; col++) { - myHeader[col] = arr[0][col]; - } - } - - /** - * Writes newly calculated and equally distributed vector data to user specified CSV file. - * - * @param list finalized vector data to write to user specified output file - */ - public static void writeCSVfile(List<List<String>> list) { - String outputFile = myResultDir; - boolean alreadyExists = new File(outputFile).exists(); - - try { - CSVWriter csvOutput = new CSVWriter(new FileWriter(outputFile), ','); // Create new instance of CSVWriter to write to file output - - if (!alreadyExists) { - csvOutput.writeNext(myHeader); // Write the text headers first before data - - for (int i = 0; i < list.size(); i++) // Iterate through all rows in 2D array - { - String[] temp = new String[list.get(i).size()]; // Convert row array list in 2D array to regular string array - temp = list.get(i).toArray(temp); - csvOutput.writeNext(temp); // Write this array to the file - } - } - - csvOutput.close(); // Close csvWriter - } catch (IOException e) { - e.printStackTrace(); - } - } - -}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/Evaluator.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/Evaluator.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/Evaluator.java deleted file mode 100644 index 8edb6ad..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/Evaluator.java +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package gov.nasa.jpl.mudrod.ssearch.ranking; - -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -/** - * Supports ability to evaluating ranking results - */ -public class Evaluator { - /** - * Method of calculating NDCG score - * - * @param list a list of integer with each integer element indicating - * the performance at its position - * @param K the number of elements needed to be included in the calculation - * @return NDCG score - */ - public double getNDCG(int[] list, int K) { - double dcg = this.getDCG(list, K); - double idcg = this.getIDCG(list, K); - double ndcg = 0.0; - if (idcg > 0.0) { - ndcg = dcg / idcg; - } - return ndcg; - } - - /** - * Method of getting the precision of a list at position K - * - * @param list a list of integer with each integer element indicating - * the performance at its position - * @param K the number of elements needed to be included in the calculation - * @return precision at K - */ - public double getPrecision(int[] list, int K) { - int size = list.length; - if (size == 0 || K == 0) { - return 0; - } - - if (K > size) { - K = size; - } - - int rel_doc_num = this.getRelevantDocNum(list, K); - double precision = (double) rel_doc_num / (double) K; - return precision; - } - - /** - * Method of getting the number of relevant element in a ranking results - * - * @param list a list of integer with each integer element indicating - * the performance at its position - * @param K the number of elements needed to be included in the calculation - * @return the number of relevant element - */ - private int getRelevantDocNum(int[] list, int K) { - int size = list.length; - if (size == 0 || K == 0) { - return 0; - } - - if (K > size) { - K = size; - } - - int rel_num = 0; - for (int i = 0; i < K; i++) { - if (list[i] > 3) { // 3 refers to "OK" - rel_num++; - } - } - return rel_num; - } - - /** - * Method of calculating DCG 
score from a list of ranking results - * - * @param list a list of integer with each integer element indicating - * the performance at its position - * @param K the number of elements needed to be included in the calculation - * @return DCG score - */ - private double getDCG(int[] list, int K) { - int size = list.length; - if (size == 0 || K == 0) { - return 0.0; - } - - if (K > size) { - K = size; - } - - double dcg = list[0]; - for (int i = 1; i < K; i++) { - int rel = list[i]; - int pos = i + 1; - double rel_log = Math.log(pos) / Math.log(2); - dcg += rel / rel_log; - } - return dcg; - } - - /** - * Method of calculating ideal DCG score from a list of ranking results - * - * @param list a list of integer with each integer element indicating - * the performance at its position - * @param K the number of elements needed to be included in the calculation - * @return IDCG score - */ - private double getIDCG(int[] list, int K) { - Comparator<Integer> comparator = new Comparator<Integer>() { - @Override - public int compare(Integer o1, Integer o2) { - return o2.compareTo(o1); - } - }; - List<Integer> sortlist = IntStream.of(list).boxed().collect(Collectors.toList()); - ; - Collections.sort(sortlist, comparator); - int[] sortedArr = sortlist.stream().mapToInt(i -> i).toArray(); - double idcg = this.getDCG(sortedArr, K); - return idcg; - } - -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/Learner.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/Learner.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/Learner.java deleted file mode 100644 index d1c5199..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/Learner.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in 
compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package gov.nasa.jpl.mudrod.ssearch.ranking;

import gov.nasa.jpl.mudrod.driver.SparkDriver;
import org.apache.spark.SparkContext;
import org.apache.spark.mllib.classification.SVMModel;
import org.apache.spark.mllib.regression.LabeledPoint;

import java.io.Serializable;

/**
 * Supports loading a trained classifier into memory and classifying instances with it.
 */
public class Learner implements Serializable {
  private static final long serialVersionUID = 1L;
  private static final String SPARKSVM = "SparkSVM";
  // Stays null when the requested classifier type is not "SparkSVM"
  SVMModel model = null;
  // Not serialized together with the learner
  transient SparkContext sc = null;

  /**
   * Constructor to load in spark SVM classifier. Only the classifier type "SparkSVM" is
   * recognised; for any other name (including null) no model is loaded.
   *
   * @param classifierName classifier type
   * @param skd            an instance of spark driver
   * @param svmSgdModel    path to a trained model
   */
  public Learner(String classifierName, SparkDriver skd, String svmSgdModel) {
    // Constant-first comparison: a null classifier name no longer throws
    if (SPARKSVM.equals(classifierName)) {
      sc = skd.sc.sc();
      sc.addFile(svmSgdModel, true);
      model = SVMModel.load(sc, svmSgdModel);
    }
  }

  /**
   * Method of classifying instance
   *
   * @param p the instance that needs to be classified
   * @return the class id predicted by the loaded model
   * @throws IllegalStateException if no model was loaded by the constructor
   */
  public double classify(LabeledPoint p) {
    if (model == null) {
      throw new IllegalStateException("No classifier model loaded; only '" + SPARKSVM + "' is supported");
    }
    return model.predict(p.features());
  }

}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/SparkFormatter.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/SparkFormatter.java
b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/SparkFormatter.java
deleted file mode 100644
index ba46d41..0000000
--- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/SparkFormatter.java
+++ /dev/null
@@ -1,55 +0,0 @@
package gov.nasa.jpl.mudrod.ssearch.ranking;

import java.io.*;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.Locale;

/**
 * Converts the pairwise CSV training data into the LibSVM text format read by Spark.
 */
public class SparkFormatter {
  // Locale.ROOT pins '.' as decimal separator; with the default locale some systems
  // would write "2,5", which is not valid in a LibSVM file.
  DecimalFormat NDForm = new DecimalFormat("#.###", DecimalFormatSymbols.getInstance(Locale.ROOT));

  public SparkFormatter() {
  }

  /**
   * Rewrites a CSV file as LibSVM text: the first line (header) is skipped, the last
   * column is the label (-1 becomes "0 ", 1 becomes "1 ", any other value gets no label
   * prefix, as before) and every other column becomes a 1-based "index:value" pair with
   * at most three decimals. Double quotes around values are stripped. An existing output
   * file is overwritten. I/O failures are reported on stderr.
   *
   * @param inputCSVFileName  path of the CSV file to read
   * @param outputTXTFileName path of the LibSVM text file to write
   */
  public void toSparkSVMformat(String inputCSVFileName, String outputTXTFileName) {
    // The reader is opened first so a missing input no longer wipes the output file;
    // both streams are closed even when a line fails to convert.
    try (BufferedReader br = new BufferedReader(new FileReader(inputCSVFileName));
        BufferedWriter bw = new BufferedWriter(new FileWriter(outputTXTFileName))) {
      br.readLine(); // skip the header line
      for (String line = br.readLine(); line != null; line = br.readLine()) {
        String[] list = line.split(",");
        if (line.trim().isEmpty() || list.length == 0) {
          continue; // blank (e.g. trailing) lines carry no data
        }

        StringBuilder output = new StringBuilder();
        double label = Double.parseDouble(list[list.length - 1].replace("\"", ""));
        if (label == -1.0) {
          output.append("0 ");
        } else if (label == 1.0) {
          output.append("1 ");
        }

        for (int i = 0; i < list.length - 1; i++) {
          int index = i + 1; // LibSVM feature indexes are 1-based
          output.append(index).append(":").append(NDForm.format(Double.parseDouble(list[i].replace("\"", "")))).append(" ");
        }
        bw.write(output.append("\n").toString());
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  public static void main(String[] args) {
    SparkFormatter sf = new SparkFormatter();
    sf.toSparkSVMformat("C:/mudrodCoreTestData/rankingResults/inputDataForSVM.csv", "C:/mudrodCoreTestData/rankingResults/inputDataForSVM_spark.txt");
  }

}
http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/SparkSVM.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/SparkSVM.java
b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/SparkSVM.java deleted file mode 100644 index 1ddebf3..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/SparkSVM.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.ssearch.ranking; - -import gov.nasa.jpl.mudrod.main.MudrodEngine; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.classification.SVMModel; -import org.apache.spark.mllib.classification.SVMWithSGD; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; - -public class SparkSVM { - - private SparkSVM() { - //public constructor - } - - public static void main(String[] args) { - MudrodEngine me = new MudrodEngine(); - - JavaSparkContext jsc = me.startSparkDriver().sc; - - String path = SparkSVM.class.getClassLoader().getResource("inputDataForSVM_spark.txt").toString(); - JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD(); - - // Run training algorithm to build the model. 
- int numIterations = 100; - final SVMModel model = SVMWithSGD.train(data.rdd(), numIterations); - - // Save and load model - model.save(jsc.sc(), SparkSVM.class.getClassLoader().getResource("javaSVMWithSGDModel").toString()); - - jsc.sc().stop(); - - } - -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/TrainingImporter.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/TrainingImporter.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/TrainingImporter.java deleted file mode 100644 index ae48b55..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/TrainingImporter.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package gov.nasa.jpl.mudrod.ssearch.ranking; - -import gov.nasa.jpl.mudrod.discoveryengine.MudrodAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import gov.nasa.jpl.mudrod.main.MudrodConstants; -import org.elasticsearch.action.index.IndexRequest; -import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.index.query.QueryBuilders; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.Properties; - -import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; - -/** - * Supports the ability to importing training set into Elasticsearch - */ -public class TrainingImporter extends MudrodAbstract { - /** - * - */ - private static final long serialVersionUID = 1L; - - public TrainingImporter(Properties props, ESDriver es, SparkDriver spark) { - super(props, es, spark); - es.deleteAllByQuery(props.getProperty(MudrodConstants.ES_INDEX_NAME), "trainingranking", QueryBuilders.matchAllQuery()); - addMapping(); - } - - /** - * Method of adding mapping to traning set type - */ - public void addMapping() { - XContentBuilder Mapping; - try { - Mapping = jsonBuilder().startObject().startObject("trainingranking").startObject("properties").startObject("query").field("type", "string").field("index", "not_analyzed").endObject() - .startObject("dataID").field("type", "string").field("index", "not_analyzed").endObject().startObject("label").field("type", "string").field("index", "not_analyzed").endObject().endObject() - .endObject().endObject(); - - es.getClient().admin().indices().preparePutMapping(props.getProperty("indexName")).setType("trainingranking").setSource(Mapping).execute().actionGet(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - /** - * Method of importing training set in to Elasticsearch - * - * @param dataFolder the path to the traing set - * @throws IOException IOException - 
*/ - public void importTrainingSet(String dataFolder) throws IOException { - es.createBulkProcessor(); - - File[] files = new File(dataFolder).listFiles(); - for (File file : files) { - BufferedReader br = new BufferedReader(new FileReader(file.getAbsolutePath())); - br.readLine(); - String line = br.readLine(); - while (line != null) { - String[] list = line.split(","); - String query = file.getName().replace(".csv", ""); - if (list.length > 0) { - IndexRequest ir = new IndexRequest(props.getProperty("indexName"), "trainingranking") - .source(jsonBuilder().startObject().field("query", query).field("dataID", list[0]).field("label", list[list.length - 1]).endObject()); - es.getBulkProcessor().add(ir); - } - line = br.readLine(); - } - br.close(); - } - es.destroyBulkProcessor(); - } -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/package-info.java deleted file mode 100644 index 205e7a7..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/ranking/package-info.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -/** - * This package includes classes for importing training data, ML models, - * generating input data for RankSVM, and evaluating ranking results - */ -package gov.nasa.jpl.mudrod.ssearch.ranking; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/structure/SResult.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/structure/SResult.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/structure/SResult.java deleted file mode 100644 index 33b6233..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/structure/SResult.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package gov.nasa.jpl.mudrod.ssearch.structure; - -import java.lang.reflect.Field; - -/** - * Data structure class for search result - */ -public class SResult { - public static final String rlist[] = { "term_score", "releaseDate_score", /*"versionNum_score",*/ - "processingL_score", "allPop_score", "monthPop_score", "userPop_score"/*, "termAndv_score"*/ }; - String shortName = null; - String longName = null; - String topic = null; - String description = null; - String relase_date = null; - - public Double final_score = 0.0; - public Double term_score = 0.0; - public Double releaseDate_score = 0.0; - public Double versionNum_score = 0.0; - public Double processingL_score = 0.0; - public Double click_score = 0.0; - public Double allPop_score = 0.0; - public Double monthPop_score = 0.0; - public Double userPop_score = 0.0; - public Double termAndv_score = 0.0; - public Integer below = 0; - - public Double Dataset_LongName_score = null; - public Double Dataset_Metadata_score = null; - public Double DatasetParameter_Term_score = null; - public Double DatasetSource_Source_LongName_score = null; - public Double DatasetSource_Sensor_LongName_score = null; - - public String version = null; - public String processingLevel = null; - public String latency = null; - public String stopDateLong = null; - public String stopDateFormat = null; - public Double spatialR_Sat = null; - public Double spatialR_Grid = null; - public String temporalR = null; - - public Double releaseDate = null; - public Double click = null; - public Double term = null; - public Double versionNum = null; - public Double processingL = null; - public Double allPop = null; - public Double monthPop = null; - public Double userPop = null; - public Double termAndv = null; - - public Double Dataset_LongName = null; - public Double Dataset_Metadata = null; - public Double DatasetParameter_Term = null; - public Double DatasetSource_Source_LongName = null; - public Double DatasetSource_Sensor_LongName = null; - 
- public Double prediction = 0.0; - public String label = null; - - //add by quintinali - public String startDate; - public String endDate; - public String sensors; - - /** - * @param shortName short name of dataset - * @param longName long name of dataset - * @param topic topic of dataset - * @param description description of dataset - * @param date release date of dataset - */ - public SResult(String shortName, String longName, String topic, String description, String date) { - this.shortName = shortName; - this.longName = longName; - this.topic = topic; - this.description = description; - this.relase_date = date; - } - - public SResult(SResult sr) { - for (int i = 0; i < rlist.length; i++) { - set(this, rlist[i], get(sr, rlist[i])); - } - } - - /** - * Method of getting export header - * - * @param delimiter the delimiter used to separate strings - * @return header - */ - public static String getHeader(String delimiter) { - String str = ""; - for (int i = 0; i < rlist.length; i++) { - str += rlist[i] + delimiter; - } - str = str + "label" + "\n"; - return "ShortName" + delimiter + "below" + delimiter + str; - } - - /** - * Method of get a search results as string - * - * @param delimiter the delimiter used to separate strings - * @return search result as string - */ - public String toString(String delimiter) { - String str = ""; - for (int i = 0; i < rlist.length; i++) { - double score = get(this, rlist[i]); - str += score + delimiter; - } - str = str + label + "\n"; - return shortName + delimiter + below + delimiter + str; - } - - /** - * Generic setter method - * - * @param object instance of SResult - * @param fieldName field name that needs to be set on - * @param fieldValue field value that needs to be set to - * @return 1 means success, and 0 otherwise - */ - public static boolean set(Object object, String fieldName, Object fieldValue) { - Class<?> clazz = object.getClass(); - while (clazz != null) { - try { - Field field = 
clazz.getDeclaredField(fieldName); - field.setAccessible(true); - field.set(object, fieldValue); - return true; - } catch (NoSuchFieldException e) { - clazz = clazz.getSuperclass(); - } catch (Exception e) { - throw new IllegalStateException(e); - } - } - return false; - } - - /** - * Generic getter method - * - * @param object instance of SResult - * @param fieldName field name of search result - * @param <V> data type - * @return the value of the filed in the object - */ - @SuppressWarnings("unchecked") - public static <V> V get(Object object, String fieldName) { - Class<?> clazz = object.getClass(); - while (clazz != null) { - try { - Field field = clazz.getDeclaredField(fieldName); - field.setAccessible(true); - return (V) field.get(object); - } catch (NoSuchFieldException e) { - clazz = clazz.getSuperclass(); - } catch (Exception e) { - throw new IllegalStateException(e); - } - } - return null; - } - -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/structure/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/structure/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/structure/package-info.java deleted file mode 100644 index a0f9ce5..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/ssearch/structure/package-info.java +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This package includes data structure needed for ranking process - */ -package gov.nasa.jpl.mudrod.ssearch.structure; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/ESTransportClient.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/ESTransportClient.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/ESTransportClient.java deleted file mode 100644 index 151ac8d..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/ESTransportClient.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package gov.nasa.jpl.mudrod.utils; - -import org.elasticsearch.client.transport.TransportClient; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.index.reindex.ReindexPlugin; -import org.elasticsearch.percolator.PercolatorPlugin; -import org.elasticsearch.plugins.Plugin; -import org.elasticsearch.script.mustache.MustachePlugin; -import org.elasticsearch.transport.Netty3Plugin; - -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; - -/** - * A builder to create an instance of {@link TransportClient} This class - * pre-installs the {@link Netty3Plugin}, for the client. 
These plugins are all - * elasticsearch core modules required. - */ -@SuppressWarnings({ "unchecked", "varargs" }) -public class ESTransportClient extends TransportClient { - - private static final Collection<Class<? extends Plugin>> PRE_INSTALLED_PLUGINS = Collections - .unmodifiableList(Arrays.asList(ReindexPlugin.class, PercolatorPlugin.class, MustachePlugin.class, Netty3Plugin.class)); - - @SafeVarargs - public ESTransportClient(Settings settings, Class<? extends Plugin>... plugins) { - this(settings, Arrays.asList(plugins)); - } - - public ESTransportClient(Settings settings, Collection<Class<? extends Plugin>> plugins) { - super(settings, Settings.EMPTY, addPlugins(plugins, PRE_INSTALLED_PLUGINS), null); - - } - - @Override - public void close() { - super.close(); - } - -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/HttpRequest.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/HttpRequest.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/HttpRequest.java deleted file mode 100644 index be0a46d..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/HttpRequest.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package gov.nasa.jpl.mudrod.utils; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedReader; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.HttpURLConnection; -import java.net.URL; - -/** - * ClassName: HttpRequest - * Function: Http request tool. - */ -public class HttpRequest { - - private static final Logger LOG = LoggerFactory.getLogger(HttpRequest.class); - - public HttpRequest() { - } - - public String getRequest(String requestUrl) { - String line = null; - try { - URL url = new URL(requestUrl); - - HttpURLConnection connection = (HttpURLConnection) url.openConnection(); - connection.setDoOutput(true); - - connection.setConnectTimeout(5000); - connection.setReadTimeout(5000); - int code = connection.getResponseCode(); - if (code != HttpURLConnection.HTTP_OK) { - line = "{\"exception\":\"Service failed\"}"; - LOG.info(line); - } else { - InputStream content = connection.getInputStream(); - BufferedReader in = new BufferedReader(new InputStreamReader(content)); - line = in.readLine(); - } - } catch (Exception e) { - line = "{\"exception\":\"No service was found\"}"; - LOG.error(line); - } - return line; - } -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/LabeledRowMatrix.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/LabeledRowMatrix.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/LabeledRowMatrix.java deleted file mode 100644 index d1d144b..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/LabeledRowMatrix.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.utils; - -import org.apache.spark.mllib.linalg.distributed.RowMatrix; - -import java.util.List; - -/** - * ClassName: LabeledRowMatrix - * Function: LabeledRowMatrix strut. - */ -public class LabeledRowMatrix { - - // words: matrix row titles - public List<String> rowkeys; - // docs: matrix column titles - public List<String> colkeys; - // wordDocMatrix: a matrix in which each row is corresponding to a term and - // each column is a doc. - public RowMatrix rowMatrix; - - public LabeledRowMatrix() { - // TODO Auto-generated constructor stub - } - -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/LinkageTriple.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/LinkageTriple.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/LinkageTriple.java deleted file mode 100644 index 90b1568..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/LinkageTriple.java +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.utils; - -import gov.nasa.jpl.mudrod.driver.ESDriver; -import org.elasticsearch.action.index.IndexRequest; -import org.elasticsearch.action.search.SearchRequestBuilder; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.action.update.UpdateRequest; -import org.elasticsearch.common.unit.TimeValue; -import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.aggregations.AggregationBuilders; -import org.elasticsearch.search.aggregations.bucket.terms.Terms; -import org.elasticsearch.search.sort.SortOrder; - -import java.io.IOException; -import java.io.Serializable; -import java.text.DecimalFormat; -import java.util.List; -import java.util.Map; - -import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; - -/** - * ClassName: LinkageTriple Function: Vocabulary linkage operations - */ -public class LinkageTriple implements Serializable { - - /** - * - */ - private static final long serialVersionUID = 1L; - // keyAId: ID of term A - public long keyAId; - // keyBId: ID of term B - public long keyBId; - // weight: The relationship between term A and Term B - public double weight; - // keyA: TermA - public String keyA; - // keyB: TermB - public String keyB; - // df: Format number - public static DecimalFormat df = new DecimalFormat("#.00"); - - public LinkageTriple() { - // TODO Auto-generated constructor stub - } - - /** - * TODO Output linkage 
triples in string format. - * - * @see java.lang.Object#toString() - */ - @Override - public String toString() { - return keyA + "," + keyB + ":" + weight; - } - - public static void insertTriples(ESDriver es, List<LinkageTriple> triples, String index, String type) throws IOException { - LinkageTriple.insertTriples(es, triples, index, type, false, false); - } - - public static void insertTriples(ESDriver es, List<LinkageTriple> triples, String index, String type, Boolean bTriple, boolean bSymmetry) throws IOException { - es.deleteType(index, type); - if (bTriple) { - LinkageTriple.addMapping(es, index, type); - } - - if (triples == null) { - return; - } - - es.createBulkProcessor(); - int size = triples.size(); - for (int i = 0; i < size; i++) { - - XContentBuilder jsonBuilder = jsonBuilder().startObject(); - if (bTriple) { - - jsonBuilder.field("concept_A", triples.get(i).keyA); - jsonBuilder.field("concept_B", triples.get(i).keyB); - - } else { - jsonBuilder.field("keywords", triples.get(i).keyA + "," + triples.get(i).keyB); - } - - jsonBuilder.field("weight", Double.parseDouble(df.format(triples.get(i).weight))); - jsonBuilder.endObject(); - - IndexRequest ir = new IndexRequest(index, type).source(jsonBuilder); - es.getBulkProcessor().add(ir); - - if (bTriple && bSymmetry) { - XContentBuilder symmetryJsonBuilder = jsonBuilder().startObject(); - symmetryJsonBuilder.field("concept_A", triples.get(i).keyB); - symmetryJsonBuilder.field("concept_B", triples.get(i).keyA); - - symmetryJsonBuilder.field("weight", Double.parseDouble(df.format(triples.get(i).weight))); - - symmetryJsonBuilder.endObject(); - - IndexRequest symmetryir = new IndexRequest(index, type).source(symmetryJsonBuilder); - es.getBulkProcessor().add(symmetryir); - } - } - es.destroyBulkProcessor(); - } - - public static void addMapping(ESDriver es, String index, String type) { - XContentBuilder Mapping; - try { - Mapping = 
jsonBuilder().startObject().startObject(type).startObject("properties").startObject("concept_A").field("type", "string").field("index", "not_analyzed").endObject() - .startObject("concept_B").field("type", "string").field("index", "not_analyzed").endObject() - - .endObject().endObject().endObject(); - - es.getClient().admin().indices().preparePutMapping(index).setType(type).setSource(Mapping).execute().actionGet(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - public static void standardTriples(ESDriver es, String index, String type) throws IOException { - es.createBulkProcessor(); - - SearchResponse sr = es.getClient().prepareSearch(index).setTypes(type).setQuery(QueryBuilders.matchAllQuery()).setSize(0) - .addAggregation(AggregationBuilders.terms("concepts").field("concept_A").size(0)).execute().actionGet(); - Terms concepts = sr.getAggregations().get("concepts"); - - for (Terms.Bucket entry : concepts.getBuckets()) { - String concept = (String) entry.getKey(); - double maxSim = LinkageTriple.getMaxSimilarity(es, index, type, concept); - if (maxSim == 1.0) { - continue; - } - - SearchResponse scrollResp = es.getClient().prepareSearch(index).setTypes(type).setScroll(new TimeValue(60000)).setQuery(QueryBuilders.termQuery("concept_A", concept)) - .addSort("weight", SortOrder.DESC).setSize(100).execute().actionGet(); - - while (true) { - for (SearchHit hit : scrollResp.getHits().getHits()) { - Map<String, Object> metadata = hit.getSource(); - double sim = (double) metadata.get("weight"); - double newSim = sim / maxSim; - UpdateRequest ur = es.generateUpdateRequest(index, type, hit.getId(), "weight", Double.parseDouble(df.format(newSim))); - es.getBulkProcessor().add(ur); - } - - scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet(); - if (scrollResp.getHits().getHits().length == 0) { - break; - } - } - } - - es.destroyBulkProcessor(); - } - - private static double 
getMaxSimilarity(ESDriver es, String index, String type, String concept) { - - double maxSim = 1.0; - SearchRequestBuilder builder = es.getClient().prepareSearch(index).setTypes(type).setQuery(QueryBuilders.termQuery("concept_A", concept)).addSort("weight", SortOrder.DESC).setSize(1); - - SearchResponse usrhis = builder.execute().actionGet(); - SearchHit[] hits = usrhis.getHits().getHits(); - if (hits.length == 1) { - SearchHit hit = hits[0]; - Map<String, Object> result = hit.getSource(); - maxSim = (double) result.get("weight"); - } - - if (maxSim == 0.0) { - maxSim = 1.0; - } - - return maxSim; - } -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/MatrixUtil.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/MatrixUtil.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/MatrixUtil.java deleted file mode 100644 index 942f1e0..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/MatrixUtil.java +++ /dev/null @@ -1,488 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package gov.nasa.jpl.mudrod.utils; - -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.Optional; -import org.apache.spark.api.java.function.*; -import org.apache.spark.mllib.feature.IDF; -import org.apache.spark.mllib.feature.IDFModel; -import org.apache.spark.mllib.linalg.*; -import org.apache.spark.mllib.linalg.distributed.IndexedRow; -import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix; -import org.apache.spark.mllib.linalg.distributed.RowMatrix; -import scala.Tuple2; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; -import java.util.stream.Stream; - -/** - * Matrix utility tool - */ -public class MatrixUtil { - - private MatrixUtil() { - } - - /** - * buildSVDMatrix: Generate SVD matrix from TF-IDF matrix. Please make sure - * the TF-IDF matrix has been already built from the original documents. - * - * @param tfidfMatrix, - * each row is a term and each column is a document name and each - * cell is the TF-IDF value of the term in the corresponding - * document. - * @param dimension - * Column number of the SVD matrix - * @return RowMatrix, each row is a term and each column is a dimension in the - * feature space, each cell is value of the term in the corresponding - * dimension. 
- */ - public static RowMatrix buildSVDMatrix(RowMatrix tfidfMatrix, int dimension) { - int matrixCol = (int) tfidfMatrix.numCols(); - if (matrixCol < dimension) { - dimension = matrixCol; - } - - SingularValueDecomposition<RowMatrix, Matrix> svd = tfidfMatrix.computeSVD(dimension, true, 1.0E-9d); - RowMatrix u = svd.U(); - Vector s = svd.s(); - return u.multiply(Matrices.diag(s)); - } - - /** - * buildSVDMatrix: Generate SVD matrix from Vector RDD. - * - * @param vecRDD - * vectors of terms in feature space - * @param dimension - * Column number of the SVD matrix - * @return RowMatrix, each row is a term and each column is a dimension in the - * feature space, each cell is value of the term in the corresponding - * dimension. - */ - public static RowMatrix buildSVDMatrix(JavaRDD<Vector> vecRDD, int dimension) { - RowMatrix tfidfMatrix = new RowMatrix(vecRDD.rdd()); - SingularValueDecomposition<RowMatrix, Matrix> svd = tfidfMatrix.computeSVD(dimension, true, 1.0E-9d); - RowMatrix u = svd.U(); - Vector s = svd.s(); - return u.multiply(Matrices.diag(s)); - } - - /** - * Create TF-IDF matrix from word-doc matrix. - * - * @param wordDocMatrix, - * each row is a term, each column is a document name and each cell - * is number of the term in the corresponding document. - * @return RowMatrix, each row is a term and each column is a document name - * and each cell is the TF-IDF value of the term in the corresponding - * document. - */ - public static RowMatrix createTFIDFMatrix(RowMatrix wordDocMatrix) { - JavaRDD<Vector> newcountRDD = wordDocMatrix.rows().toJavaRDD(); - IDFModel idfModel = new IDF().fit(newcountRDD); - JavaRDD<Vector> idf = idfModel.transform(newcountRDD); - return new RowMatrix(idf.rdd()); - } - - /** - * Create matrix from doc-terms JavaPairRDD. 
- * - * @param uniqueDocRDD - * doc-terms JavaPairRDD, in which each key is a doc name, and value - * is term list extracted from that doc - * @return LabeledRowMatrix {@link LabeledRowMatrix} - */ - public static LabeledRowMatrix createWordDocMatrix(JavaPairRDD<String, List<String>> uniqueDocRDD) { - // Index documents with unique IDs - JavaPairRDD<List<String>, Long> corpus = uniqueDocRDD.values().zipWithIndex(); - // cal word-doc numbers - JavaPairRDD<Tuple2<String, Long>, Double> worddocNumRDD = corpus.flatMapToPair(new PairFlatMapFunction<Tuple2<List<String>, Long>, Tuple2<String, Long>, Double>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Iterator<Tuple2<Tuple2<String, Long>, Double>> call(Tuple2<List<String>, Long> docwords) throws Exception { - List<Tuple2<Tuple2<String, Long>, Double>> pairs = new ArrayList<>(); - List<String> words = docwords._1; - int n = words.size(); - for (int i = 0; i < n; i++) { - Tuple2<String, Long> worddoc = new Tuple2<>(words.get(i), docwords._2); - pairs.add(new Tuple2<Tuple2<String, Long>, Double>(worddoc, 1.0)); - } - return pairs.iterator(); - } - }).reduceByKey(new Function2<Double, Double, Double>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Double call(Double first, Double second) throws Exception { - return first + second; - } - }); - // cal word doc-numbers - JavaPairRDD<String, Tuple2<List<Long>, List<Double>>> wordDocnumRDD = worddocNumRDD - .mapToPair(new PairFunction<Tuple2<Tuple2<String, Long>, Double>, String, Tuple2<List<Long>, List<Double>>>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Tuple2<String, Tuple2<List<Long>, List<Double>>> call(Tuple2<Tuple2<String, Long>, Double> worddocNum) throws Exception { - List<Long> docs = new ArrayList<>(); - docs.add(worddocNum._1._2); - List<Double> nums = new ArrayList<>(); - nums.add(worddocNum._2); - Tuple2<List<Long>, 
List<Double>> docmums = new Tuple2<>(docs, nums); - return new Tuple2<>(worddocNum._1._1, docmums); - } - }); - // trans to vector - final int corporsize = (int) uniqueDocRDD.keys().count(); - JavaPairRDD<String, Vector> wordVectorRDD = wordDocnumRDD.reduceByKey(new Function2<Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Tuple2<List<Long>, List<Double>> call(Tuple2<List<Long>, List<Double>> arg0, Tuple2<List<Long>, List<Double>> arg1) throws Exception { - arg0._1.addAll(arg1._1); - arg0._2.addAll(arg1._2); - return new Tuple2<>(arg0._1, arg0._2); - } - }).mapToPair(new PairFunction<Tuple2<String, Tuple2<List<Long>, List<Double>>>, String, Vector>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Tuple2<String, Vector> call(Tuple2<String, Tuple2<List<Long>, List<Double>>> arg0) throws Exception { - int docsize = arg0._2._1.size(); - int[] intArray = new int[docsize]; - double[] doubleArray = new double[docsize]; - for (int i = 0; i < docsize; i++) { - intArray[i] = arg0._2._1.get(i).intValue(); - doubleArray[i] = arg0._2._2.get(i).intValue(); - } - Vector sv = Vectors.sparse(corporsize, intArray, doubleArray); - return new Tuple2<>(arg0._1, sv); - } - }); - - RowMatrix wordDocMatrix = new RowMatrix(wordVectorRDD.values().rdd()); - - LabeledRowMatrix labeledRowMatrix = new LabeledRowMatrix(); - labeledRowMatrix.rowMatrix = wordDocMatrix; - labeledRowMatrix.rowkeys = wordVectorRDD.keys().collect(); - labeledRowMatrix.colkeys = uniqueDocRDD.keys().collect(); - return labeledRowMatrix; - } - - public static LabeledRowMatrix createDocWordMatrix(JavaPairRDD<String, List<String>> uniqueDocRDD, JavaSparkContext sc) { - // Index word with unique IDs - JavaPairRDD<String, Long> wordIDRDD = uniqueDocRDD.values().flatMap(new FlatMapFunction<List<String>, String>() { - /** - * - */ - 
private static final long serialVersionUID = 1L; - - @Override - public Iterator<String> call(List<String> arg0) throws Exception { - return arg0.iterator(); - } - }).distinct().zipWithIndex(); - - // - JavaPairRDD<Tuple2<String, String>, Double> docwordNumRDD = uniqueDocRDD.flatMapToPair(new PairFlatMapFunction<Tuple2<String, List<String>>, Tuple2<String, String>, Double>() { - - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Iterator<Tuple2<Tuple2<String, String>, Double>> call(Tuple2<String, List<String>> docwords) throws Exception { - List<Tuple2<Tuple2<String, String>, Double>> pairs = new ArrayList<>(); - List<String> words = docwords._2; - int n = words.size(); - for (int i = 0; i < n; i++) { - Tuple2<String, String> worddoc = new Tuple2<>(docwords._1, words.get(i)); - pairs.add(new Tuple2<Tuple2<String, String>, Double>(worddoc, 1.0)); - } - return pairs.iterator(); - } - }).reduceByKey(new Function2<Double, Double, Double>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Double call(Double first, Double second) throws Exception { - return first + second; - } - }); - - // - JavaPairRDD<String, Tuple2<String, Double>> wordDocnumRDD = docwordNumRDD.mapToPair(new PairFunction<Tuple2<Tuple2<String, String>, Double>, String, Tuple2<String, Double>>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Tuple2<String, Tuple2<String, Double>> call(Tuple2<Tuple2<String, String>, Double> arg0) throws Exception { - - Tuple2<String, Double> wordmums = new Tuple2<>(arg0._1._1, arg0._2); - return new Tuple2<>(arg0._1._2, wordmums); - } - }); - - // - - JavaPairRDD<String, Tuple2<Tuple2<String, Double>, Optional<Long>>> testRDD = wordDocnumRDD.leftOuterJoin(wordIDRDD); - - int wordsize = (int) wordIDRDD.count(); - JavaPairRDD<String, Vector> docVectorRDD = testRDD.mapToPair(new PairFunction<Tuple2<String, Tuple2<Tuple2<String, Double>, 
Optional<Long>>>, String, Tuple2<List<Long>, List<Double>>>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Tuple2<String, Tuple2<List<Long>, List<Double>>> call(Tuple2<String, Tuple2<Tuple2<String, Double>, Optional<Long>>> arg0) throws Exception { - Optional<Long> oid = arg0._2._2; - Long wordId = (long) 0; - if (oid.isPresent()) { - wordId = oid.get(); - } - - List<Long> word = new ArrayList<>(); - word.add(wordId); - - List<Double> count = new ArrayList<>(); - count.add(arg0._2._1._2); - - Tuple2<List<Long>, List<Double>> wordcount = new Tuple2<>(word, count); - - return new Tuple2<>(arg0._2._1._1, wordcount); - } - - }).reduceByKey(new Function2<Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Tuple2<List<Long>, List<Double>> call(Tuple2<List<Long>, List<Double>> arg0, Tuple2<List<Long>, List<Double>> arg1) throws Exception { - arg0._1.addAll(arg1._1); - arg0._2.addAll(arg1._2); - return new Tuple2<>(arg0._1, arg0._2); - } - }).mapToPair(new PairFunction<Tuple2<String, Tuple2<List<Long>, List<Double>>>, String, Vector>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Tuple2<String, Vector> call(Tuple2<String, Tuple2<List<Long>, List<Double>>> arg0) throws Exception { - int docsize = arg0._2._1.size(); - int[] intArray = new int[docsize]; - double[] doubleArray = new double[docsize]; - for (int i = 0; i < docsize; i++) { - intArray[i] = arg0._2._1.get(i).intValue(); - doubleArray[i] = arg0._2._2.get(i).intValue(); - } - Vector sv = Vectors.sparse(wordsize, intArray, doubleArray); - return new Tuple2<>(arg0._1, sv); - } - }); - - RowMatrix docwordMatrix = new RowMatrix(docVectorRDD.values().rdd()); - - LabeledRowMatrix labeledRowMatrix = new LabeledRowMatrix(); - labeledRowMatrix.rowMatrix = docwordMatrix; - 
labeledRowMatrix.rowkeys = docVectorRDD.keys().collect(); - labeledRowMatrix.colkeys = wordIDRDD.keys().collect(); - - return labeledRowMatrix; - } - - /** - * loadVectorFromCSV: Load term vector from csv file. - * - * @param spark - * spark instance - * @param csvFileName - * csv matrix file - * @param skipNum - * the numbers of rows which should be skipped Ignore the top skip - * number rows of the csv file - * @return JavaPairRDD, each key is a term, and value is the vector of the - * term in feature space. - */ - public static JavaPairRDD<String, Vector> loadVectorFromCSV(SparkDriver spark, String csvFileName, int skipNum) { - // skip the first line (header), important! - JavaRDD<String> importRDD = spark.sc.textFile(csvFileName); - JavaPairRDD<String, Long> importIdRDD = importRDD.zipWithIndex().filter(new Function<Tuple2<String, Long>, Boolean>() { - /** */ - private static final long serialVersionUID = 1L; - - @Override - public Boolean call(Tuple2<String, Long> v1) throws Exception { - if (v1._2 > (skipNum - 1)) { - return true; - } - return false; - } - }); - - if (importIdRDD.count() == 0) { - return null; - } - - return importIdRDD.mapToPair(new PairFunction<Tuple2<String, Long>, String, Vector>() { - /** */ - private static final long serialVersionUID = 1L; - - @Override - public Tuple2<String, Vector> call(Tuple2<String, Long> t) throws Exception { - String[] fields = t._1.split(","); - String word = fields[0]; - int fieldsize = fields.length; - int nStart = 1; - int nEnd = fieldsize - 1; - if (fieldsize < 2) { - nStart = 0; - nEnd = 0; - } - String[] numfields = Arrays.copyOfRange(fields, nStart, nEnd); - - double[] nums = Stream.of(numfields).mapToDouble(Double::parseDouble).toArray(); - Vector vec = Vectors.dense(nums); - return new Tuple2<>(word, vec); - } - }); - } - - /** - * Convert vectorRDD to indexed row matrix. 
- * - * @param vecs - * Vector RDD - * @return IndexedRowMatrix - */ - public static IndexedRowMatrix buildIndexRowMatrix(JavaRDD<Vector> vecs) { - JavaRDD<IndexedRow> indexrows = vecs.zipWithIndex().map(new Function<Tuple2<Vector, Long>, IndexedRow>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public IndexedRow call(Tuple2<Vector, Long> docId) { - return new IndexedRow(docId._2, docId._1); - } - }); - return new IndexedRowMatrix(indexrows.rdd()); - } - - /** - * Transpose matrix - * - * @param indexedMatrix - * spark indexed matrix - * @return rowmatrix, each row is corresponding to the column in the original - * matrix and vice versa - */ - public static RowMatrix transposeMatrix(IndexedRowMatrix indexedMatrix) { - return indexedMatrix.toCoordinateMatrix().transpose().toRowMatrix(); - } - - /** - * Output matrix to a CSV file. - * - * @param matrix - * spark row matrix - * @param rowKeys - * matrix row names - * @param colKeys - * matrix coloum names - * @param fileName - * csv file name - */ - public static void exportToCSV(RowMatrix matrix, List<String> rowKeys, List<String> colKeys, String fileName) { - - if (matrix.rows().isEmpty()) { - return; - } - - int rownum = (int) matrix.numRows(); - int colnum = (int) matrix.numCols(); - List<Vector> rows = matrix.rows().toJavaRDD().collect(); - - File file = new File(fileName); - if (file.exists()) { - file.delete(); - } - try { - file.createNewFile(); - FileWriter fw = new FileWriter(file.getAbsoluteFile()); - BufferedWriter bw = new BufferedWriter(fw); - String coltitle = " Num" + ","; - for (int j = 0; j < colnum; j++) { - coltitle += "\"" + colKeys.get(j) + "\","; - } - coltitle = coltitle.substring(0, coltitle.length() - 1); - bw.write(coltitle + "\n"); - - for (int i = 0; i < rownum; i++) { - double[] rowvlaue = rows.get(i).toArray(); - String row = rowKeys.get(i) + ","; - for (int j = 0; j < colnum; j++) { - row += rowvlaue[j] + ","; - } - row = row.substring(0, 
row.length() - 1); - bw.write(row + "\n"); - } - - bw.close(); - - } catch (IOException e) { - e.printStackTrace(); - - } - } -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/RDDUtil.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/RDDUtil.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/RDDUtil.java deleted file mode 100644 index 3fc45f4..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/RDDUtil.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.utils; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.FlatMapFunction; - -import java.util.Iterator; -import java.util.List; - -/** - * ClassName: RDDUtil Function: Mudrod Spark RDD common methods - */ -public class RDDUtil { - - public RDDUtil() { - } - - /** - * getAllWordsInDoc: Extracted all unique terms from all docs. - * - * @param docwordRDD Pair RDD, each key is a doc, and value is term list extracted from - * that doc. 
- * @return unique term list - */ - public static JavaRDD<String> getAllWordsInDoc(JavaPairRDD<String, List<String>> docwordRDD) { - JavaRDD<String> wordRDD = docwordRDD.values().flatMap(new FlatMapFunction<List<String>, String>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Iterator<String> call(List<String> list) { - return list.iterator(); - } - }).distinct(); - - return wordRDD; - } -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/SVDUtil.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/SVDUtil.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/SVDUtil.java deleted file mode 100644 index 1982996..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/SVDUtil.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package gov.nasa.jpl.mudrod.utils; - -import gov.nasa.jpl.mudrod.discoveryengine.MudrodAbstract; -import gov.nasa.jpl.mudrod.driver.ESDriver; -import gov.nasa.jpl.mudrod.driver.SparkDriver; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix; -import org.apache.spark.mllib.linalg.distributed.RowMatrix; - -import java.io.IOException; -import java.util.List; -import java.util.Properties; - -/** - * Singular value decomposition - */ -public class SVDUtil extends MudrodAbstract { - - /** - * - */ - private static final long serialVersionUID = 1L; - // wordRDD: terms extracted from all documents - JavaRDD<String> wordRDD; - // svdMatrix: svd matrix - private RowMatrix svdMatrix; - // simMatrix: similarity matrix - private CoordinateMatrix simMatrix; - - /** - * Creates a new instance of SVDUtil. - * - * @param config the Mudrod configuration - * @param es the Elasticsearch drive - * @param spark the spark driver - */ - public SVDUtil(Properties config, ESDriver es, SparkDriver spark) { - super(config, es, spark); - } - - /** - * Build SVD matrix from docment-terms pairs. - * - * @param docwordRDD JavaPairRDD, key is short name of data set and values are terms in - * the corresponding data set - * @param svdDimension: Dimension of matrix after singular value decomposition - * @return row matrix - */ - public RowMatrix buildSVDMatrix(JavaPairRDD<String, List<String>> docwordRDD, int svdDimension) { - - RowMatrix svdMatrix = null; - LabeledRowMatrix wordDocMatrix = MatrixUtil.createWordDocMatrix(docwordRDD); - RowMatrix ifIdfMatrix = MatrixUtil.createTFIDFMatrix(wordDocMatrix.rowMatrix); - svdMatrix = MatrixUtil.buildSVDMatrix(ifIdfMatrix, svdDimension); - this.svdMatrix = svdMatrix; - this.wordRDD = RDDUtil.getAllWordsInDoc(docwordRDD); - return svdMatrix; - } - - /** - * Build svd matrix from CSV file. 
- * - * @param tfidfCSVfile tf-idf matrix csv file - * @param svdDimension: Dimension of matrix after singular value decomposition - * @return row matrix - */ - public RowMatrix buildSVDMatrix(String tfidfCSVfile, int svdDimension) { - RowMatrix svdMatrix = null; - JavaPairRDD<String, Vector> tfidfRDD = MatrixUtil.loadVectorFromCSV(spark, tfidfCSVfile, 2); - JavaRDD<Vector> vectorRDD = tfidfRDD.values(); - - svdMatrix = MatrixUtil.buildSVDMatrix(vectorRDD, svdDimension); - this.svdMatrix = svdMatrix; - - this.wordRDD = tfidfRDD.keys(); - - return svdMatrix; - } - - /** - * Calculate similarity - */ - public void calSimilarity() { - CoordinateMatrix simMatrix = SimilarityUtil.calculateSimilarityFromMatrix(svdMatrix); - this.simMatrix = simMatrix; - } - - /** - * Insert linkage triples to elasticsearch - * - * @param index index name - * @param type linkage triple name - */ - public void insertLinkageToES(String index, String type) { - List<LinkageTriple> triples = SimilarityUtil.matrixToTriples(wordRDD, simMatrix); - try { - LinkageTriple.insertTriples(es, triples, index, type); - } catch (IOException e) { - e.printStackTrace(); - } - } - -} http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/SimilarityUtil.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/SimilarityUtil.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/SimilarityUtil.java deleted file mode 100644 index 8ae9770..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/SimilarityUtil.java +++ /dev/null @@ -1,277 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package gov.nasa.jpl.mudrod.utils; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.Optional; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix; -import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix; -import org.apache.spark.mllib.linalg.distributed.MatrixEntry; -import org.apache.spark.mllib.linalg.distributed.RowMatrix; -import scala.Tuple2; - -import java.util.List; - -/** - * Similarity and distrance calculation utilities - */ -public class SimilarityUtil { - - public static final int SIM_COSINE = 3; - public static final int SIM_HELLINGER = 2; - public static final int SIM_PEARSON = 1; - /** - * CalSimilarityFromMatrix: Calculate term similarity from matrix. - * - * @param svdMatrix. Each row is corresponding to a term, and each column is - * corresponding to a dimension of feature - * @return CoordinateMatrix, each row is corresponding to a term, and each - * column is also a term, the cell value is the similarity between the - * two terms - */ - public static CoordinateMatrix calculateSimilarityFromMatrix(RowMatrix svdMatrix) { - JavaRDD<Vector> vecs = svdMatrix.rows().toJavaRDD(); - return SimilarityUtil.calculateSimilarityFromVector(vecs); - } - - /** - * CalSimilarityFromVector:Calculate term similarity from vector. 
- * - * @param vecs Each vector is corresponding to a term in the feature space. - * @return CoordinateMatrix, each row is corresponding to a term, and each - * column is also a term, the cell value is the similarity between the - * two terms - */ - public static CoordinateMatrix calculateSimilarityFromVector(JavaRDD<Vector> vecs) { - IndexedRowMatrix indexedMatrix = MatrixUtil.buildIndexRowMatrix(vecs); - RowMatrix transposeMatrix = MatrixUtil.transposeMatrix(indexedMatrix); - return transposeMatrix.columnSimilarities(); - } - - /** - * Calculate term similarity from vector. - * - * @param importRDD the {@link org.apache.spark.api.java.JavaPairRDD} - * data structure containing the vectors. - * @param simType the similarity calculation to execute e.g. - * <ul> - * <li>{@link gov.nasa.jpl.mudrod.utils.SimilarityUtil#SIM_COSINE} - 3,</li> - * <li>{@link gov.nasa.jpl.mudrod.utils.SimilarityUtil#SIM_HELLINGER} - 2,</li> - * <li>{@link gov.nasa.jpl.mudrod.utils.SimilarityUtil#SIM_PEARSON} - 1</li> - * </ul> - * @return a new {@link org.apache.spark.api.java.JavaPairRDD} - */ - public static JavaRDD<LinkageTriple> calculateSimilarityFromVector(JavaPairRDD<String, Vector> importRDD, int simType) { - JavaRDD<Tuple2<String, Vector>> importRDD1 = importRDD.map(f -> new Tuple2<String, Vector>(f._1, f._2)); - JavaPairRDD<Tuple2<String, Vector>, Tuple2<String, Vector>> cartesianRDD = importRDD1.cartesian(importRDD1); - - return cartesianRDD.map(new Function<Tuple2<Tuple2<String, Vector>, Tuple2<String, Vector>>, LinkageTriple>() { - - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public LinkageTriple call(Tuple2<Tuple2<String, Vector>, Tuple2<String, Vector>> arg) { - String keyA = arg._1._1; - String keyB = arg._2._1; - - if (keyA.equals(keyB)) { - return null; - } - - Vector vecA = arg._1._2; - Vector vecB = arg._2._2; - Double weight = 0.0; - - if (simType == SimilarityUtil.SIM_PEARSON) { - weight = SimilarityUtil.pearsonDistance(vecA, 
vecB); - } else if (simType == SimilarityUtil.SIM_HELLINGER) { - weight = SimilarityUtil.hellingerDistance(vecA, vecB); - } - - LinkageTriple triple = new LinkageTriple(); - triple.keyA = keyA; - triple.keyB = keyB; - triple.weight = weight; - return triple; - } - }).filter(new Function<LinkageTriple, Boolean>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Boolean call(LinkageTriple arg0) throws Exception { - if (arg0 == null) { - return false; - } - return true; - } - }); - } - - /** - * MatrixtoTriples:Convert term similarity matrix to linkage triple list. - * - * @param keys each key is a term - * @param simMatirx term similarity matrix, in which each row and column is a term and - * the cell value is the similarity between the two terms - * @return linkage triple list - */ - public static List<LinkageTriple> matrixToTriples(JavaRDD<String> keys, CoordinateMatrix simMatirx) { - if (simMatirx.numCols() != keys.count()) { - return null; - } - - // index words - JavaPairRDD<Long, String> keyIdRDD = JavaPairRDD.fromJavaRDD(keys.zipWithIndex().map(new Function<Tuple2<String, Long>, Tuple2<Long, String>>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Tuple2<Long, String> call(Tuple2<String, Long> docId) { - return docId.swap(); - } - })); - - JavaPairRDD<Long, LinkageTriple> entriesRowRDD = simMatirx.entries().toJavaRDD().mapToPair(new PairFunction<MatrixEntry, Long, LinkageTriple>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Tuple2<Long, LinkageTriple> call(MatrixEntry t) throws Exception { - LinkageTriple triple = new LinkageTriple(); - triple.keyAId = t.i(); - triple.keyBId = t.j(); - triple.weight = t.value(); - return new Tuple2<>(triple.keyAId, triple); - } - }); - - JavaPairRDD<Long, LinkageTriple> entriesColRDD = entriesRowRDD.leftOuterJoin(keyIdRDD).values().mapToPair(new PairFunction<Tuple2<LinkageTriple, 
Optional<String>>, Long, LinkageTriple>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public Tuple2<Long, LinkageTriple> call(Tuple2<LinkageTriple, Optional<String>> t) throws Exception { - LinkageTriple triple = t._1; - Optional<String> stra = t._2; - if (stra.isPresent()) { - triple.keyA = stra.get(); - } - return new Tuple2<>(triple.keyBId, triple); - } - }); - - JavaRDD<LinkageTriple> tripleRDD = entriesColRDD.leftOuterJoin(keyIdRDD).values().map(new Function<Tuple2<LinkageTriple, Optional<String>>, LinkageTriple>() { - /** - * - */ - private static final long serialVersionUID = 1L; - - @Override - public LinkageTriple call(Tuple2<LinkageTriple, Optional<String>> t) throws Exception { - LinkageTriple triple = t._1; - Optional<String> strb = t._2; - if (strb.isPresent()) { - triple.keyB = strb.get(); - } - return triple; - } - }); - return tripleRDD.collect(); - } - - /** - * Calculate similarity (Hellinger Distance) between vectors - * - * @param vecA initial vector from which to calculate a similarity - * @param vecB second vector involved in similarity calculation - * @return similarity between two vectors - */ - public static double hellingerDistance(Vector vecA, Vector vecB) { - double[] arrA = vecA.toArray(); - double[] arrB = vecB.toArray(); - - double sim = 0.0; - - int arrsize = arrA.length; - for (int i = 0; i < arrsize; i++) { - double a = arrA[i]; - double b = arrB[i]; - double sqrtDiff = Math.sqrt(a) - Math.sqrt(b); - sim += sqrtDiff * sqrtDiff; - } - - sim = sim / Math.sqrt(2); - - return sim; - } - - /** - * Calculate similarity (Pearson Distance) between vectors - * - * @param vecA initial vector from which to calculate a similarity - * @param vecB second vector involved in similarity calculation - * @return similarity between two vectors - */ - public static double pearsonDistance(Vector vecA, Vector vecB) { - double[] arrA = vecA.toArray(); - double[] arrB = vecB.toArray(); - - int viewA = 0; - int viewB 
= 0; - int viewAB = 0; - - int arrsize = arrA.length; - for (int i = 0; i < arrsize; i++) { - if (arrA[i] > 0) { - viewA++; - } - - if (arrB[i] > 0) { - viewB++; - } - - if (arrB[i] > 0 && arrA[i] > 0) { - viewAB++; - } - } - return viewAB / (Math.sqrt(viewA) * Math.sqrt(viewB)); - } - - /** - * calculate similarity between vectors - * - * @param vecA initial vector from which to calculate a similarity - * @param vecB second vector involved in similarity calculation - * @return similarity between two vectors - */ - public static double cosineDistance(Vector vecA, Vector vecB) { - return 1; - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/utils/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/utils/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/utils/package-info.java deleted file mode 100644 index 3fcd95e..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/utils/package-info.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -/** - * This package includes utilities classes for calculating similarity and - * parsing HTTP request - */ -package gov.nasa.jpl.mudrod.utils; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-sdap-mudrod/blob/39379fa9/core/src/main/java/gov/nasa/jpl/mudrod/weblog/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/gov/nasa/jpl/mudrod/weblog/package-info.java b/core/src/main/java/gov/nasa/jpl/mudrod/weblog/package-info.java deleted file mode 100644 index f4a8b86..0000000 --- a/core/src/main/java/gov/nasa/jpl/mudrod/weblog/package-info.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); you - * may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This package includes web log pre-processing, processing, and data structure - * classes. - */ -package gov.nasa.jpl.mudrod.weblog; \ No newline at end of file
