lewismc closed pull request #15: SDAP-55
URL: https://github.com/apache/incubator-sdap-mudrod/pull/15
 
 
   

This PR was merged from a forked repository. Because GitHub does not show
the original diff of a pull request from a fork once it has been merged,
the diff is reproduced below for the sake of provenance:

diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/main/MudrodConstants.java 
b/core/src/main/java/org/apache/sdap/mudrod/main/MudrodConstants.java
index 0c8bcc2..84ba347 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/main/MudrodConstants.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/main/MudrodConstants.java
@@ -98,8 +98,6 @@
   
   public static final String ONTOLOGY_INPUT_PATH = 
"mudrod.ontology.input.path";
 
-  public static final String PROCESS_TYPE = "mudrod.processing.type";
-
   /** Defined on CLI */
   public static final String METADATA_DOWNLOAD = "mudrod.metadata.download";
   
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/recommendation/pre/SessionCooccurence.java
 
b/core/src/main/java/org/apache/sdap/mudrod/recommendation/pre/SessionCooccurence.java
index a4a79e0..4a5d4bb 100644
--- 
a/core/src/main/java/org/apache/sdap/mudrod/recommendation/pre/SessionCooccurence.java
+++ 
b/core/src/main/java/org/apache/sdap/mudrod/recommendation/pre/SessionCooccurence.java
@@ -19,7 +19,7 @@
 import org.apache.sdap.mudrod.main.MudrodConstants;
 import org.apache.sdap.mudrod.utils.LabeledRowMatrix;
 import org.apache.sdap.mudrod.utils.MatrixUtil;
-import org.apache.sdap.mudrod.weblog.structure.SessionExtractor;
+import org.apache.sdap.mudrod.weblog.structure.session.SessionExtractor;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.function.PairFunction;
 import org.elasticsearch.action.search.SearchResponse;
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/ClickStreamGenerator.java
 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/ClickStreamGenerator.java
index 886cd4a..2c38d29 100644
--- 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/ClickStreamGenerator.java
+++ 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/ClickStreamGenerator.java
@@ -19,8 +19,8 @@
 import org.apache.sdap.mudrod.main.MudrodConstants;
 import org.apache.sdap.mudrod.utils.LabeledRowMatrix;
 import org.apache.sdap.mudrod.utils.MatrixUtil;
-import org.apache.sdap.mudrod.weblog.structure.ClickStream;
-import org.apache.sdap.mudrod.weblog.structure.SessionExtractor;
+import org.apache.sdap.mudrod.weblog.structure.session.ClickStream;
+import org.apache.sdap.mudrod.weblog.structure.session.SessionExtractor;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.slf4j.Logger;
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/CrawlerDetection.java 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/CrawlerDetection.java
index 3e782a7..704ccfd 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/CrawlerDetection.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/CrawlerDetection.java
@@ -79,7 +79,7 @@ public Object execute() {
     LOG.info("Starting Crawler detection {}.", httpType);
     startTime = System.currentTimeMillis();
     try {
-      checkByRate();
+      checkByRateInParallel();
     } catch (InterruptedException | IOException e) {
       LOG.error("Encountered an error whilst detecting Web crawlers.", e);
     }
@@ -103,40 +103,6 @@ public boolean checkKnownCrawler(String agent) {
     return false;
   }
 
-  public void checkByRate() throws InterruptedException, IOException {
-    String processingType = props.getProperty(MudrodConstants.PROCESS_TYPE);
-    if (processingType.equals("sequential")) {
-      checkByRateInSequential();
-    } else if (processingType.equals("parallel")) {
-      checkByRateInParallel();
-    }
-  }
-
-  /**
-   * Check crawler by request sending rate, which is read from configruation
-   * file
-   *
-   * @throws InterruptedException InterruptedException
-   * @throws IOException          IOException
-   */
-  public void checkByRateInSequential() throws InterruptedException, 
IOException {
-    es.createBulkProcessor();
-
-    int rate = 
Integer.parseInt(props.getProperty(MudrodConstants.REQUEST_RATE));
-
-    Terms users = this.getUserTerms(this.httpType);
-    LOG.info("Original User count: {}", 
Integer.toString(users.getBuckets().size()));
-
-    int userCount = 0;
-    for (Terms.Bucket entry : users.getBuckets()) {
-      String user = entry.getKey().toString();
-      int count = checkByRate(es, user);
-      userCount += count;
-    }
-    es.destroyBulkProcessor();
-    LOG.info("User count: {}", Integer.toString(userCount));
-  }
-
   void checkByRateInParallel() throws InterruptedException, IOException {
 
     JavaRDD<String> userRDD = getUserRDD(this.httpType);
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/ImportLogFile.java 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/ImportLogFile.java
index c7622d6..6e5f473 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/ImportLogFile.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/ImportLogFile.java
@@ -16,8 +16,8 @@
 import org.apache.sdap.mudrod.driver.ESDriver;
 import org.apache.sdap.mudrod.driver.SparkDriver;
 import org.apache.sdap.mudrod.main.MudrodConstants;
-import org.apache.sdap.mudrod.weblog.structure.ApacheAccessLog;
-import org.apache.sdap.mudrod.weblog.structure.FtpLog;
+import org.apache.sdap.mudrod.weblog.structure.log.ApacheAccessLog;
+import org.apache.sdap.mudrod.weblog.structure.log.FtpLog;
 import org.apache.spark.api.java.JavaRDD;
 import org.elasticsearch.action.index.IndexRequest;
 import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
@@ -55,6 +55,11 @@
   public static final int NUM_FIELDS = 9;
   Pattern p = Pattern.compile(logEntryPattern);
   transient Matcher matcher;
+  
+  @Override
+  public Object execute(Object o) {
+    return null;
+  }
 
   /**
    * Constructor supporting a number of parameters documented below.
@@ -146,31 +151,7 @@ public void readFile() {
       return;
     }
 
-    String processingType = props.getProperty(MudrodConstants.PROCESS_TYPE, 
"parallel");
-    if (processingType.equals("sequential")) {
-      readFileInSequential(httplogpath, ftplogpath);
-    } else if (processingType.equals("parallel")) {
-      readFileInParallel(httplogpath, ftplogpath);
-    }
-  }
-
-  /**
-   * Read the FTP or HTTP log path with the intention of processing lines from
-   * log files.
-   *
-   * @param httplogpath path to the parent directory containing http logs
-   * @param ftplogpath  path to the parent directory containing ftp logs
-   */
-  public void readFileInSequential(String httplogpath, String ftplogpath) {
-    es.createBulkProcessor();
-    try {
-      readLogFile(httplogpath, "http", logIndex, httpType);
-      readLogFile(ftplogpath, "FTP", logIndex, ftpType);
-
-    } catch (IOException e) {
-      LOG.error("Error whilst reading log file.", e);
-    }
-    es.destroyBulkProcessor();
+    readFileInParallel(httplogpath, ftplogpath);
   }
 
   /**
@@ -181,7 +162,6 @@ public void readFileInSequential(String httplogpath, String 
ftplogpath) {
    * @param ftplogpath  path to the parent directory containing ftp logs
    */
   public void readFileInParallel(String httplogpath, String ftplogpath) {
-
     importHttpfile(httplogpath);
     importFtpfile(ftplogpath);
   }
@@ -189,171 +169,12 @@ public void readFileInParallel(String httplogpath, 
String ftplogpath) {
   public void importHttpfile(String httplogpath) {
     // import http logs
     JavaRDD<String> accessLogs = spark.sc.textFile(httplogpath, 
this.partition).map(s -> ApacheAccessLog.parseFromLogLine(s, 
props)).filter(ApacheAccessLog::checknull);
-
     JavaEsSpark.saveJsonToEs(accessLogs, logIndex + "/" + this.httpType);
   }
 
   public void importFtpfile(String ftplogpath) {
     // import ftp logs
     JavaRDD<String> ftpLogs = spark.sc.textFile(ftplogpath, 
this.partition).map(s -> FtpLog.parseFromLogLine(s, 
props)).filter(FtpLog::checknull);
-
     JavaEsSpark.saveJsonToEs(ftpLogs, logIndex + "/" + this.ftpType);
   }
-
-  /**
-   * Process a log path on local file system which contains the relevant
-   * parameters as below.
-   *
-   * @param fileName the {@link java.lang.String} path to the log directory on 
file
-   *                 system
-   * @param protocol whether to process 'http' or 'FTP'
-   * @param index    the index name to write logs to
-   * @param type     one of the available protocols from which Mudrod logs are 
obtained.
-   * @throws IOException if there is an error reading anything from the 
fileName provided.
-   */
-  public void readLogFile(String fileName, String protocol, String index, 
String type) throws IOException {
-    BufferedReader br = new BufferedReader(new FileReader(fileName));
-    int count = 0;
-    try {
-      String line = br.readLine();
-      while (line != null) {
-        if ("FTP".equals(protocol)) {
-          parseSingleLineFTP(line, index, type);
-        } else {
-          parseSingleLineHTTP(line, index, type);
-        }
-        line = br.readLine();
-        count++;
-      }
-    } catch (FileNotFoundException e) {
-      LOG.error("File not found.", e);
-    } catch (IOException e) {
-      LOG.error("Error reading input directory.", e);
-    } finally {
-      br.close();
-      LOG.info("Num of {} entries:\t{}", protocol, count);
-    }
-  }
-
-  /**
-   * Parse a single FTP log entry
-   *
-   * @param log   a single log line
-   * @param index the index name we wish to persist the log line to
-   * @param type  one of the available protocols from which Mudrod logs are 
obtained.
-   */
-  public void parseSingleLineFTP(String log, String index, String type) {
-    String ip = log.split(" +")[6];
-
-    String time = log.split(" +")[1] + ":" + log.split(" +")[2] + ":" + 
log.split(" +")[3] + ":" + log.split(" +")[4];
-
-    time = switchtoNum(time);
-    SimpleDateFormat formatter = new SimpleDateFormat("MM:dd:HH:mm:ss:yyyy");
-    Date date = null;
-    try {
-      date = formatter.parse(time);
-    } catch (ParseException e) {
-      LOG.error("Error whilst parsing the date.", e);
-    }
-    String bytes = log.split(" +")[7];
-
-    String request = log.split(" +")[8].toLowerCase();
-
-    if (!request.contains("/misc/") && !request.contains("readme")) {
-      IndexRequest ir;
-      try {
-        ir = new IndexRequest(index, type)
-            .source(jsonBuilder()
-                    .startObject()
-                    .field("LogType", MudrodConstants.FTP_LOG)
-                    .field("IP", ip)
-                    .field("Time", date)
-                    .field("Request", request)
-                    .field("Bytes", Long.parseLong(bytes))
-                    .endObject());
-        es.getBulkProcessor().add(ir);
-      } catch (NumberFormatException e) {
-        LOG.error("Error whilst processing numbers", e);
-      } catch (IOException e) {
-        LOG.error("IOError whilst adding to the bulk processor.", e);
-      }
-    }
-
-  }
-
-  /**
-   * Parse a single HTTP log entry
-   *
-   * @param log   a single log line
-   * @param index the index name we wish to persist the log line to
-   * @param type  one of the available protocols from which Mudrod logs are 
obtained.
-   */
-  public void parseSingleLineHTTP(String log, String index, String type) {
-    matcher = p.matcher(log);
-    if (!matcher.matches() || NUM_FIELDS != matcher.groupCount()) {
-      return;
-    }
-    String time = matcher.group(4);
-    time = switchtoNum(time);
-    SimpleDateFormat formatter = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss");
-    Date date = null;
-    try {
-      date = formatter.parse(time);
-    } catch (ParseException e) {
-      LOG.error("Error whilst attempting to parse date.", e);
-    }
-
-    String bytes = matcher.group(7);
-    if ("-".equals(bytes)) {
-      bytes = "0";
-    }
-
-    String request = matcher.group(5).toLowerCase();
-    String agent = matcher.group(9);
-    CrawlerDetection crawlerDe = new CrawlerDetection(this.props, this.es, 
this.spark);
-    if (!crawlerDe.checkKnownCrawler(agent)) {
-      boolean tag = false;
-      String[] mimeTypes = 
props.getProperty(MudrodConstants.BLACK_LIST_REQUEST).split(",");
-      for(String str:mimeTypes) {
-        if (request.contains(str.trim())) {
-          tag = true;
-          break;
-        }
-      }
-
-      if (!tag) {
-        IndexRequest ir = null;
-        executeBulkRequest(ir, index, type, matcher, date, bytes);
-      }
-    }
-  }
-
-  private void executeBulkRequest(IndexRequest ir, String index, String type, 
Matcher matcher, Date date, String bytes) {
-    IndexRequest newIr = ir;
-    try {
-      newIr = new IndexRequest(index, type)
-              .source(jsonBuilder()
-                      .startObject()
-                      .field("LogType", MudrodConstants.HTTP_LOG)
-                      .field("IP", matcher.group(1))
-                      .field("Time", date)
-                      .field("Request", matcher.group(5))
-                      .field("Response", matcher.group(6))
-                      .field("Bytes", Integer.parseInt(bytes))
-                      .field("Referer", matcher.group(8))
-                      .field("Browser", matcher.group(9))
-                      .endObject());
-
-      es.getBulkProcessor().add(newIr);
-    } catch (NumberFormatException e) {
-      LOG.error("Error whilst processing numbers", e);
-    } catch (IOException e) {
-      LOG.error("IOError whilst adding to the bulk processor.", e);
-    }
-  }
-
-  @Override
-  public Object execute(Object o) {
-    return null;
-  }
 }
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/RankingTrainDataGenerator.java
 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/RankingTrainDataGenerator.java
index 766e853..de41d56 100644
--- 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/RankingTrainDataGenerator.java
+++ 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/RankingTrainDataGenerator.java
@@ -3,8 +3,8 @@
 import org.apache.sdap.mudrod.discoveryengine.DiscoveryStepAbstract;
 import org.apache.sdap.mudrod.driver.ESDriver;
 import org.apache.sdap.mudrod.driver.SparkDriver;
-import org.apache.sdap.mudrod.weblog.structure.RankingTrainData;
-import org.apache.sdap.mudrod.weblog.structure.SessionExtractor;
+import org.apache.sdap.mudrod.weblog.structure.session.RankingTrainData;
+import org.apache.sdap.mudrod.weblog.structure.session.SessionExtractor;
 import org.apache.spark.api.java.JavaRDD;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/SessionGenerator.java 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/SessionGenerator.java
index 4e170d7..4ce1535 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/SessionGenerator.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/SessionGenerator.java
@@ -16,7 +16,7 @@
 import org.apache.sdap.mudrod.driver.ESDriver;
 import org.apache.sdap.mudrod.driver.SparkDriver;
 import org.apache.sdap.mudrod.main.MudrodConstants;
-import org.apache.sdap.mudrod.weblog.structure.Session;
+import org.apache.sdap.mudrod.weblog.structure.session.Session;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.Function2;
@@ -93,21 +93,15 @@ public void generateSession() {
   }
 
   public void genSessionByReferer(int timeThres) throws InterruptedException, 
IOException {
-    String processingType = props.getProperty(MudrodConstants.PROCESS_TYPE);
-    if (processingType.equals("sequential")) {
-      genSessionByRefererInSequential(timeThres);
-    } else if (processingType.equals("parallel")) {
+    
       genSessionByRefererInParallel(timeThres);
-    }
+  
   }
 
   public void combineShortSessions(int timeThres) throws InterruptedException, 
IOException {
-    String processingType = props.getProperty(MudrodConstants.PROCESS_TYPE);
-    if (processingType.equals("sequential")) {
-      combineShortSessionsInSequential(timeThres);
-    } else if (processingType.equals("parallel")) {
+   
       combineShortSessionsInParallel(timeThres);
-    }
+    
   }
 
   /**
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/SessionStatistic.java 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/SessionStatistic.java
index 3cee9c7..981bece 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/SessionStatistic.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/SessionStatistic.java
@@ -16,7 +16,7 @@
 import org.apache.sdap.mudrod.driver.ESDriver;
 import org.apache.sdap.mudrod.driver.SparkDriver;
 import org.apache.sdap.mudrod.main.MudrodConstants;
-import org.apache.sdap.mudrod.weblog.structure.RequestUrl;
+import org.apache.sdap.mudrod.weblog.structure.log.RequestUrl;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.Function2;
@@ -81,27 +81,7 @@ public Object execute() {
   }
 
   public void processSession() throws InterruptedException, IOException, 
ExecutionException {
-    String processingType = props.getProperty(MudrodConstants.PROCESS_TYPE);
-    if ("sequential".equals(processingType)) {
-      processSessionInSequential();
-    } else if ("parallel".equals(processingType)) {
       processSessionInParallel();
-    }
-  }
-
-  public void processSessionInSequential() throws IOException, 
InterruptedException, ExecutionException {
-    es.createBulkProcessor();
-    Terms sessions = this.getSessionTerms();
-    int sessionCount = 0;
-    for (Terms.Bucket entry : sessions.getBuckets()) {
-      if (entry.getDocCount() >= 3 && !"invalid".equals(entry.getKey())) {
-        String sessionid = entry.getKey().toString();
-        int sessionNum = processSession(es, sessionid);
-        sessionCount += sessionNum;
-      }
-    }
-    LOG.info("Final Session count: {}", Integer.toString(sessionCount));
-    es.destroyBulkProcessor();
   }
 
   /**
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/ApacheAccessLog.java
 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/ApacheAccessLog.java
similarity index 98%
rename from 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/ApacheAccessLog.java
rename to 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/ApacheAccessLog.java
index 050d19d..8224b33 100644
--- 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/ApacheAccessLog.java
+++ 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/ApacheAccessLog.java
@@ -11,7 +11,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.log;
 
 import com.google.gson.Gson;
 
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/Coordinates.java 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/Coordinates.java
similarity index 92%
rename from 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/Coordinates.java
rename to 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/Coordinates.java
index 5e6fd07..43a7642 100644
--- 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/Coordinates.java
+++ 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/Coordinates.java
@@ -11,7 +11,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.log;
 
 public class Coordinates {
   /*
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/FtpLog.java 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/FtpLog.java
similarity index 97%
rename from 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/FtpLog.java
rename to 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/FtpLog.java
index 9f39655..91fa228 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/FtpLog.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/FtpLog.java
@@ -11,7 +11,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.log;
 
 import com.google.gson.Gson;
 
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/GeoIp.java 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/GeoIp.java
similarity index 96%
rename from 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/GeoIp.java
rename to 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/GeoIp.java
index d3e94dc..154cd98 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/GeoIp.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/GeoIp.java
@@ -11,7 +11,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.log;
 
 import org.apache.sdap.mudrod.utils.HttpRequest;
 
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/RequestUrl.java 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/RequestUrl.java
similarity index 99%
rename from 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/RequestUrl.java
rename to 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/RequestUrl.java
index 05a3395..0c365a4 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/RequestUrl.java
+++ 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/RequestUrl.java
@@ -11,7 +11,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.log;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/WebLog.java 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/WebLog.java
similarity index 97%
rename from 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/WebLog.java
rename to 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/WebLog.java
index 93a9747..9046992 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/WebLog.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/WebLog.java
@@ -11,7 +11,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.log;
 
 import java.io.Serializable;
 
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/ClickStream.java 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/ClickStream.java
similarity index 98%
rename from 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/ClickStream.java
rename to 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/ClickStream.java
index 2f0c34d..7e5069e 100644
--- 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/ClickStream.java
+++ 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/ClickStream.java
@@ -11,7 +11,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.session;
 
 import org.codehaus.jettison.json.JSONException;
 import org.codehaus.jettison.json.JSONObject;
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/RankingTrainData.java
 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/RankingTrainData.java
similarity index 98%
rename from 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/RankingTrainData.java
rename to 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/RankingTrainData.java
index cf4ec23..bdf477a 100644
--- 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/RankingTrainData.java
+++ 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/RankingTrainData.java
@@ -1,4 +1,4 @@
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.session;
 
 import java.io.Serializable;
 import java.util.Map;
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/Session.java 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/Session.java
similarity index 97%
rename from 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/Session.java
rename to 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/Session.java
index 31bef0c..2c917a6 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/Session.java
+++ 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/Session.java
@@ -11,7 +11,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.session;
 
 import com.google.gson.Gson;
 import com.google.gson.JsonElement;
@@ -188,8 +188,10 @@ public JsonObject getSessionDetail(String indexName, 
String type, String session
     } catch (UnsupportedEncodingException e) {
       LOG.error("Erro whilst obtaining the Session Tree: {}", e);
     }
+    
+    //tree.printTree(tree.root);
 
-    List<ClickStream> clickthroughs = tree.getClickStreamList();
+    List<ClickStream> clickthroughs = tree.getClickStreamList(props);
     return clickthroughs;
   }
 
@@ -215,7 +217,7 @@ private SessionTree getSessionTree(String indexName, String 
type, String session
       String logType = (String) result.get("LogType");
       String referer = (String) result.get("Referer");
 
-      SessionNode node = new SessionNode(request, logType, referer, 
props.getProperty(MudrodConstants.BASE_URL), time, seq);
+      SessionNode node = new SessionNode(props, request, logType, referer, 
time, seq);
       tree.insert(node);
       seq++;
     }
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionExtractor.java
 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionExtractor.java
similarity index 90%
rename from 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionExtractor.java
rename to 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionExtractor.java
index 85b6961..f7eb602 100644
--- 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionExtractor.java
+++ 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionExtractor.java
@@ -11,7 +11,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.session;
 
 import org.apache.sdap.mudrod.driver.ESDriver;
 import org.apache.sdap.mudrod.driver.SparkDriver;
@@ -70,47 +70,7 @@ public SessionExtractor() {
    * @return clickstream list in JavaRDD format {@link ClickStream}
    */
   public JavaRDD<ClickStream> extractClickStreamFromES(Properties props, 
ESDriver es, SparkDriver spark) {
-    switch (props.getProperty(MudrodConstants.PROCESS_TYPE)) {
-      case "sequential":
-        List<ClickStream> queryList = this.getClickStreamList(props, es);
-        return spark.sc.parallelize(queryList);
-      case "parallel":
-        return getClickStreamListInParallel(props, spark, es);
-      default:
-      LOG.error("Error finding processing type for '{}'. Please check your 
config.xml.", props.getProperty(MudrodConstants.PROCESS_TYPE));
-    }
-    return null;
-  }
-
-  /**
-   * getClickStreamList:Extract click streams from logs stored in 
Elasticsearch.
-   *
-   * @param props
-   *          the Mudrod configuration
-   * @param es
-   *          the Elasticsearch driver
-   * @return clickstream list {@link ClickStream}
-   */
-  protected List<ClickStream> getClickStreamList(Properties props, ESDriver 
es) {
-    List<String> logIndexList = 
es.getIndexListWithPrefix(props.getProperty(MudrodConstants.LOG_INDEX));
-
-    List<ClickStream> result = new ArrayList<>();
-    for (String logIndex : logIndexList) {
-      List<String> sessionIdList;
-      try {
-        sessionIdList = this.getSessions(props, es, logIndex);
-        Session session = new Session(props, es);
-        for (String aSessionIdList : sessionIdList) {
-          String[] sArr = aSessionIdList.split(",");
-          List<ClickStream> datas = session.getClickStreamList(sArr[1], 
sArr[2], sArr[0]);
-          result.addAll(datas);
-        }
-      } catch (Exception e) {
-        LOG.error("Error during extraction of Clickstreams from log index. 
{}", e);
-      }
-    }
-
-    return result;
+       return getClickStreamListInParallel(props, spark, es);
   }
 
   protected JavaRDD<ClickStream> getClickStreamListInParallel(Properties 
props, SparkDriver spark, ESDriver es) {
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionNode.java 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionNode.java
similarity index 91%
rename from 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionNode.java
rename to 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionNode.java
index 6378615..91a29e1 100644
--- 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionNode.java
+++ 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionNode.java
@@ -11,7 +11,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.session;
 
 import java.io.UnsupportedEncodingException;
 import java.net.URLDecoder;
@@ -19,6 +19,8 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.apache.sdap.mudrod.main.MudrodConstants;
+
 /**
  * ClassName: SessionNode Function: Functions related to a node in a session
  * tree sturcture.
@@ -65,13 +67,13 @@ public SessionNode() {
    * @param time:    request time of node
    * @param seq:     sequence of this node
    */
-  public SessionNode(String request, String logType, String referer, String 
basicUrl, String time, int seq) {
+  public SessionNode(Properties props, String request, String logType, String 
referer, String time, int seq) {
     this.logType = logType;
     this.time = time;
     this.seq = seq;
     this.setRequest(request);
-    this.setReferer(referer, basicUrl);
-    this.setKey(request, logType);
+    this.setReferer(referer, props.getProperty(MudrodConstants.BASE_URL));
+    this.setKey(props, request, logType);
   }
 
   /**
@@ -94,7 +96,7 @@ public void setReferer(String referer, String basicUrl) {
    */
   public void setRequest(String req) {
     this.request = req;
-    if (this.logType.equals("PO.DAAC")) {
+    if (this.logType.equals(MudrodConstants.HTTP_LOG)) {
       this.parseRequest(req);
     }
   }
@@ -156,19 +158,19 @@ public Boolean bSame(SessionNode node) {
    * @param request request url
    * @param logType url type
    */
-  public void setKey(String request, String logType) {
+  public void setKey(Properties props, String request, String logType) {
     this.key = "";
-    String datasetlist = "/datasetlist?";
-    String dataset = "/dataset/";
+    String datasetlist = props.getProperty(MudrodConstants.SEARCH_MARKER);
+    String dataset = props.getProperty(MudrodConstants.VIEW_MARKER);
     if (logType.equals("ftp")) {
       this.key = "ftp";
     } else if (logType.equals("root")) {
       this.key = "root";
     } else {
       if (request.contains(datasetlist)) {
-        this.key = "datasetlist";
+        this.key = MudrodConstants.SEARCH_MARKER;
       } else if (request.contains(dataset) /* || request.contains(granule) */) 
{
-        this.key = "dataset";
+        this.key = MudrodConstants.VIEW_MARKER;
       }
     }
   }
diff --git 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionTree.java 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionTree.java
similarity index 95%
rename from 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionTree.java
rename to 
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionTree.java
index 7d31129..5531f83 100644
--- 
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionTree.java
+++ 
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionTree.java
@@ -11,7 +11,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.session;
 
 import com.google.gson.Gson;
 import com.google.gson.JsonElement;
@@ -20,7 +20,7 @@
 import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract;
 import org.apache.sdap.mudrod.driver.ESDriver;
 import org.apache.sdap.mudrod.main.MudrodConstants;
-
+import org.apache.sdap.mudrod.weblog.structure.log.RequestUrl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -68,7 +68,7 @@
    */
   public SessionTree(Properties props, ESDriver es, SessionNode rootData, 
String sessionID, String cleanupType) {
     super(props, es, null);
-    root = new SessionNode("root", "root", "", 
props.getProperty(MudrodConstants.BASE_URL), "", 0);
+    root = new SessionNode(props, "root", "root", "", "", 0);
     tmpnode = root;
     this.sessionID = sessionID;
     this.cleanupType = cleanupType;
@@ -84,7 +84,7 @@ public SessionTree(Properties props, ESDriver es, SessionNode 
rootData, String s
    */
   public SessionTree(Properties props, ESDriver es, String sessionID, String 
cleanupType) {
     super(props, es, null);
-    root = new SessionNode("root", "root", "", 
props.getProperty(MudrodConstants.BASE_URL), "", 0);
+    root = new SessionNode(props, "root", "root", "", "", 0);
     root.setParent(root);
     tmpnode = root;
     this.sessionID = sessionID;
@@ -99,15 +99,15 @@ public SessionTree(Properties props, ESDriver es, String 
sessionID, String clean
    */
   public SessionNode insert(SessionNode node) {
     // begin with datasetlist
-    if 
(props.getProperty(MudrodConstants.SEARCH_MARKER).equals(node.getKey())) {
+    if (MudrodConstants.SEARCH_MARKER.equals(node.getKey())) {
       this.binsert = true;
     }
     if (!this.binsert) {
       return null;
     }
     // remove unrelated node
-    if 
(!props.getProperty(MudrodConstants.SEARCH_MARKER).equals(node.getKey()) &&
-            
!props.getProperty(MudrodConstants.VIEW_MARKER).equals(node.getKey()) &&
+    if (!MudrodConstants.SEARCH_MARKER.equals(node.getKey()) &&
+            !MudrodConstants.VIEW_MARKER.equals(node.getKey()) &&
             !MudrodConstants.FTP_LOG.equals(node.getKey())) {
       return null;
     }
@@ -125,7 +125,7 @@ public SessionNode insert(SessionNode node) {
 
     // record insert node
     tmpnode = node;
-    if ("dataset".equals(node.getKey())) {
+    if (MudrodConstants.VIEW_MARKER.equals(node.getKey())) {
       latestDatasetnode = node;
     }
 
@@ -190,7 +190,7 @@ public JsonObject treeToJson(SessionNode node) {
    *
    * @return {@link ClickStream}
    */
-  public List<ClickStream> getClickStreamList() {
+  public List<ClickStream> getClickStreamList(Properties props) {
 
     List<ClickStream> clickthroughs = new ArrayList<>();
     List<SessionNode> viewnodes = this.getViewNodes(this.root);
@@ -198,7 +198,7 @@ public JsonObject treeToJson(SessionNode node) {
       SessionNode parent = viewnode.getParent();
       List<SessionNode> children = viewnode.getChildren();
 
-      if (!"datasetlist".equals(parent.getKey())) {
+      if (!MudrodConstants.SEARCH_MARKER.equals(parent.getKey())) {
         continue;
       }
 
@@ -413,7 +413,7 @@ private boolean insertHelper(SessionNode entry, SessionNode 
node) {
   private List<SessionNode> getViewNodes(SessionNode node) {
 
     List<SessionNode> viewnodes = new ArrayList<>();
-    if ("dataset".equals(node.getKey())) {
+    if (MudrodConstants.VIEW_MARKER.equals(node.getKey())) {
       viewnodes.add(node);
     }
 
@@ -428,7 +428,7 @@ private boolean insertHelper(SessionNode entry, SessionNode 
node) {
   }
 
   private List<SessionNode> getQueryNodes(SessionNode node) {
-    return this.getNodes(node, "datasetlist");
+    return this.getNodes(node, MudrodConstants.SEARCH_MARKER);
   }
 
   private List<SessionNode> getNodes(SessionNode node, String nodeKey) {
diff --git a/core/src/main/resources/config.properties 
b/core/src/main/resources/config.properties
index 6e2bd54..495d29c 100644
--- a/core/src/main/resources/config.properties
+++ b/core/src/main/resources/config.properties
@@ -29,7 +29,7 @@ mudrod.spark.optimize = repartition
 mudrod.log.index = log
 mudrod.ftp.prefix = FTP.
 mudrod.http.prefix = WWW.
-mudrod.base.url = http://podaac.jpl.nasa.gov
+mudrod.base.url = http://podaac.jpl.nasa.gov/
 mudrod.black.request.list = .js, .css, .jpg, .png, .ico, image_captcha, 
autocomplete, .gif, /alldata/, /api/, get / http/1.1, .jpeg, /ws/
 mudrod.black.agent.list = crawler, googlebot, bingbot, slurp, yacybot, 
rogerbot, yandexbot, -, apache-httpclient, java, curl
 mudrod.search.freq = 100
diff --git 
a/service/src/main/java/org/apache/sdap/mudrod/services/search/SessionDetailResource.java
 
b/service/src/main/java/org/apache/sdap/mudrod/services/search/SessionDetailResource.java
index dc31993..074378d 100644
--- 
a/service/src/main/java/org/apache/sdap/mudrod/services/search/SessionDetailResource.java
+++ 
b/service/src/main/java/org/apache/sdap/mudrod/services/search/SessionDetailResource.java
@@ -18,7 +18,7 @@
 
 import org.apache.sdap.mudrod.main.MudrodConstants;
 import org.apache.sdap.mudrod.main.MudrodEngine;
-import org.apache.sdap.mudrod.weblog.structure.Session;
+import org.apache.sdap.mudrod.weblog.structure.session.Session;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

Reply via email to