This is an automated email from the ASF dual-hosted git repository.
lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-sdap-mudrod.git
The following commit(s) were added to refs/heads/master by this push:
new c2dce18 SDAP-55 (#15)
c2dce18 is described below
commit c2dce1846ef1742601cb89db9271e03857a9978f
Author: quintinali <[email protected]>
AuthorDate: Mon May 14 04:37:05 2018 +0800
SDAP-55 (#15)
* clean weblog module
(1) remove sequential log processing code
(2) fix bugs of clickstream extraction
* clean code
remove useless blank line
* try to solve conflicts
---
.../apache/sdap/mudrod/main/MudrodConstants.java | 2 -
.../recommendation/pre/SessionCooccurence.java | 2 +-
.../mudrod/weblog/pre/ClickStreamGenerator.java | 4 +-
.../sdap/mudrod/weblog/pre/CrawlerDetection.java | 36 +---
.../sdap/mudrod/weblog/pre/ImportLogFile.java | 195 +--------------------
.../weblog/pre/RankingTrainDataGenerator.java | 4 +-
.../sdap/mudrod/weblog/pre/SessionGenerator.java | 16 +-
.../sdap/mudrod/weblog/pre/SessionStatistic.java | 22 +--
.../structure/{ => log}/ApacheAccessLog.java | 2 +-
.../weblog/structure/{ => log}/Coordinates.java | 2 +-
.../mudrod/weblog/structure/{ => log}/FtpLog.java | 2 +-
.../mudrod/weblog/structure/{ => log}/GeoIp.java | 2 +-
.../weblog/structure/{ => log}/RequestUrl.java | 2 +-
.../mudrod/weblog/structure/{ => log}/WebLog.java | 2 +-
.../structure/{ => session}/ClickStream.java | 2 +-
.../structure/{ => session}/RankingTrainData.java | 2 +-
.../weblog/structure/{ => session}/Session.java | 8 +-
.../structure/{ => session}/SessionExtractor.java | 44 +----
.../structure/{ => session}/SessionNode.java | 22 +--
.../structure/{ => session}/SessionTree.java | 24 +--
core/src/main/resources/config.properties | 2 +-
.../services/search/SessionDetailResource.java | 2 +-
22 files changed, 61 insertions(+), 338 deletions(-)
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/main/MudrodConstants.java
b/core/src/main/java/org/apache/sdap/mudrod/main/MudrodConstants.java
index 0c8bcc2..84ba347 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/main/MudrodConstants.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/main/MudrodConstants.java
@@ -98,8 +98,6 @@ public interface MudrodConstants {
public static final String ONTOLOGY_INPUT_PATH =
"mudrod.ontology.input.path";
- public static final String PROCESS_TYPE = "mudrod.processing.type";
-
/** Defined on CLI */
public static final String METADATA_DOWNLOAD = "mudrod.metadata.download";
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/recommendation/pre/SessionCooccurence.java
b/core/src/main/java/org/apache/sdap/mudrod/recommendation/pre/SessionCooccurence.java
index a4a79e0..4a5d4bb 100644
---
a/core/src/main/java/org/apache/sdap/mudrod/recommendation/pre/SessionCooccurence.java
+++
b/core/src/main/java/org/apache/sdap/mudrod/recommendation/pre/SessionCooccurence.java
@@ -19,7 +19,7 @@ import org.apache.sdap.mudrod.driver.SparkDriver;
import org.apache.sdap.mudrod.main.MudrodConstants;
import org.apache.sdap.mudrod.utils.LabeledRowMatrix;
import org.apache.sdap.mudrod.utils.MatrixUtil;
-import org.apache.sdap.mudrod.weblog.structure.SessionExtractor;
+import org.apache.sdap.mudrod.weblog.structure.session.SessionExtractor;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.PairFunction;
import org.elasticsearch.action.search.SearchResponse;
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/ClickStreamGenerator.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/ClickStreamGenerator.java
index 886cd4a..2c38d29 100644
---
a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/ClickStreamGenerator.java
+++
b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/ClickStreamGenerator.java
@@ -19,8 +19,8 @@ import org.apache.sdap.mudrod.driver.SparkDriver;
import org.apache.sdap.mudrod.main.MudrodConstants;
import org.apache.sdap.mudrod.utils.LabeledRowMatrix;
import org.apache.sdap.mudrod.utils.MatrixUtil;
-import org.apache.sdap.mudrod.weblog.structure.ClickStream;
-import org.apache.sdap.mudrod.weblog.structure.SessionExtractor;
+import org.apache.sdap.mudrod.weblog.structure.session.ClickStream;
+import org.apache.sdap.mudrod.weblog.structure.session.SessionExtractor;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.slf4j.Logger;
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/CrawlerDetection.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/CrawlerDetection.java
index 3e782a7..704ccfd 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/CrawlerDetection.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/CrawlerDetection.java
@@ -79,7 +79,7 @@ public class CrawlerDetection extends LogAbstract {
LOG.info("Starting Crawler detection {}.", httpType);
startTime = System.currentTimeMillis();
try {
- checkByRate();
+ checkByRateInParallel();
} catch (InterruptedException | IOException e) {
LOG.error("Encountered an error whilst detecting Web crawlers.", e);
}
@@ -103,40 +103,6 @@ public class CrawlerDetection extends LogAbstract {
return false;
}
- public void checkByRate() throws InterruptedException, IOException {
- String processingType = props.getProperty(MudrodConstants.PROCESS_TYPE);
- if (processingType.equals("sequential")) {
- checkByRateInSequential();
- } else if (processingType.equals("parallel")) {
- checkByRateInParallel();
- }
- }
-
- /**
- * Check crawler by request sending rate, which is read from configruation
- * file
- *
- * @throws InterruptedException InterruptedException
- * @throws IOException IOException
- */
- public void checkByRateInSequential() throws InterruptedException,
IOException {
- es.createBulkProcessor();
-
- int rate =
Integer.parseInt(props.getProperty(MudrodConstants.REQUEST_RATE));
-
- Terms users = this.getUserTerms(this.httpType);
- LOG.info("Original User count: {}",
Integer.toString(users.getBuckets().size()));
-
- int userCount = 0;
- for (Terms.Bucket entry : users.getBuckets()) {
- String user = entry.getKey().toString();
- int count = checkByRate(es, user);
- userCount += count;
- }
- es.destroyBulkProcessor();
- LOG.info("User count: {}", Integer.toString(userCount));
- }
-
void checkByRateInParallel() throws InterruptedException, IOException {
JavaRDD<String> userRDD = getUserRDD(this.httpType);
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/ImportLogFile.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/ImportLogFile.java
index c7622d6..6e5f473 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/ImportLogFile.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/ImportLogFile.java
@@ -16,8 +16,8 @@ package org.apache.sdap.mudrod.weblog.pre;
import org.apache.sdap.mudrod.driver.ESDriver;
import org.apache.sdap.mudrod.driver.SparkDriver;
import org.apache.sdap.mudrod.main.MudrodConstants;
-import org.apache.sdap.mudrod.weblog.structure.ApacheAccessLog;
-import org.apache.sdap.mudrod.weblog.structure.FtpLog;
+import org.apache.sdap.mudrod.weblog.structure.log.ApacheAccessLog;
+import org.apache.sdap.mudrod.weblog.structure.log.FtpLog;
import org.apache.spark.api.java.JavaRDD;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
@@ -55,6 +55,11 @@ public class ImportLogFile extends LogAbstract {
public static final int NUM_FIELDS = 9;
Pattern p = Pattern.compile(logEntryPattern);
transient Matcher matcher;
+
+ @Override
+ public Object execute(Object o) {
+ return null;
+ }
/**
* Constructor supporting a number of parameters documented below.
@@ -146,31 +151,7 @@ public class ImportLogFile extends LogAbstract {
return;
}
- String processingType = props.getProperty(MudrodConstants.PROCESS_TYPE,
"parallel");
- if (processingType.equals("sequential")) {
- readFileInSequential(httplogpath, ftplogpath);
- } else if (processingType.equals("parallel")) {
- readFileInParallel(httplogpath, ftplogpath);
- }
- }
-
- /**
- * Read the FTP or HTTP log path with the intention of processing lines from
- * log files.
- *
- * @param httplogpath path to the parent directory containing http logs
- * @param ftplogpath path to the parent directory containing ftp logs
- */
- public void readFileInSequential(String httplogpath, String ftplogpath) {
- es.createBulkProcessor();
- try {
- readLogFile(httplogpath, "http", logIndex, httpType);
- readLogFile(ftplogpath, "FTP", logIndex, ftpType);
-
- } catch (IOException e) {
- LOG.error("Error whilst reading log file.", e);
- }
- es.destroyBulkProcessor();
+ readFileInParallel(httplogpath, ftplogpath);
}
/**
@@ -181,7 +162,6 @@ public class ImportLogFile extends LogAbstract {
* @param ftplogpath path to the parent directory containing ftp logs
*/
public void readFileInParallel(String httplogpath, String ftplogpath) {
-
importHttpfile(httplogpath);
importFtpfile(ftplogpath);
}
@@ -189,171 +169,12 @@ public class ImportLogFile extends LogAbstract {
public void importHttpfile(String httplogpath) {
// import http logs
JavaRDD<String> accessLogs = spark.sc.textFile(httplogpath,
this.partition).map(s -> ApacheAccessLog.parseFromLogLine(s,
props)).filter(ApacheAccessLog::checknull);
-
JavaEsSpark.saveJsonToEs(accessLogs, logIndex + "/" + this.httpType);
}
public void importFtpfile(String ftplogpath) {
// import ftp logs
JavaRDD<String> ftpLogs = spark.sc.textFile(ftplogpath,
this.partition).map(s -> FtpLog.parseFromLogLine(s,
props)).filter(FtpLog::checknull);
-
JavaEsSpark.saveJsonToEs(ftpLogs, logIndex + "/" + this.ftpType);
}
-
- /**
- * Process a log path on local file system which contains the relevant
- * parameters as below.
- *
- * @param fileName the {@link java.lang.String} path to the log directory on
file
- * system
- * @param protocol whether to process 'http' or 'FTP'
- * @param index the index name to write logs to
- * @param type one of the available protocols from which Mudrod logs are
obtained.
- * @throws IOException if there is an error reading anything from the
fileName provided.
- */
- public void readLogFile(String fileName, String protocol, String index,
String type) throws IOException {
- BufferedReader br = new BufferedReader(new FileReader(fileName));
- int count = 0;
- try {
- String line = br.readLine();
- while (line != null) {
- if ("FTP".equals(protocol)) {
- parseSingleLineFTP(line, index, type);
- } else {
- parseSingleLineHTTP(line, index, type);
- }
- line = br.readLine();
- count++;
- }
- } catch (FileNotFoundException e) {
- LOG.error("File not found.", e);
- } catch (IOException e) {
- LOG.error("Error reading input directory.", e);
- } finally {
- br.close();
- LOG.info("Num of {} entries:\t{}", protocol, count);
- }
- }
-
- /**
- * Parse a single FTP log entry
- *
- * @param log a single log line
- * @param index the index name we wish to persist the log line to
- * @param type one of the available protocols from which Mudrod logs are
obtained.
- */
- public void parseSingleLineFTP(String log, String index, String type) {
- String ip = log.split(" +")[6];
-
- String time = log.split(" +")[1] + ":" + log.split(" +")[2] + ":" +
log.split(" +")[3] + ":" + log.split(" +")[4];
-
- time = switchtoNum(time);
- SimpleDateFormat formatter = new SimpleDateFormat("MM:dd:HH:mm:ss:yyyy");
- Date date = null;
- try {
- date = formatter.parse(time);
- } catch (ParseException e) {
- LOG.error("Error whilst parsing the date.", e);
- }
- String bytes = log.split(" +")[7];
-
- String request = log.split(" +")[8].toLowerCase();
-
- if (!request.contains("/misc/") && !request.contains("readme")) {
- IndexRequest ir;
- try {
- ir = new IndexRequest(index, type)
- .source(jsonBuilder()
- .startObject()
- .field("LogType", MudrodConstants.FTP_LOG)
- .field("IP", ip)
- .field("Time", date)
- .field("Request", request)
- .field("Bytes", Long.parseLong(bytes))
- .endObject());
- es.getBulkProcessor().add(ir);
- } catch (NumberFormatException e) {
- LOG.error("Error whilst processing numbers", e);
- } catch (IOException e) {
- LOG.error("IOError whilst adding to the bulk processor.", e);
- }
- }
-
- }
-
- /**
- * Parse a single HTTP log entry
- *
- * @param log a single log line
- * @param index the index name we wish to persist the log line to
- * @param type one of the available protocols from which Mudrod logs are
obtained.
- */
- public void parseSingleLineHTTP(String log, String index, String type) {
- matcher = p.matcher(log);
- if (!matcher.matches() || NUM_FIELDS != matcher.groupCount()) {
- return;
- }
- String time = matcher.group(4);
- time = switchtoNum(time);
- SimpleDateFormat formatter = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss");
- Date date = null;
- try {
- date = formatter.parse(time);
- } catch (ParseException e) {
- LOG.error("Error whilst attempting to parse date.", e);
- }
-
- String bytes = matcher.group(7);
- if ("-".equals(bytes)) {
- bytes = "0";
- }
-
- String request = matcher.group(5).toLowerCase();
- String agent = matcher.group(9);
- CrawlerDetection crawlerDe = new CrawlerDetection(this.props, this.es,
this.spark);
- if (!crawlerDe.checkKnownCrawler(agent)) {
- boolean tag = false;
- String[] mimeTypes =
props.getProperty(MudrodConstants.BLACK_LIST_REQUEST).split(",");
- for(String str:mimeTypes) {
- if (request.contains(str.trim())) {
- tag = true;
- break;
- }
- }
-
- if (!tag) {
- IndexRequest ir = null;
- executeBulkRequest(ir, index, type, matcher, date, bytes);
- }
- }
- }
-
- private void executeBulkRequest(IndexRequest ir, String index, String type,
Matcher matcher, Date date, String bytes) {
- IndexRequest newIr = ir;
- try {
- newIr = new IndexRequest(index, type)
- .source(jsonBuilder()
- .startObject()
- .field("LogType", MudrodConstants.HTTP_LOG)
- .field("IP", matcher.group(1))
- .field("Time", date)
- .field("Request", matcher.group(5))
- .field("Response", matcher.group(6))
- .field("Bytes", Integer.parseInt(bytes))
- .field("Referer", matcher.group(8))
- .field("Browser", matcher.group(9))
- .endObject());
-
- es.getBulkProcessor().add(newIr);
- } catch (NumberFormatException e) {
- LOG.error("Error whilst processing numbers", e);
- } catch (IOException e) {
- LOG.error("IOError whilst adding to the bulk processor.", e);
- }
- }
-
- @Override
- public Object execute(Object o) {
- return null;
- }
}
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/RankingTrainDataGenerator.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/RankingTrainDataGenerator.java
index 766e853..de41d56 100644
---
a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/RankingTrainDataGenerator.java
+++
b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/RankingTrainDataGenerator.java
@@ -3,8 +3,8 @@ package org.apache.sdap.mudrod.weblog.pre;
import org.apache.sdap.mudrod.discoveryengine.DiscoveryStepAbstract;
import org.apache.sdap.mudrod.driver.ESDriver;
import org.apache.sdap.mudrod.driver.SparkDriver;
-import org.apache.sdap.mudrod.weblog.structure.RankingTrainData;
-import org.apache.sdap.mudrod.weblog.structure.SessionExtractor;
+import org.apache.sdap.mudrod.weblog.structure.session.RankingTrainData;
+import org.apache.sdap.mudrod.weblog.structure.session.SessionExtractor;
import org.apache.spark.api.java.JavaRDD;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/SessionGenerator.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/SessionGenerator.java
index 4e170d7..4ce1535 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/SessionGenerator.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/SessionGenerator.java
@@ -16,7 +16,7 @@ package org.apache.sdap.mudrod.weblog.pre;
import org.apache.sdap.mudrod.driver.ESDriver;
import org.apache.sdap.mudrod.driver.SparkDriver;
import org.apache.sdap.mudrod.main.MudrodConstants;
-import org.apache.sdap.mudrod.weblog.structure.Session;
+import org.apache.sdap.mudrod.weblog.structure.session.Session;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
@@ -93,21 +93,15 @@ public class SessionGenerator extends LogAbstract {
}
public void genSessionByReferer(int timeThres) throws InterruptedException,
IOException {
- String processingType = props.getProperty(MudrodConstants.PROCESS_TYPE);
- if (processingType.equals("sequential")) {
- genSessionByRefererInSequential(timeThres);
- } else if (processingType.equals("parallel")) {
+
genSessionByRefererInParallel(timeThres);
- }
+
}
public void combineShortSessions(int timeThres) throws InterruptedException,
IOException {
- String processingType = props.getProperty(MudrodConstants.PROCESS_TYPE);
- if (processingType.equals("sequential")) {
- combineShortSessionsInSequential(timeThres);
- } else if (processingType.equals("parallel")) {
+
combineShortSessionsInParallel(timeThres);
- }
+
}
/**
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/SessionStatistic.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/SessionStatistic.java
index 3cee9c7..981bece 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/SessionStatistic.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/pre/SessionStatistic.java
@@ -16,7 +16,7 @@ package org.apache.sdap.mudrod.weblog.pre;
import org.apache.sdap.mudrod.driver.ESDriver;
import org.apache.sdap.mudrod.driver.SparkDriver;
import org.apache.sdap.mudrod.main.MudrodConstants;
-import org.apache.sdap.mudrod.weblog.structure.RequestUrl;
+import org.apache.sdap.mudrod.weblog.structure.log.RequestUrl;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
@@ -81,27 +81,7 @@ public class SessionStatistic extends LogAbstract {
}
public void processSession() throws InterruptedException, IOException,
ExecutionException {
- String processingType = props.getProperty(MudrodConstants.PROCESS_TYPE);
- if ("sequential".equals(processingType)) {
- processSessionInSequential();
- } else if ("parallel".equals(processingType)) {
processSessionInParallel();
- }
- }
-
- public void processSessionInSequential() throws IOException,
InterruptedException, ExecutionException {
- es.createBulkProcessor();
- Terms sessions = this.getSessionTerms();
- int sessionCount = 0;
- for (Terms.Bucket entry : sessions.getBuckets()) {
- if (entry.getDocCount() >= 3 && !"invalid".equals(entry.getKey())) {
- String sessionid = entry.getKey().toString();
- int sessionNum = processSession(es, sessionid);
- sessionCount += sessionNum;
- }
- }
- LOG.info("Final Session count: {}", Integer.toString(sessionCount));
- es.destroyBulkProcessor();
}
/**
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/ApacheAccessLog.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/ApacheAccessLog.java
similarity index 98%
rename from
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/ApacheAccessLog.java
rename to
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/ApacheAccessLog.java
index 050d19d..8224b33 100644
---
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/ApacheAccessLog.java
+++
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/ApacheAccessLog.java
@@ -11,7 +11,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.log;
import com.google.gson.Gson;
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/Coordinates.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/Coordinates.java
similarity index 92%
rename from
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/Coordinates.java
rename to
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/Coordinates.java
index 5e6fd07..43a7642 100644
---
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/Coordinates.java
+++
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/Coordinates.java
@@ -11,7 +11,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.log;
public class Coordinates {
/*
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/FtpLog.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/FtpLog.java
similarity index 97%
rename from
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/FtpLog.java
rename to
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/FtpLog.java
index 9f39655..91fa228 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/FtpLog.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/FtpLog.java
@@ -11,7 +11,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.log;
import com.google.gson.Gson;
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/GeoIp.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/GeoIp.java
similarity index 96%
rename from
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/GeoIp.java
rename to
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/GeoIp.java
index d3e94dc..154cd98 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/GeoIp.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/GeoIp.java
@@ -11,7 +11,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.log;
import org.apache.sdap.mudrod.utils.HttpRequest;
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/RequestUrl.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/RequestUrl.java
similarity index 99%
rename from
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/RequestUrl.java
rename to
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/RequestUrl.java
index 05a3395..0c365a4 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/RequestUrl.java
+++
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/RequestUrl.java
@@ -11,7 +11,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.log;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/WebLog.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/WebLog.java
similarity index 97%
rename from
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/WebLog.java
rename to
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/WebLog.java
index 93a9747..9046992 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/WebLog.java
+++ b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/log/WebLog.java
@@ -11,7 +11,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.log;
import java.io.Serializable;
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/ClickStream.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/ClickStream.java
similarity index 98%
rename from
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/ClickStream.java
rename to
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/ClickStream.java
index 2f0c34d..7e5069e 100644
---
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/ClickStream.java
+++
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/ClickStream.java
@@ -11,7 +11,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.session;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/RankingTrainData.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/RankingTrainData.java
similarity index 98%
rename from
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/RankingTrainData.java
rename to
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/RankingTrainData.java
index cf4ec23..bdf477a 100644
---
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/RankingTrainData.java
+++
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/RankingTrainData.java
@@ -1,4 +1,4 @@
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.session;
import java.io.Serializable;
import java.util.Map;
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/Session.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/Session.java
similarity index 97%
rename from
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/Session.java
rename to
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/Session.java
index 31bef0c..2c917a6 100644
--- a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/Session.java
+++
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/Session.java
@@ -11,7 +11,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.session;
import com.google.gson.Gson;
import com.google.gson.JsonElement;
@@ -188,8 +188,10 @@ public class Session /*extends MudrodAbstract*/ implements
Comparable<Session> {
} catch (UnsupportedEncodingException e) {
LOG.error("Erro whilst obtaining the Session Tree: {}", e);
}
+
+ //tree.printTree(tree.root);
- List<ClickStream> clickthroughs = tree.getClickStreamList();
+ List<ClickStream> clickthroughs = tree.getClickStreamList(props);
return clickthroughs;
}
@@ -215,7 +217,7 @@ public class Session /*extends MudrodAbstract*/ implements
Comparable<Session> {
String logType = (String) result.get("LogType");
String referer = (String) result.get("Referer");
- SessionNode node = new SessionNode(request, logType, referer,
props.getProperty(MudrodConstants.BASE_URL), time, seq);
+ SessionNode node = new SessionNode(props, request, logType, referer,
time, seq);
tree.insert(node);
seq++;
}
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionExtractor.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionExtractor.java
similarity index 90%
rename from
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionExtractor.java
rename to
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionExtractor.java
index 85b6961..f7eb602 100644
---
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionExtractor.java
+++
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionExtractor.java
@@ -11,7 +11,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.session;
import org.apache.sdap.mudrod.driver.ESDriver;
import org.apache.sdap.mudrod.driver.SparkDriver;
@@ -70,47 +70,7 @@ public class SessionExtractor implements Serializable {
* @return clickstream list in JavaRDD format {@link ClickStream}
*/
public JavaRDD<ClickStream> extractClickStreamFromES(Properties props,
ESDriver es, SparkDriver spark) {
- switch (props.getProperty(MudrodConstants.PROCESS_TYPE)) {
- case "sequential":
- List<ClickStream> queryList = this.getClickStreamList(props, es);
- return spark.sc.parallelize(queryList);
- case "parallel":
- return getClickStreamListInParallel(props, spark, es);
- default:
- LOG.error("Error finding processing type for '{}'. Please check your
config.xml.", props.getProperty(MudrodConstants.PROCESS_TYPE));
- }
- return null;
- }
-
- /**
- * getClickStreamList:Extract click streams from logs stored in
Elasticsearch.
- *
- * @param props
- * the Mudrod configuration
- * @param es
- * the Elasticsearch driver
- * @return clickstream list {@link ClickStream}
- */
- protected List<ClickStream> getClickStreamList(Properties props, ESDriver
es) {
- List<String> logIndexList =
es.getIndexListWithPrefix(props.getProperty(MudrodConstants.LOG_INDEX));
-
- List<ClickStream> result = new ArrayList<>();
- for (String logIndex : logIndexList) {
- List<String> sessionIdList;
- try {
- sessionIdList = this.getSessions(props, es, logIndex);
- Session session = new Session(props, es);
- for (String aSessionIdList : sessionIdList) {
- String[] sArr = aSessionIdList.split(",");
- List<ClickStream> datas = session.getClickStreamList(sArr[1],
sArr[2], sArr[0]);
- result.addAll(datas);
- }
- } catch (Exception e) {
- LOG.error("Error during extraction of Clickstreams from log index.
{}", e);
- }
- }
-
- return result;
+ return getClickStreamListInParallel(props, spark, es);
}
protected JavaRDD<ClickStream> getClickStreamListInParallel(Properties
props, SparkDriver spark, ESDriver es) {
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionNode.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionNode.java
similarity index 91%
rename from
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionNode.java
rename to
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionNode.java
index 6378615..91a29e1 100644
---
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionNode.java
+++
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionNode.java
@@ -11,7 +11,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.session;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
@@ -19,6 +19,8 @@ import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.apache.sdap.mudrod.main.MudrodConstants;
+
/**
* ClassName: SessionNode Function: Functions related to a node in a session
* tree sturcture.
@@ -65,13 +67,13 @@ public class SessionNode {
* @param time: request time of node
* @param seq: sequence of this node
*/
- public SessionNode(String request, String logType, String referer, String
basicUrl, String time, int seq) {
+ public SessionNode(Properties props, String request, String logType, String
referer, String time, int seq) {
this.logType = logType;
this.time = time;
this.seq = seq;
this.setRequest(request);
- this.setReferer(referer, basicUrl);
- this.setKey(request, logType);
+ this.setReferer(referer, props.getProperty(MudrodConstants.BASE_URL));
+ this.setKey(props, request, logType);
}
/**
@@ -94,7 +96,7 @@ public class SessionNode {
*/
public void setRequest(String req) {
this.request = req;
- if (this.logType.equals("PO.DAAC")) {
+ if (this.logType.equals(MudrodConstants.HTTP_LOG)) {
this.parseRequest(req);
}
}
@@ -156,19 +158,19 @@ public class SessionNode {
* @param request request url
* @param logType url type
*/
- public void setKey(String request, String logType) {
+ public void setKey(Properties props, String request, String logType) {
this.key = "";
- String datasetlist = "/datasetlist?";
- String dataset = "/dataset/";
+ String datasetlist = props.getProperty(MudrodConstants.SEARCH_MARKER);
+ String dataset = props.getProperty(MudrodConstants.VIEW_MARKER);
if (logType.equals("ftp")) {
this.key = "ftp";
} else if (logType.equals("root")) {
this.key = "root";
} else {
if (request.contains(datasetlist)) {
- this.key = "datasetlist";
+ this.key = MudrodConstants.SEARCH_MARKER;
} else if (request.contains(dataset) /* || request.contains(granule) */)
{
- this.key = "dataset";
+ this.key = MudrodConstants.VIEW_MARKER;
}
}
}
diff --git
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionTree.java
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionTree.java
similarity index 95%
rename from
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionTree.java
rename to
core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionTree.java
index 7d31129..5531f83 100644
---
a/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/SessionTree.java
+++
b/core/src/main/java/org/apache/sdap/mudrod/weblog/structure/session/SessionTree.java
@@ -11,7 +11,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.sdap.mudrod.weblog.structure;
+package org.apache.sdap.mudrod.weblog.structure.session;
import com.google.gson.Gson;
import com.google.gson.JsonElement;
@@ -20,7 +20,7 @@ import com.google.gson.JsonObject;
import org.apache.sdap.mudrod.discoveryengine.MudrodAbstract;
import org.apache.sdap.mudrod.driver.ESDriver;
import org.apache.sdap.mudrod.main.MudrodConstants;
-
+import org.apache.sdap.mudrod.weblog.structure.log.RequestUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -68,7 +68,7 @@ public class SessionTree extends MudrodAbstract {
*/
public SessionTree(Properties props, ESDriver es, SessionNode rootData,
String sessionID, String cleanupType) {
super(props, es, null);
- root = new SessionNode("root", "root", "",
props.getProperty(MudrodConstants.BASE_URL), "", 0);
+ root = new SessionNode(props, "root", "root", "", "", 0);
tmpnode = root;
this.sessionID = sessionID;
this.cleanupType = cleanupType;
@@ -84,7 +84,7 @@ public class SessionTree extends MudrodAbstract {
*/
public SessionTree(Properties props, ESDriver es, String sessionID, String
cleanupType) {
super(props, es, null);
- root = new SessionNode("root", "root", "",
props.getProperty(MudrodConstants.BASE_URL), "", 0);
+ root = new SessionNode(props, "root", "root", "", "", 0);
root.setParent(root);
tmpnode = root;
this.sessionID = sessionID;
@@ -99,15 +99,15 @@ public class SessionTree extends MudrodAbstract {
*/
public SessionNode insert(SessionNode node) {
// begin with datasetlist
- if
(props.getProperty(MudrodConstants.SEARCH_MARKER).equals(node.getKey())) {
+ if (MudrodConstants.SEARCH_MARKER.equals(node.getKey())) {
this.binsert = true;
}
if (!this.binsert) {
return null;
}
// remove unrelated node
- if
(!props.getProperty(MudrodConstants.SEARCH_MARKER).equals(node.getKey()) &&
-
!props.getProperty(MudrodConstants.VIEW_MARKER).equals(node.getKey()) &&
+ if (!MudrodConstants.SEARCH_MARKER.equals(node.getKey()) &&
+ !MudrodConstants.VIEW_MARKER.equals(node.getKey()) &&
!MudrodConstants.FTP_LOG.equals(node.getKey())) {
return null;
}
@@ -125,7 +125,7 @@ public class SessionTree extends MudrodAbstract {
// record insert node
tmpnode = node;
- if ("dataset".equals(node.getKey())) {
+ if (MudrodConstants.VIEW_MARKER.equals(node.getKey())) {
latestDatasetnode = node;
}
@@ -190,7 +190,7 @@ public class SessionTree extends MudrodAbstract {
*
* @return {@link ClickStream}
*/
- public List<ClickStream> getClickStreamList() {
+ public List<ClickStream> getClickStreamList(Properties props) {
List<ClickStream> clickthroughs = new ArrayList<>();
List<SessionNode> viewnodes = this.getViewNodes(this.root);
@@ -198,7 +198,7 @@ public class SessionTree extends MudrodAbstract {
SessionNode parent = viewnode.getParent();
List<SessionNode> children = viewnode.getChildren();
- if (!"datasetlist".equals(parent.getKey())) {
+ if (!MudrodConstants.SEARCH_MARKER.equals(parent.getKey())) {
continue;
}
@@ -413,7 +413,7 @@ public class SessionTree extends MudrodAbstract {
private List<SessionNode> getViewNodes(SessionNode node) {
List<SessionNode> viewnodes = new ArrayList<>();
- if ("dataset".equals(node.getKey())) {
+ if (MudrodConstants.VIEW_MARKER.equals(node.getKey())) {
viewnodes.add(node);
}
@@ -428,7 +428,7 @@ public class SessionTree extends MudrodAbstract {
}
private List<SessionNode> getQueryNodes(SessionNode node) {
- return this.getNodes(node, "datasetlist");
+ return this.getNodes(node, MudrodConstants.SEARCH_MARKER);
}
private List<SessionNode> getNodes(SessionNode node, String nodeKey) {
diff --git a/core/src/main/resources/config.properties
b/core/src/main/resources/config.properties
index 6e2bd54..495d29c 100644
--- a/core/src/main/resources/config.properties
+++ b/core/src/main/resources/config.properties
@@ -29,7 +29,7 @@ mudrod.spark.optimize = repartition
mudrod.log.index = log
mudrod.ftp.prefix = FTP.
mudrod.http.prefix = WWW.
-mudrod.base.url = http://podaac.jpl.nasa.gov
+mudrod.base.url = http://podaac.jpl.nasa.gov/
mudrod.black.request.list = .js, .css, .jpg, .png, .ico, image_captcha,
autocomplete, .gif, /alldata/, /api/, get / http/1.1, .jpeg, /ws/
mudrod.black.agent.list = crawler, googlebot, bingbot, slurp, yacybot,
rogerbot, yandexbot, -, apache-httpclient, java, curl
mudrod.search.freq = 100
diff --git
a/service/src/main/java/org/apache/sdap/mudrod/services/search/SessionDetailResource.java
b/service/src/main/java/org/apache/sdap/mudrod/services/search/SessionDetailResource.java
index dc31993..074378d 100644
---
a/service/src/main/java/org/apache/sdap/mudrod/services/search/SessionDetailResource.java
+++
b/service/src/main/java/org/apache/sdap/mudrod/services/search/SessionDetailResource.java
@@ -18,7 +18,7 @@ import com.google.gson.JsonObject;
import org.apache.sdap.mudrod.main.MudrodConstants;
import org.apache.sdap.mudrod.main.MudrodEngine;
-import org.apache.sdap.mudrod.weblog.structure.Session;
+import org.apache.sdap.mudrod.weblog.structure.session.Session;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
--
To stop receiving notification emails like this one, please contact
[email protected].