[
https://issues.apache.org/jira/browse/SDAP-120?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16589173#comment-16589173
]
ASF GitHub Bot commented on SDAP-120:
-------------------------------------
lewismc closed pull request #32: SDAP-120 Error trying to ingest logs
URL: https://github.com/apache/incubator-sdap-mudrod/pull/32
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git
a/core/src/test/java/org/apache/sdap/mudrod/weblog/structure/TestApacheAccessLog.java
b/core/src/test/java/org/apache/sdap/mudrod/weblog/structure/TestApacheAccessLog.java
new file mode 100644
index 0000000..e1453a5
--- /dev/null
+++
b/core/src/test/java/org/apache/sdap/mudrod/weblog/structure/TestApacheAccessLog.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.sdap.mudrod.weblog.structure;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.text.ParseException;
+import java.util.Properties;
+
+import static org.junit.Assert.assertNotEquals;
+
+
+public class TestApacheAccessLog {
+
+ private static Properties testProperties = new Properties();
+
+ @BeforeClass
+ public static void loadProperties() throws IOException {
+
+ URL configURL =
ClassLoader.getSystemClassLoader().getResource("config.properties");
+
+ assert configURL != null : "Could not load config.properties";
+ try (InputStream instream = new FileInputStream(configURL.getFile())) {
+ testProperties.load(instream);
+ }
+ }
+
+ @Test
+ public void testLogMatch() throws IOException, ParseException {
+
+
+ String testLogLine = "198.118.243.84 - - [31/Dec/2017:23:59:20 +0000]
\"GET
/events?page=12&%25252525252525252525252525252525252525253Bsort=asc&order=field_location&sort=desc
HTTP/1.1\" 200 86173";
+
+ String result = ApacheAccessLog.parseFromLogLine(testLogLine,
testProperties);
+
+ assertNotEquals("Log line does not match", "{}", result);
+ }
+}
diff --git a/core/src/test/resources/config.properties
b/core/src/test/resources/config.properties
new file mode 100644
index 0000000..4c8991e
--- /dev/null
+++ b/core/src/test/resources/config.properties
@@ -0,0 +1,74 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Database configuration
+mudrod.cluster.name=MudrodES
+mudrod.es.transport.tcp.port = 9300
+mudrod.es.unicast.hosts = 127.0.0.1
+mudrod.es.http.port = 9200
+mudrod.es.index = mudrod
+
+# Spark related
+# Log processing type. Possible values include 'sequential' or 'parallel'
+mudrod.processing.type = parallel
+mudrod.spark.app.name = MudrodSparkApp
+mudrod.spark.master = local[4]
+mudrod.spark.optimize = repartition
+
+# Web log processing configuration
+# index name has to be all lowercase
+mudrod.log.index = log
+mudrod.ftp.prefix = FTP.
+mudrod.http.prefix = WWW.
+mudrod.base.url = http://podaac.jpl.nasa.gov
+mudrod.black.request.list = .js, .css, .jpg, .png, .ico, image_captcha,
autocomplete, .gif, /alldata/, /api/, get / http/1.1, .jpeg, /ws/
+mudrod.black.agent.list = crawler, googlebot, bingbot, slurp, yacybot,
rogerbot, yandexbot, -, apache-httpclient, java, curl
+mudrod.search.freq = 100
+mudrod.view.freq = 200
+mudrod.download.freq = 100
+mudrod.request.rate = 30
+mudrod.session.port = 8080
+mudrod.session.url = /mudrod-service/session.html
+mudrod.request.time.gap = 600
+mudrod.view.url.marker = /dataset/
+mudrod.search.url.marker = /datasetlist?
+# In order to better parse a URL (getting searching keyword, etc.), please
consider customizing
+# org.apache.sdap.mudrod.weblog.structure.RequestUrl - GetSearchInfo,
getFilterInfo
+
+# User search history
+mudrod.query.min = 0
+mudrod.user.history.weight = 2
+
+# clickstream
+mudrod.download.weight = 3
+mudrod.clickstream.svd.d = 50
+mudrod.clickstream.weight = 2
+
+# metadata
+mudrod.metadata.download = 0
+mudrod.metadata.download.url =
https://podaac.jpl.nasa.gov/api/dataset?startIndex=$startIndex&entries=10&sortField=Dataset-AllTimePopularity&sortOrder=asc&id=&value=&search=
+mudrod.metadata.svd.d = 50
+mudrod.metadata.url = null
+mudrod.metadata.weight = 1
+mudrod.metadata.type = RawMetadata
+
+# ranking, ${svmSgdModel.value} is resolved at build time. See the property in
core/pom.xml for the value
+mudrod.ranking.machine.learning = 1
+mudrod.ranking.model = ${svmSgdModel.value}.zip
+
+# recommendation
+mudrod.metadata.id = Dataset-ShortName
+mudrod.metadata.semantic.fields =
DatasetParameter-Term,DatasetParameter-Variable,Dataset-ExtractTerm
+
+# ontology service implementation. Possible values include EsipPortal -
EsipPortalOntology EsipCOR - EsipCOROntology Local -
org.apache.sdap.mudrod.ontology.process.Local
+mudrod.ontology.implementation = Local
+mudrod.ontology.weight = 2
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
> Error trying to ingest logs
> ---------------------------
>
> Key: SDAP-120
> URL: https://issues.apache.org/jira/browse/SDAP-120
> Project: Apache Science Data Analytics Platform
> Issue Type: Bug
> Components: mudrod
> Reporter: Frank Greguska
> Priority: Blocker
>
> Trying to ingest January 2018 logs results in error
>
> {quote}
> 2018-07-09 18:06:29,119 INFO server.Server (Server.java:doStart(379)) -
> Started @3794ms
> 2018-07-09 18:06:29,381 INFO handler.ContextHandler
> (ContextHandler.java:doStart(744)) - Started
> o.s.j.s.ServletContextHandler@11dcd42c{/metrics/json,null,AVAILABLE}
> 2018-07-09 18:06:29,874 INFO discoveryengine.WeblogDiscoveryEngine
> (WeblogDiscoveryEngine.java:<init>(51)) - Started Mudrod Weblog Discovery
> Engine.
> 2018-07-09 18:06:29,874 INFO discoveryengine.WeblogDiscoveryEngine
> (WeblogDiscoveryEngine.java:preprocess(98)) - Starting Web log preprocessing.
> 2018-07-09 18:06:29,875 INFO discoveryengine.WeblogDiscoveryEngine
> (WeblogDiscoveryEngine.java:preprocess(106)) - Processing logs dated 201801.gz
> 2018-07-09 18:06:30,013 INFO pre.ImportLogFile
> (ImportLogFile.java:execute(80)) - Starting Log Import 201801.gz
> 2018-07-09 18:06:31,084 INFO util.Version (Version.java:logVersion(108)) -
> Elasticsearch Hadoop v5.2.0 [d85a257f9f]
> 2018-07-09 18:06:31,451 INFO rdd.EsRDDWriter
> (RestService.java:createWriter(562)) - Writing to [log201801.gz/raw.http]
> 2018-07-09 18:08:15,371 INFO rdd.EsRDDWriter
> (RestService.java:createWriter(562)) - Writing to [log201801.gz/raw.ftp]
> 2018-07-09 18:13:15,916 INFO pre.ImportLogFile
> (ImportLogFile.java:execute(84)) - Log Import complete. Time elapsed 405
> seconds
> 2018-07-09 18:13:15,925 INFO pre.CrawlerDetection
> (CrawlerDetection.java:execute(82)) - Starting Crawler detection raw.http
> 2018-07-09 18:13:16,262 ERROR main.MudrodEngine (MudrodEngine.java:main(395))
> - Error whilst parsing command line.
> java.lang.IllegalArgumentException: [size] must be greater than 0. Found [0]
> in [Users]
> at
> org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder.size(TermsAggregationBuilder.java:148)
> at
> org.apache.sdap.mudrod.weblog.pre.LogAbstract.getUserTerms(LogAbstract.java:127)
> at
> org.apache.sdap.mudrod.weblog.pre.LogAbstract.getUserDocs(LogAbstract.java:135)
> at
> org.apache.sdap.mudrod.weblog.pre.LogAbstract.getUserRDD(LogAbstract.java:100)
> at
> org.apache.sdap.mudrod.weblog.pre.CrawlerDetection.checkByRateInParallel(CrawlerDetection.java:112)
> at
> org.apache.sdap.mudrod.weblog.pre.CrawlerDetection.execute(CrawlerDetection.java:85)
> at
> org.apache.sdap.mudrod.discoveryengine.WeblogDiscoveryEngine.preprocess(WeblogDiscoveryEngine.java:112)
> at
> org.apache.sdap.mudrod.main.MudrodEngine.startFullIngest(MudrodEngine.java:240)
> at org.apache.sdap.mudrod.main.MudrodEngine.main(MudrodEngine.java:385)
> {quote}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)