[ 
https://issues.apache.org/jira/browse/SDAP-120?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16589175#comment-16589175
 ] 

ASF GitHub Bot commented on SDAP-120:
-------------------------------------

lewismc closed pull request #32: SDAP-120 Error trying to ingest logs
URL: https://github.com/apache/incubator-sdap-mudrod/pull/32
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git 
a/core/src/test/java/org/apache/sdap/mudrod/weblog/structure/TestApacheAccessLog.java
 
b/core/src/test/java/org/apache/sdap/mudrod/weblog/structure/TestApacheAccessLog.java
new file mode 100644
index 0000000..e1453a5
--- /dev/null
+++ 
b/core/src/test/java/org/apache/sdap/mudrod/weblog/structure/TestApacheAccessLog.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.sdap.mudrod.weblog.structure;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.text.ParseException;
+import java.util.Properties;
+
+import static org.junit.Assert.assertNotEquals;
+
+
+public class TestApacheAccessLog {
+
+    private static Properties testProperties = new Properties();
+
+    @BeforeClass
+    public static void loadProperties() throws IOException {
+
+        URL configURL = 
ClassLoader.getSystemClassLoader().getResource("config.properties");
+
+        assert configURL != null : "Could not load config.properties";
+        try (InputStream instream = new FileInputStream(configURL.getFile())) {
+            testProperties.load(instream);
+        }
+    }
+
+    @Test
+    public void testLogMatch() throws IOException, ParseException {
+
+
+        String testLogLine = "198.118.243.84 - - [31/Dec/2017:23:59:20 +0000] 
\"GET 
/events?page=12&amp%25252525252525252525252525252525252525253Bsort=asc&order=field_location&sort=desc
 HTTP/1.1\" 200 86173";
+
+        String result = ApacheAccessLog.parseFromLogLine(testLogLine, 
testProperties);
+
+        assertNotEquals("Log line does not match", "{}", result);
+    }
+}
diff --git a/core/src/test/resources/config.properties 
b/core/src/test/resources/config.properties
new file mode 100644
index 0000000..4c8991e
--- /dev/null
+++ b/core/src/test/resources/config.properties
@@ -0,0 +1,74 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you 
+# may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Database configuration
+mudrod.cluster.name=MudrodES
+mudrod.es.transport.tcp.port = 9300
+mudrod.es.unicast.hosts = 127.0.0.1
+mudrod.es.http.port = 9200
+mudrod.es.index = mudrod
+    
+# Spark related
+# Log processing type. Possible values include 'sequential' or 'parallel'
+mudrod.processing.type = parallel
+mudrod.spark.app.name = MudrodSparkApp
+mudrod.spark.master = local[4]
+mudrod.spark.optimize = repartition
+    
+# Web log processing configuration
+# index name has to be all lowercase
+mudrod.log.index = log
+mudrod.ftp.prefix = FTP.
+mudrod.http.prefix = WWW.
+mudrod.base.url = http://podaac.jpl.nasa.gov
+mudrod.black.request.list = .js, .css, .jpg, .png, .ico, image_captcha, 
autocomplete, .gif, /alldata/, /api/, get / http/1.1, .jpeg, /ws/
+mudrod.black.agent.list = crawler, googlebot, bingbot, slurp, yacybot, 
rogerbot, yandexbot, -, apache-httpclient, java, curl
+mudrod.search.freq = 100
+mudrod.view.freq = 200
+mudrod.download.freq = 100
+mudrod.request.rate = 30
+mudrod.session.port = 8080
+mudrod.session.url = /mudrod-service/session.html
+mudrod.request.time.gap = 600   
+mudrod.view.url.marker = /dataset/
+mudrod.search.url.marker = /datasetlist?
+# In order to better parse a URL (getting searching keyword, etc.), please 
consider custimize 
+# org.apache.sdap.mudrod.weblog.structure.RequestUrl - GetSearchInfo, 
getFilterInfo
+       
+# User search history
+mudrod.query.min = 0
+mudrod.user.history.weight = 2
+       
+# clickstream
+mudrod.download.weight = 3
+mudrod.clickstream.svd.d = 50
+mudrod.clickstream.weight = 2
+                               
+# metadata
+mudrod.metadata.download = 0
+mudrod.metadata.download.url = 
https://podaac.jpl.nasa.gov/api/dataset?startIndex=$startIndex&entries=10&sortField=Dataset-AllTimePopularity&sortOrder=asc&id=&value=&search=
+mudrod.metadata.svd.d = 50
+mudrod.metadata.url = null
+mudrod.metadata.weight = 1
+mudrod.metadata.type = RawMetadata
+               
+# ranking, ${svmSgdModel.value} is resolved at build time. See the property in 
core/pom.xml for the value
+mudrod.ranking.machine.learning = 1
+mudrod.ranking.model = ${svmSgdModel.value}.zip
+               
+# recommendation
+mudrod.metadata.id = Dataset-ShortName
+mudrod.metadata.semantic.fields = 
DatasetParameter-Term,DatasetParameter-Variable,Dataset-ExtractTerm
+
+# ontology service implementation. Possible values include EsipPortal - 
EsipPortalOntology EsipCOR - EsipCOROntology Local - 
org.apache.sdap.mudrod.ontology.process.Local
+mudrod.ontology.implementation = Local
+mudrod.ontology.weight = 2


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Error trying to ingest logs
> ---------------------------
>
>                 Key: SDAP-120
>                 URL: https://issues.apache.org/jira/browse/SDAP-120
>             Project: Apache Science Data Analytics Platform
>          Issue Type: Bug
>          Components: mudrod
>            Reporter: Frank Greguska
>            Priority: Blocker
>
> Trying to ingest January 2018 logs results in error
>  
> {quote}
> 2018-07-09 18:06:29,119 INFO  server.Server (Server.java:doStart(379)) - 
> Started @3794ms
> 2018-07-09 18:06:29,381 INFO  handler.ContextHandler 
> (ContextHandler.java:doStart(744)) - Started 
> o.s.j.s.ServletContextHandler@11dcd42c{/metrics/json,null,AVAILABLE}
> 2018-07-09 18:06:29,874 INFO  discoveryengine.WeblogDiscoveryEngine 
> (WeblogDiscoveryEngine.java:<init>(51)) - Started Mudrod Weblog Discovery 
> Engine.
> 2018-07-09 18:06:29,874 INFO  discoveryengine.WeblogDiscoveryEngine 
> (WeblogDiscoveryEngine.java:preprocess(98)) - Starting Web log preprocessing.
> 2018-07-09 18:06:29,875 INFO  discoveryengine.WeblogDiscoveryEngine 
> (WeblogDiscoveryEngine.java:preprocess(106)) - Processing logs dated 201801.gz
> 2018-07-09 18:06:30,013 INFO  pre.ImportLogFile 
> (ImportLogFile.java:execute(80)) - Starting Log Import 201801.gz
> 2018-07-09 18:06:31,084 INFO  util.Version (Version.java:logVersion(108)) - 
> Elasticsearch Hadoop v5.2.0 [d85a257f9f]
> 2018-07-09 18:06:31,451 INFO  rdd.EsRDDWriter 
> (RestService.java:createWriter(562)) - Writing to [log201801.gz/raw.http]
> 2018-07-09 18:08:15,371 INFO  rdd.EsRDDWriter 
> (RestService.java:createWriter(562)) - Writing to [log201801.gz/raw.ftp]
> 2018-07-09 18:13:15,916 INFO  pre.ImportLogFile 
> (ImportLogFile.java:execute(84)) - Log Import complete. Time elapsed 405 
> seconds
> 2018-07-09 18:13:15,925 INFO  pre.CrawlerDetection 
> (CrawlerDetection.java:execute(82)) - Starting Crawler detection raw.http
> 2018-07-09 18:13:16,262 ERROR main.MudrodEngine (MudrodEngine.java:main(395)) 
> - Error whilst parsing command line.
> java.lang.IllegalArgumentException: [size] must be greater than 0. Found [0] 
> in [Users]
>  at 
> org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder.size(TermsAggregationBuilder.java:148)
>  at 
> org.apache.sdap.mudrod.weblog.pre.LogAbstract.getUserTerms(LogAbstract.java:127)
>  at 
> org.apache.sdap.mudrod.weblog.pre.LogAbstract.getUserDocs(LogAbstract.java:135)
>  at 
> org.apache.sdap.mudrod.weblog.pre.LogAbstract.getUserRDD(LogAbstract.java:100)
>  at 
> org.apache.sdap.mudrod.weblog.pre.CrawlerDetection.checkByRateInParallel(CrawlerDetection.java:112)
>  at 
> org.apache.sdap.mudrod.weblog.pre.CrawlerDetection.execute(CrawlerDetection.java:85)
>  at 
> org.apache.sdap.mudrod.discoveryengine.WeblogDiscoveryEngine.preprocess(WeblogDiscoveryEngine.java:112)
>  at 
> org.apache.sdap.mudrod.main.MudrodEngine.startFullIngest(MudrodEngine.java:240)
>  at org.apache.sdap.mudrod.main.MudrodEngine.main(MudrodEngine.java:385)
> {quote}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to