Repository: incubator-gobblin Updated Branches: refs/heads/master 70cbe91b9 -> adb810a7b
[GOBBLIN-271] Move the grok converter to the gobblin-grok module Project: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/commit/791306b8 Tree: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/tree/791306b8 Diff: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/diff/791306b8 Branch: refs/heads/master Commit: 791306b866dc02f56ab28ffa099feea411efb028 Parents: 8284bb7 Author: Hung Tran <[email protected]> Authored: Fri Sep 29 10:07:23 2017 -0700 Committer: Hung Tran <[email protected]> Committed: Fri Sep 29 10:07:23 2017 -0700 ---------------------------------------------------------------------- gobblin-core/build.gradle | 1 - .../converter/grok/GrokToJsonConverter.java | 219 ------------------- .../src/main/resources/grok/grok-base-patterns | 97 -------- .../converter/grok/GrokToJsonConverterTest.java | 118 ---------- .../converter/grok/convertedRecord.json | 1 - .../grok/convertedS3AccessLogRecord.json | 1 - .../converter/grok/s3AccessLogSchema.json | 138 ------------ .../grok/schemaWithNonNullableFields.json | 66 ------ .../grok/schemaWithNullableFields.json | 66 ------ .../src/test/resources/grok/grok-patterns | 97 -------- gobblin-modules/gobblin-grok/build.gradle | 32 +++ .../converter/grok/GrokToJsonConverter.java | 219 +++++++++++++++++++ .../src/main/resources/grok/grok-base-patterns | 97 ++++++++ .../converter/grok/GrokToJsonConverterTest.java | 118 ++++++++++ .../converter/grok/convertedRecord.json | 1 + .../grok/convertedS3AccessLogRecord.json | 1 + .../converter/grok/s3AccessLogSchema.json | 138 ++++++++++++ .../grok/schemaWithNonNullableFields.json | 66 ++++++ .../grok/schemaWithNullableFields.json | 66 ++++++ .../src/test/resources/grok/grok-patterns | 97 ++++++++ 20 files changed, 835 insertions(+), 804 deletions(-) ---------------------------------------------------------------------- 
http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-core/build.gradle ---------------------------------------------------------------------- diff --git a/gobblin-core/build.gradle b/gobblin-core/build.gradle index 9eeae56..ee7a77c 100644 --- a/gobblin-core/build.gradle +++ b/gobblin-core/build.gradle @@ -56,7 +56,6 @@ dependencies { compile externalDependency.findBugsAnnotations compile externalDependency.oltu compile externalDependency.opencsv - compile externalDependency.grok compile externalDependency.hadoopHdfs runtime externalDependency.protobuf http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-core/src/main/java/org/apache/gobblin/converter/grok/GrokToJsonConverter.java ---------------------------------------------------------------------- diff --git a/gobblin-core/src/main/java/org/apache/gobblin/converter/grok/GrokToJsonConverter.java b/gobblin-core/src/main/java/org/apache/gobblin/converter/grok/GrokToJsonConverter.java deleted file mode 100644 index 1568eb7..0000000 --- a/gobblin-core/src/main/java/org/apache/gobblin/converter/grok/GrokToJsonConverter.java +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.gobblin.converter.grok; - -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.InputStreamReader; -import java.io.UnsupportedEncodingException; -import java.util.List; -import java.util.regex.Pattern; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.gson.JsonArray; -import com.google.gson.JsonElement; -import com.google.gson.JsonNull; -import com.google.gson.JsonObject; -import com.google.gson.JsonParser; - -import io.thekraken.grok.api.Grok; -import io.thekraken.grok.api.Match; -import io.thekraken.grok.api.exception.GrokException; - -import org.apache.gobblin.configuration.WorkUnitState; -import org.apache.gobblin.converter.Converter; -import org.apache.gobblin.converter.DataConversionException; -import org.apache.gobblin.converter.SchemaConversionException; -import org.apache.gobblin.converter.SingleRecordIterable; -import org.apache.gobblin.util.DatasetFilterUtils; - - -/** - * GrokToJsonConverter accepts already deserialized text row, String, where you can use. - * - * Converts Text to JSON based on Grok pattern. Schema is represented by the form of JsonArray same interface being used by CsvToJonConverter. - * Each text record is represented by a String. - * The converter only supports Grok patterns where every group is named because it uses the group names as column names. 
- * - * The following config properties can be set: - * The grok pattern to use for the conversion: - * converter.grokToJsonConverter.grokPattern ="^%{IPORHOST:clientip} (?:-|%{USER:ident}) (?:-|%{USER:auth}) \[%{HTTPDATE:timestamp}\] \"(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|-)\" %{NUMBER:response} (?:-|%{NUMBER:bytes})" - * - * Path to the file which contains the base grok patterns which can be used in the converter's GROK pattern (if not set it will use the default ones): - * converter.grokToJsonConverter.baseGrokPatternsFile= - ** - * Specify a comma separated list of regexes which will be applied on the fields and matched one will be converted to json null: - * converter.grokToJsonConverter.nullStringRegexes="[-\s]" - * - * Example of schema: - * [ - { - "columnName": "Day", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "string" - } - }, - { - "columnName": "Pageviews", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "long" - } - } - ] - */ -public class GrokToJsonConverter extends Converter<String, JsonArray, String, JsonObject> { - - private static final Logger LOG = LoggerFactory.getLogger(GrokToJsonConverter.class); - private static final JsonParser JSON_PARSER = new JsonParser(); - private static final String COLUMN_NAME_KEY = "columnName"; - private static final String DATA_TYPE = "dataType"; - private static final String TYPE_KEY = "type"; - private static final String NULLABLE = "isNullable"; - - public static final String GROK_PATTERN = "converter.grokToJsonConverter.grokPattern"; - public static final String BASE_PATTERNS_FILE = "converter.grokToJsonConverter.baseGrokPatternsFile"; - public static final String NULLSTRING_REGEXES = "converter.grokToJsonConverter.nullStringRegexes"; - - public static final String DEFAULT_GROK_PATTERNS_FILE = "/grok/grok-patterns"; - - private List<Pattern> nullStringRegexes; - - private Grok grok; - - @Override - public Converter<String, JsonArray, 
String, JsonObject> init(WorkUnitState workUnit) { - super.init(workUnit); - String pattern = workUnit.getProp(GROK_PATTERN); - String patternsFile = workUnit.getProp(BASE_PATTERNS_FILE); - this.nullStringRegexes = DatasetFilterUtils.getPatternsFromStrings(workUnit.getPropAsList(NULLSTRING_REGEXES, "")); - - InputStreamReader grokPatterns; - try { - if (patternsFile == null) { - grokPatterns = new InputStreamReader(getClass().getResourceAsStream("/grok/grok-base-patterns"), "UTF8"); - } else { - grokPatterns = new InputStreamReader(new FileInputStream(patternsFile), "UTF8"); - } - grok = new Grok(); - grok.addPatternFromReader(grokPatterns); - grok.compile(pattern); - } catch (GrokException | FileNotFoundException | UnsupportedEncodingException e) { - throw new RuntimeException("Error initializing GROK: " + e); - } - - return this; - } - - @Override - public JsonArray convertSchema(String inputSchema, WorkUnitState workUnit) - throws SchemaConversionException { - Preconditions.checkNotNull(inputSchema, "inputSchema is required."); - return JSON_PARSER.parse(inputSchema).getAsJsonArray(); - } - - /** - * Converts Text (String) to JSON based on a Grok regexp expression. - * By default, fields between Text and JSON are mapped by Grok SEMANTIC which is the identifier you give to the piece of text being matched in your Grok expression. 
- * - * - * e.g: - * {@inheritDoc} - * @see Converter#convertRecord(Object, Object, WorkUnitState) - */ - @Override - public Iterable<JsonObject> convertRecord(JsonArray outputSchema, String inputRecord, WorkUnitState workUnit) - throws DataConversionException { - - JsonObject outputRecord = createOutput(outputSchema, inputRecord); - - LOG.debug("Converted into " + outputRecord); - - return new SingleRecordIterable<JsonObject>(outputRecord); - } - - @VisibleForTesting - JsonObject createOutput(JsonArray outputSchema, String inputRecord) - throws DataConversionException { - JsonObject outputRecord = new JsonObject(); - - Match gm = grok.match(inputRecord); - gm.captures(); - - JsonElement capturesJson = JSON_PARSER.parse(gm.toJson()); - - for (JsonElement anOutputSchema : outputSchema) { - JsonObject outputSchemaJsonObject = anOutputSchema.getAsJsonObject(); - String key = outputSchemaJsonObject.get(COLUMN_NAME_KEY).getAsString(); - String type = outputSchemaJsonObject.getAsJsonObject(DATA_TYPE).get(TYPE_KEY).getAsString(); - - if (isFieldNull(capturesJson, key)) { - if (!outputSchemaJsonObject.get(NULLABLE).getAsBoolean()) { - throw new DataConversionException( - "Field " + key + " is null or not exists but it is non-nullable by the schema."); - } - outputRecord.add(key, JsonNull.INSTANCE); - } else { - JsonElement jsonElement = capturesJson.getAsJsonObject().get(key); - switch (type) { - case "int": - outputRecord.addProperty(key, jsonElement.getAsInt()); - break; - case "long": - outputRecord.addProperty(key, jsonElement.getAsLong()); - break; - case "double": - outputRecord.addProperty(key, jsonElement.getAsDouble()); - break; - case "float": - outputRecord.addProperty(key, jsonElement.getAsFloat()); - break; - case "boolean": - outputRecord.addProperty(key, jsonElement.getAsBoolean()); - break; - case "string": - default: - outputRecord.addProperty(key, jsonElement.getAsString()); - } - } - } - return outputRecord; - } - - private boolean 
isFieldNull(JsonElement capturesJson, String key) { - JsonObject jsonObject = capturesJson.getAsJsonObject(); - - if (!jsonObject.has(key)) { - return true; - } - - for (Pattern pattern : this.nullStringRegexes) { - if (pattern.matcher(jsonObject.get(key).getAsString()).matches()) { - return true; - } - } - - return false; - } -} http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-core/src/main/resources/grok/grok-base-patterns ---------------------------------------------------------------------- diff --git a/gobblin-core/src/main/resources/grok/grok-base-patterns b/gobblin-core/src/main/resources/grok/grok-base-patterns deleted file mode 100644 index 3793e02..0000000 --- a/gobblin-core/src/main/resources/grok/grok-base-patterns +++ /dev/null @@ -1,97 +0,0 @@ -#Forked from https://github.com/logstash-plugins/logstash-patterns-core/blob/master/patterns/grok-patterns - -USERNAME [a-zA-Z0-9._-]+ -USER %{USERNAME} -EMAILLOCALPART [a-zA-Z][a-zA-Z0-9_.+-=:]+ -EMAILADDRESS %{EMAILLOCALPART}@%{HOSTNAME} -INT (?:[+-]?(?:[0-9]+)) -BASE10NUM (?<![0-9.+-])(?>[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+))) -NUMBER (?:%{BASE10NUM}) -BASE16NUM (?<![0-9A-Fa-f])(?:[+-]?(?:0x)?(?:[0-9A-Fa-f]+)) -BASE16FLOAT \b(?<![0-9A-Fa-f.])(?:[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+)))\b - -POSINT \b(?:[1-9][0-9]*)\b -NONNEGINT \b(?:[0-9]+)\b -WORD \b\w+\b -NOTSPACE \S+ -SPACE \s* -DATA .*? 
-GREEDYDATA .* -QUOTEDSTRING (?>(?<!\\)(?>"(?>\\.|[^\\"]+)+"|""|(?>'(?>\\.|[^\\']+)+')|''|(?>`(?>\\.|[^\\`]+)+`)|``)) -UUID [A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12} -# URN, allowing use of RFC 2141 section 2.3 reserved characters -URN urn:[0-9A-Za-z][0-9A-Za-z-]{0,31}:(?:%[0-9a-fA-F]{2}|[0-9A-Za-z()+,.:=@;$_!*'/?#-])+ - -# Networking -MAC (?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC}) -CISCOMAC (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4}) -WINDOWSMAC (?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2}) -COMMONMAC (?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2}) -IPV6 ((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5 ]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)? 
-IPV4 (?<![0-9])(?:(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5]))(?![0-9]) -IP (?:%{IPV6}|%{IPV4}) -HOSTNAME \b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b) -IPORHOST (?:%{IP}|%{HOSTNAME}) -HOSTPORT %{IPORHOST}:%{POSINT} - -# paths -PATH (?:%{UNIXPATH}|%{WINPATH}) -UNIXPATH (/([\w_%!$@:.,+~-]+|\\.)*)+ -TTY (?:/dev/(pts|tty([pq])?)(\w+)?/?(?:[0-9]+)) -WINPATH (?>[A-Za-z]+:|\\)(?:\\[^\\?*]*)+ -URIPROTO [A-Za-z]([A-Za-z0-9+\-.]+)+ -URIHOST %{IPORHOST}(?::%{POSINT:port})? -# uripath comes loosely from RFC1738, but mostly from what Firefox -# doesn't turn into %XX -URIPATH (?:/[A-Za-z0-9$.+!*'(){},~:;=@#%&_\-]*)+ -#URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)? -URIPARAM \?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\-\[\]<>]* -URIPATHPARAM %{URIPATH}(?:%{URIPARAM})? -URI %{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})? - -# Months: January, Feb, 3, 03, 12, December -MONTH \b(?:[Jj]an(?:uary|uar)?|[Ff]eb(?:ruary|ruar)?|[Mm](?:a|ä)?r(?:ch|z)?|[Aa]pr(?:il)?|[Mm]a(?:y|i)?|[Jj]un(?:e|i)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ep(?:tember)?|[Oo](?:c|k)?t(?:ober)?|[Nn]ov(?:ember)?|[Dd]e(?:c|z)(?:ember)?)\b -MONTHNUM (?:0?[1-9]|1[0-2]) -MONTHNUM2 (?:0[1-9]|1[0-2]) -MONTHDAY (?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9]) - -# Days: Monday, Tue, Thu, etc... -DAY (?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?) - -# Years? -YEAR (?>\d\d){1,2} -HOUR (?:2[0123]|[01]?[0-9]) -MINUTE (?:[0-5][0-9]) -# '60' is a leap second in most time standards and thus is valid. -SECOND (?:(?:[0-5]?[0-9]|60)(?:[:.,][0-9]+)?) 
-TIME (?!<[0-9])%{HOUR}:%{MINUTE}(?::%{SECOND})(?![0-9]) -# datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it) -DATE_US %{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR} -DATE_EU %{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR} -ISO8601_TIMEZONE (?:Z|[+-]%{HOUR}(?::?%{MINUTE})) -ISO8601_SECOND (?:%{SECOND}|60) -TIMESTAMP_ISO8601 %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}? -DATE %{DATE_US}|%{DATE_EU} -DATESTAMP %{DATE}[- ]%{TIME} -TZ (?:[APMCE][SD]T|UTC) -DATESTAMP_RFC822 %{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ} -DATESTAMP_RFC2822 %{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{TIME} %{ISO8601_TIMEZONE} -DATESTAMP_OTHER %{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR} -DATESTAMP_EVENTLOG %{YEAR}%{MONTHNUM2}%{MONTHDAY}%{HOUR}%{MINUTE}%{SECOND} - -# Syslog Dates: Month Day HH:MM:SS -SYSLOGTIMESTAMP %{MONTH} +%{MONTHDAY} %{TIME} -PROG [\x21-\x5a\x5c\x5e-\x7e]+ -SYSLOGPROG %{PROG:program}(?:\[%{POSINT:pid}\])? -SYSLOGHOST %{IPORHOST} -SYSLOGFACILITY <%{NONNEGINT:facility}.%{NONNEGINT:priority}> -HTTPDATE %{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{INT} - -# Shortcuts -QS %{QUOTEDSTRING} - -# Log formats -SYSLOGBASE %{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}: - -# Log Levels -LOGLEVEL ([Aa]lert|ALERT|[Tt]race|TRACE|[Dd]ebug|DEBUG|[Nn]otice|NOTICE|[Ii]nfo|INFO|[Ww]arn?(?:ing)?|WARN?(?:ING)?|[Ee]rr?(?:or)?|ERR?(?:OR)?|[Cc]rit?(?:ical)?|CRIT?(?:ICAL)?|[Ff]atal|FATAL|[Ss]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?) 
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-core/src/test/java/org/apache/gobblin/converter/grok/GrokToJsonConverterTest.java ---------------------------------------------------------------------- diff --git a/gobblin-core/src/test/java/org/apache/gobblin/converter/grok/GrokToJsonConverterTest.java b/gobblin-core/src/test/java/org/apache/gobblin/converter/grok/GrokToJsonConverterTest.java deleted file mode 100644 index 3a4b78b..0000000 --- a/gobblin-core/src/test/java/org/apache/gobblin/converter/grok/GrokToJsonConverterTest.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.gobblin.converter.grok; - -import java.io.InputStreamReader; - -import org.testng.Assert; -import org.testng.annotations.Test; - -import com.google.gson.JsonArray; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import com.google.gson.JsonParser; - -import gobblin.configuration.WorkUnitState; - -import org.apache.gobblin.converter.DataConversionException; - - -@Test(groups = {"gobblin.converter"}) -public class GrokToJsonConverterTest { - @Test - public void convertOutputWithNullableFields() - throws Exception { - JsonParser parser = new JsonParser(); - - String inputRecord = - "10.121.123.104 - - [01/Nov/2012:21:01:17 +0100] \"GET /cpc/auth.do?loginsetup=true&targetPage=%2Fcpc%2F HTTP/1.1\" 302 466"; - - JsonElement jsonElement = parser - .parse(new InputStreamReader(getClass().getResourceAsStream("/converter/grok/schemaWithNullableFields.json"))); - JsonArray outputSchema = jsonElement.getAsJsonArray(); - - GrokToJsonConverter grokToJsonConverter = new GrokToJsonConverter(); - WorkUnitState workUnitState = new WorkUnitState(); - workUnitState.setProp(GrokToJsonConverter.GROK_PATTERN, - "^%{IPORHOST:clientip} (?:-|%{USER:ident}) (?:-|%{USER:auth}) \\[%{HTTPDATE:timestamp}\\] \\\"(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|-)\\\" %{NUMBER:response} (?:-|%{NUMBER:bytes})"); - - grokToJsonConverter.init(workUnitState); - JsonObject actual = grokToJsonConverter.convertRecord(outputSchema, inputRecord, workUnitState).iterator().next(); - - JsonObject expected = - parser.parse(new InputStreamReader(getClass().getResourceAsStream("/converter/grok/convertedRecord.json"))) - .getAsJsonObject(); - Assert.assertEquals(actual, expected); - grokToJsonConverter.close(); - } - - @Test(expectedExceptions = DataConversionException.class) - public void convertOutputWithNonNullableFieldsShouldThrowDataConversionException() - throws Exception { - JsonParser parser = new JsonParser(); - - String inputRecord = 
- "10.121.123.104 - - [01/Nov/2012:21:01:17 +0100] \"GET /cpc/auth.do?loginsetup=true&targetPage=%2Fcpc%2F HTTP/1.1\" 302 466"; - - JsonElement jsonElement = parser.parse( - new InputStreamReader(getClass().getResourceAsStream("/converter/grok/schemaWithNonNullableFields.json"))); - JsonArray outputSchema = jsonElement.getAsJsonArray(); - - GrokToJsonConverter grokToJsonConverter = new GrokToJsonConverter(); - WorkUnitState workUnitState = new WorkUnitState(); - workUnitState.setProp(GrokToJsonConverter.GROK_PATTERN, - "^%{IPORHOST:clientip} (?:-|%{USER:ident}) (?:-|%{USER:auth}) \\[%{HTTPDATE:timestamp}\\] \\\"(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|-)\\\" %{NUMBER:response} (?:-|%{NUMBER:bytes})"); - - grokToJsonConverter.init(workUnitState); - JsonObject actual = grokToJsonConverter.convertRecord(outputSchema, inputRecord, workUnitState).iterator().next(); - - JsonObject expected = - parser.parse(new InputStreamReader(getClass().getResourceAsStream("/converter/grok/convertedRecord.json"))) - .getAsJsonObject(); - grokToJsonConverter.close(); - } - - @Test - public void convertWithNullStringSet() - throws Exception { - JsonParser parser = new JsonParser(); - - String inputRecord = - "79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be mybucket [06/Feb/2014:00:00:38 +0000] 192.0.2.3 79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be 3E57427F3EXAMPLE REST.GET.VERSIONING - \"GET /mybucket?versioning HTTP/1.1\" 200 - 113 - 7 - \"-\" \"S3Console/0.4\" -"; - - JsonElement jsonElement = - parser.parse(new InputStreamReader(getClass().getResourceAsStream("/converter/grok/s3AccessLogSchema.json"))); - JsonArray outputSchema = jsonElement.getAsJsonArray(); - - GrokToJsonConverter grokToJsonConverter = new GrokToJsonConverter(); - WorkUnitState workUnitState = new WorkUnitState(); - //Grok expression was taken from https://github.com/logstash-plugins/logstash-patterns-core/blob/master/patterns/aws - 
workUnitState.setProp(GrokToJsonConverter.GROK_PATTERN, - "%{WORD:owner} %{NOTSPACE:bucket} \\[%{HTTPDATE:timestamp}\\] %{IP:clientip} %{NOTSPACE:requester} %{NOTSPACE:request_id} %{NOTSPACE:operation} %{NOTSPACE:key} (?:\"(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})\"|-) (?:%{INT:response:int}|-) (?:-|%{NOTSPACE:error_code}) (?:%{INT:bytes:int}|-) (?:%{INT:object_size:int}|-) (?:%{INT:request_time_ms:int}|-) (?:%{INT:turnaround_time_ms:int}|-) (?:%{QS:referrer}|-) (?:\"?%{QS:agent}\"?|-) (?:-|%{NOTSPACE:version_id})"); - workUnitState.setProp(GrokToJsonConverter.NULLSTRING_REGEXES, "[\\s-]"); - - grokToJsonConverter.init(workUnitState); - JsonObject actual = grokToJsonConverter.convertRecord(outputSchema, inputRecord, workUnitState).iterator().next(); - - JsonObject expected = parser - .parse(new InputStreamReader(getClass().getResourceAsStream("/converter/grok/convertedS3AccessLogRecord.json"))) - .getAsJsonObject(); - Assert.assertEquals(actual, expected); - grokToJsonConverter.close(); - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-core/src/test/resources/converter/grok/convertedRecord.json ---------------------------------------------------------------------- diff --git a/gobblin-core/src/test/resources/converter/grok/convertedRecord.json b/gobblin-core/src/test/resources/converter/grok/convertedRecord.json deleted file mode 100644 index 69f45f1..0000000 --- a/gobblin-core/src/test/resources/converter/grok/convertedRecord.json +++ /dev/null @@ -1 +0,0 @@ -{"clientip":"10.121.123.104","ident":null,"auth":null,"timestamp":"01/Nov/2012:21:01:17 +0100","request":"/cpc/auth.do?loginsetup=true&targetPage=%2Fcpc%2F","httpversion":1.1,"response":302,"bytes":466} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-core/src/test/resources/converter/grok/convertedS3AccessLogRecord.json 
---------------------------------------------------------------------- diff --git a/gobblin-core/src/test/resources/converter/grok/convertedS3AccessLogRecord.json b/gobblin-core/src/test/resources/converter/grok/convertedS3AccessLogRecord.json deleted file mode 100644 index 29da948..0000000 --- a/gobblin-core/src/test/resources/converter/grok/convertedS3AccessLogRecord.json +++ /dev/null @@ -1 +0,0 @@ -{"owner":"79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be","bucket":"mybucket","timestamp":"06/Feb/2014:00:00:38 +0000","clientip":"192.0.2.3","requester":"79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be","request_id":"3E57427F3EXAMPLE","operation":"REST.GET.VERSIONING","key":null,"response":200,"error_code":null,"bytes":113,"object_size":null,"request_time_ms":7,"turnaround_time_ms":null,"referrer":null,"agent":"S3Console/0.4","version_id":null} http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-core/src/test/resources/converter/grok/s3AccessLogSchema.json ---------------------------------------------------------------------- diff --git a/gobblin-core/src/test/resources/converter/grok/s3AccessLogSchema.json b/gobblin-core/src/test/resources/converter/grok/s3AccessLogSchema.json deleted file mode 100644 index 934399c..0000000 --- a/gobblin-core/src/test/resources/converter/grok/s3AccessLogSchema.json +++ /dev/null @@ -1,138 +0,0 @@ -[ - { - "columnName": "owner", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "string" - } - }, - { - "columnName": "bucket", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "string" - } - }, - { - "columnName": "timestamp", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "string" - } - }, - { - "columnName": "clientip", - "comment": "", - "isNullable": "false", - "dataType": { - "type": "string" - } - }, - { - "columnName": "requester", - "comment": "", - "isNullable": "false", - "dataType": { - "type": "string" - } - 
}, - { - "columnName": "request_id", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "string" - } - }, - { - "columnName": "operation", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "string" - } - }, - { - "columnName": "key", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "string" - } - }, - { - "columnName": "response", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "int" - } - }, - { - "columnName": "error_code", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "int" - } - }, - { - "columnName": "bytes", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "int" - } - }, - { - "columnName": "object_size", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "int" - } - }, - { - "columnName": "request_time_ms", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "int" - } - }, - { - "columnName": "turnaround_time_ms", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "int" - } - }, - { - "columnName": "referrer", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "string" - } - }, - { - "columnName": "agent", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "string" - } - }, - { - "columnName": "version_id", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "string" - } - } -] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-core/src/test/resources/converter/grok/schemaWithNonNullableFields.json ---------------------------------------------------------------------- diff --git a/gobblin-core/src/test/resources/converter/grok/schemaWithNonNullableFields.json b/gobblin-core/src/test/resources/converter/grok/schemaWithNonNullableFields.json deleted file mode 100644 index a1ce180..0000000 --- a/gobblin-core/src/test/resources/converter/grok/schemaWithNonNullableFields.json +++ /dev/null @@ -1,66 +0,0 @@ -[ 
- { - "columnName": "clientip", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "string" - } - }, - { - "columnName": "ident", - "comment": "", - "isNullable": "false", - "dataType": { - "type": "string" - } - }, - { - "columnName": "auth", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "string" - } - }, - { - "columnName": "timestamp", - "comment": "", - "isNullable": "false", - "dataType": { - "type": "string" - } - }, - { - "columnName": "request", - "comment": "", - "isNullable": "false", - "dataType": { - "type": "string" - } - }, - { - "columnName": "httpversion", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "double" - } - }, - { - "columnName": "response", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "int" - } - }, - { - "columnName": "bytes", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "int" - } - } -] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-core/src/test/resources/converter/grok/schemaWithNullableFields.json ---------------------------------------------------------------------- diff --git a/gobblin-core/src/test/resources/converter/grok/schemaWithNullableFields.json b/gobblin-core/src/test/resources/converter/grok/schemaWithNullableFields.json deleted file mode 100644 index b8b0536..0000000 --- a/gobblin-core/src/test/resources/converter/grok/schemaWithNullableFields.json +++ /dev/null @@ -1,66 +0,0 @@ -[ - { - "columnName": "clientip", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "string" - } - }, - { - "columnName": "ident", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "string" - } - }, - { - "columnName": "auth", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "string" - } - }, - { - "columnName": "timestamp", - "comment": "", - "isNullable": "false", - "dataType": { - "type": "string" - } - }, - { - "columnName": 
"request", - "comment": "", - "isNullable": "false", - "dataType": { - "type": "string" - } - }, - { - "columnName": "httpversion", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "double" - } - }, - { - "columnName": "response", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "int" - } - }, - { - "columnName": "bytes", - "comment": "", - "isNullable": "true", - "dataType": { - "type": "int" - } - } -] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-core/src/test/resources/grok/grok-patterns ---------------------------------------------------------------------- diff --git a/gobblin-core/src/test/resources/grok/grok-patterns b/gobblin-core/src/test/resources/grok/grok-patterns deleted file mode 100644 index 3793e02..0000000 --- a/gobblin-core/src/test/resources/grok/grok-patterns +++ /dev/null @@ -1,97 +0,0 @@ -#Forked from https://github.com/logstash-plugins/logstash-patterns-core/blob/master/patterns/grok-patterns - -USERNAME [a-zA-Z0-9._-]+ -USER %{USERNAME} -EMAILLOCALPART [a-zA-Z][a-zA-Z0-9_.+-=:]+ -EMAILADDRESS %{EMAILLOCALPART}@%{HOSTNAME} -INT (?:[+-]?(?:[0-9]+)) -BASE10NUM (?<![0-9.+-])(?>[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+))) -NUMBER (?:%{BASE10NUM}) -BASE16NUM (?<![0-9A-Fa-f])(?:[+-]?(?:0x)?(?:[0-9A-Fa-f]+)) -BASE16FLOAT \b(?<![0-9A-Fa-f.])(?:[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+)))\b - -POSINT \b(?:[1-9][0-9]*)\b -NONNEGINT \b(?:[0-9]+)\b -WORD \b\w+\b -NOTSPACE \S+ -SPACE \s* -DATA .*? 
-GREEDYDATA .* -QUOTEDSTRING (?>(?<!\\)(?>"(?>\\.|[^\\"]+)+"|""|(?>'(?>\\.|[^\\']+)+')|''|(?>`(?>\\.|[^\\`]+)+`)|``)) -UUID [A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12} -# URN, allowing use of RFC 2141 section 2.3 reserved characters -URN urn:[0-9A-Za-z][0-9A-Za-z-]{0,31}:(?:%[0-9a-fA-F]{2}|[0-9A-Za-z()+,.:=@;$_!*'/?#-])+ - -# Networking -MAC (?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC}) -CISCOMAC (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4}) -WINDOWSMAC (?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2}) -COMMONMAC (?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2}) -IPV6 ((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5 ]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)? 
-IPV4 (?<![0-9])(?:(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5]))(?![0-9]) -IP (?:%{IPV6}|%{IPV4}) -HOSTNAME \b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b) -IPORHOST (?:%{IP}|%{HOSTNAME}) -HOSTPORT %{IPORHOST}:%{POSINT} - -# paths -PATH (?:%{UNIXPATH}|%{WINPATH}) -UNIXPATH (/([\w_%!$@:.,+~-]+|\\.)*)+ -TTY (?:/dev/(pts|tty([pq])?)(\w+)?/?(?:[0-9]+)) -WINPATH (?>[A-Za-z]+:|\\)(?:\\[^\\?*]*)+ -URIPROTO [A-Za-z]([A-Za-z0-9+\-.]+)+ -URIHOST %{IPORHOST}(?::%{POSINT:port})? -# uripath comes loosely from RFC1738, but mostly from what Firefox -# doesn't turn into %XX -URIPATH (?:/[A-Za-z0-9$.+!*'(){},~:;=@#%&_\-]*)+ -#URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)? -URIPARAM \?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\-\[\]<>]* -URIPATHPARAM %{URIPATH}(?:%{URIPARAM})? -URI %{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})? - -# Months: January, Feb, 3, 03, 12, December -MONTH \b(?:[Jj]an(?:uary|uar)?|[Ff]eb(?:ruary|ruar)?|[Mm](?:a|ä)?r(?:ch|z)?|[Aa]pr(?:il)?|[Mm]a(?:y|i)?|[Jj]un(?:e|i)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ep(?:tember)?|[Oo](?:c|k)?t(?:ober)?|[Nn]ov(?:ember)?|[Dd]e(?:c|z)(?:ember)?)\b -MONTHNUM (?:0?[1-9]|1[0-2]) -MONTHNUM2 (?:0[1-9]|1[0-2]) -MONTHDAY (?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9]) - -# Days: Monday, Tue, Thu, etc... -DAY (?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?) - -# Years? -YEAR (?>\d\d){1,2} -HOUR (?:2[0123]|[01]?[0-9]) -MINUTE (?:[0-5][0-9]) -# '60' is a leap second in most time standards and thus is valid. -SECOND (?:(?:[0-5]?[0-9]|60)(?:[:.,][0-9]+)?) 
-TIME (?!<[0-9])%{HOUR}:%{MINUTE}(?::%{SECOND})(?![0-9]) -# datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it) -DATE_US %{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR} -DATE_EU %{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR} -ISO8601_TIMEZONE (?:Z|[+-]%{HOUR}(?::?%{MINUTE})) -ISO8601_SECOND (?:%{SECOND}|60) -TIMESTAMP_ISO8601 %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}? -DATE %{DATE_US}|%{DATE_EU} -DATESTAMP %{DATE}[- ]%{TIME} -TZ (?:[APMCE][SD]T|UTC) -DATESTAMP_RFC822 %{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ} -DATESTAMP_RFC2822 %{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{TIME} %{ISO8601_TIMEZONE} -DATESTAMP_OTHER %{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR} -DATESTAMP_EVENTLOG %{YEAR}%{MONTHNUM2}%{MONTHDAY}%{HOUR}%{MINUTE}%{SECOND} - -# Syslog Dates: Month Day HH:MM:SS -SYSLOGTIMESTAMP %{MONTH} +%{MONTHDAY} %{TIME} -PROG [\x21-\x5a\x5c\x5e-\x7e]+ -SYSLOGPROG %{PROG:program}(?:\[%{POSINT:pid}\])? -SYSLOGHOST %{IPORHOST} -SYSLOGFACILITY <%{NONNEGINT:facility}.%{NONNEGINT:priority}> -HTTPDATE %{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{INT} - -# Shortcuts -QS %{QUOTEDSTRING} - -# Log formats -SYSLOGBASE %{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}: - -# Log Levels -LOGLEVEL ([Aa]lert|ALERT|[Tt]race|TRACE|[Dd]ebug|DEBUG|[Nn]otice|NOTICE|[Ii]nfo|INFO|[Ww]arn?(?:ing)?|WARN?(?:ING)?|[Ee]rr?(?:or)?|ERR?(?:OR)?|[Cc]rit?(?:ical)?|CRIT?(?:ICAL)?|[Ff]atal|FATAL|[Ss]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?) 
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-modules/gobblin-grok/build.gradle ---------------------------------------------------------------------- diff --git a/gobblin-modules/gobblin-grok/build.gradle b/gobblin-modules/gobblin-grok/build.gradle new file mode 100644 index 0000000..bbc4b9e --- /dev/null +++ b/gobblin-modules/gobblin-grok/build.gradle @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +apply plugin: 'java' + +dependencies { + compile project(":gobblin-api") + compile project(":gobblin-core-base") + compile project(":gobblin-data-management") + + compile externalDependency.grok + compile externalDependency.slf4j + + testCompile externalDependency.testng +} + +ext.classification="library" + http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-modules/gobblin-grok/src/main/java/org/apache/gobblin/converter/grok/GrokToJsonConverter.java ---------------------------------------------------------------------- diff --git a/gobblin-modules/gobblin-grok/src/main/java/org/apache/gobblin/converter/grok/GrokToJsonConverter.java b/gobblin-modules/gobblin-grok/src/main/java/org/apache/gobblin/converter/grok/GrokToJsonConverter.java new file mode 100644 index 0000000..1568eb7 --- /dev/null +++ b/gobblin-modules/gobblin-grok/src/main/java/org/apache/gobblin/converter/grok/GrokToJsonConverter.java @@ -0,0 +1,219 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gobblin.converter.grok; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.util.List; +import java.util.regex.Pattern; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonNull; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; + +import io.thekraken.grok.api.Grok; +import io.thekraken.grok.api.Match; +import io.thekraken.grok.api.exception.GrokException; + +import org.apache.gobblin.configuration.WorkUnitState; +import org.apache.gobblin.converter.Converter; +import org.apache.gobblin.converter.DataConversionException; +import org.apache.gobblin.converter.SchemaConversionException; +import org.apache.gobblin.converter.SingleRecordIterable; +import org.apache.gobblin.util.DatasetFilterUtils; + + +/** + * GrokToJsonConverter accepts an already deserialized text row (a String) as its input record. + * + * Converts Text to JSON based on Grok pattern. Schema is represented in the form of a JsonArray, the same interface used by CsvToJsonConverter. + * Each text record is represented by a String. + * The converter only supports Grok patterns where every group is named because it uses the group names as column names. 
+ * + * The following config properties can be set: + * The grok pattern to use for the conversion: + * converter.grokToJsonConverter.grokPattern ="^%{IPORHOST:clientip} (?:-|%{USER:ident}) (?:-|%{USER:auth}) \[%{HTTPDATE:timestamp}\] \"(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|-)\" %{NUMBER:response} (?:-|%{NUMBER:bytes})" + * + * Path to the file which contains the base grok patterns which can be used in the converter's GROK pattern (if not set it will use the default ones): + * converter.grokToJsonConverter.baseGrokPatternsFile= + ** + * Specify a comma separated list of regexes which will be applied on the fields and matched one will be converted to json null: + * converter.grokToJsonConverter.nullStringRegexes="[-\s]" + * + * Example of schema: + * [ + { + "columnName": "Day", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "string" + } + }, + { + "columnName": "Pageviews", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "long" + } + } + ] + */ +public class GrokToJsonConverter extends Converter<String, JsonArray, String, JsonObject> { + + private static final Logger LOG = LoggerFactory.getLogger(GrokToJsonConverter.class); + private static final JsonParser JSON_PARSER = new JsonParser(); + private static final String COLUMN_NAME_KEY = "columnName"; + private static final String DATA_TYPE = "dataType"; + private static final String TYPE_KEY = "type"; + private static final String NULLABLE = "isNullable"; + + public static final String GROK_PATTERN = "converter.grokToJsonConverter.grokPattern"; + public static final String BASE_PATTERNS_FILE = "converter.grokToJsonConverter.baseGrokPatternsFile"; + public static final String NULLSTRING_REGEXES = "converter.grokToJsonConverter.nullStringRegexes"; + + public static final String DEFAULT_GROK_PATTERNS_FILE = "/grok/grok-patterns"; + + private List<Pattern> nullStringRegexes; + + private Grok grok; + + @Override + public Converter<String, JsonArray, 
String, JsonObject> init(WorkUnitState workUnit) { + super.init(workUnit); + String pattern = workUnit.getProp(GROK_PATTERN); + String patternsFile = workUnit.getProp(BASE_PATTERNS_FILE); + this.nullStringRegexes = DatasetFilterUtils.getPatternsFromStrings(workUnit.getPropAsList(NULLSTRING_REGEXES, "")); + + InputStreamReader grokPatterns; + try { + if (patternsFile == null) { + grokPatterns = new InputStreamReader(getClass().getResourceAsStream("/grok/grok-base-patterns"), "UTF8"); + } else { + grokPatterns = new InputStreamReader(new FileInputStream(patternsFile), "UTF8"); + } + grok = new Grok(); + grok.addPatternFromReader(grokPatterns); + grok.compile(pattern); + } catch (GrokException | FileNotFoundException | UnsupportedEncodingException e) { + throw new RuntimeException("Error initializing GROK: " + e); + } + + return this; + } + + @Override + public JsonArray convertSchema(String inputSchema, WorkUnitState workUnit) + throws SchemaConversionException { + Preconditions.checkNotNull(inputSchema, "inputSchema is required."); + return JSON_PARSER.parse(inputSchema).getAsJsonArray(); + } + + /** + * Converts Text (String) to JSON based on a Grok regexp expression. + * By default, fields between Text and JSON are mapped by Grok SEMANTIC which is the identifier you give to the piece of text being matched in your Grok expression. 
+ * + * + * e.g: + * {@inheritDoc} + * @see Converter#convertRecord(Object, Object, WorkUnitState) + */ + @Override + public Iterable<JsonObject> convertRecord(JsonArray outputSchema, String inputRecord, WorkUnitState workUnit) + throws DataConversionException { + + JsonObject outputRecord = createOutput(outputSchema, inputRecord); + + LOG.debug("Converted into " + outputRecord); + + return new SingleRecordIterable<JsonObject>(outputRecord); + } + + @VisibleForTesting + JsonObject createOutput(JsonArray outputSchema, String inputRecord) + throws DataConversionException { + JsonObject outputRecord = new JsonObject(); + + Match gm = grok.match(inputRecord); + gm.captures(); + + JsonElement capturesJson = JSON_PARSER.parse(gm.toJson()); + + for (JsonElement anOutputSchema : outputSchema) { + JsonObject outputSchemaJsonObject = anOutputSchema.getAsJsonObject(); + String key = outputSchemaJsonObject.get(COLUMN_NAME_KEY).getAsString(); + String type = outputSchemaJsonObject.getAsJsonObject(DATA_TYPE).get(TYPE_KEY).getAsString(); + + if (isFieldNull(capturesJson, key)) { + if (!outputSchemaJsonObject.get(NULLABLE).getAsBoolean()) { + throw new DataConversionException( + "Field " + key + " is null or not exists but it is non-nullable by the schema."); + } + outputRecord.add(key, JsonNull.INSTANCE); + } else { + JsonElement jsonElement = capturesJson.getAsJsonObject().get(key); + switch (type) { + case "int": + outputRecord.addProperty(key, jsonElement.getAsInt()); + break; + case "long": + outputRecord.addProperty(key, jsonElement.getAsLong()); + break; + case "double": + outputRecord.addProperty(key, jsonElement.getAsDouble()); + break; + case "float": + outputRecord.addProperty(key, jsonElement.getAsFloat()); + break; + case "boolean": + outputRecord.addProperty(key, jsonElement.getAsBoolean()); + break; + case "string": + default: + outputRecord.addProperty(key, jsonElement.getAsString()); + } + } + } + return outputRecord; + } + + private boolean 
isFieldNull(JsonElement capturesJson, String key) { + JsonObject jsonObject = capturesJson.getAsJsonObject(); + + if (!jsonObject.has(key)) { + return true; + } + + for (Pattern pattern : this.nullStringRegexes) { + if (pattern.matcher(jsonObject.get(key).getAsString()).matches()) { + return true; + } + } + + return false; + } +} http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-modules/gobblin-grok/src/main/resources/grok/grok-base-patterns ---------------------------------------------------------------------- diff --git a/gobblin-modules/gobblin-grok/src/main/resources/grok/grok-base-patterns b/gobblin-modules/gobblin-grok/src/main/resources/grok/grok-base-patterns new file mode 100644 index 0000000..3793e02 --- /dev/null +++ b/gobblin-modules/gobblin-grok/src/main/resources/grok/grok-base-patterns @@ -0,0 +1,97 @@ +#Forked from https://github.com/logstash-plugins/logstash-patterns-core/blob/master/patterns/grok-patterns + +USERNAME [a-zA-Z0-9._-]+ +USER %{USERNAME} +EMAILLOCALPART [a-zA-Z][a-zA-Z0-9_.+-=:]+ +EMAILADDRESS %{EMAILLOCALPART}@%{HOSTNAME} +INT (?:[+-]?(?:[0-9]+)) +BASE10NUM (?<![0-9.+-])(?>[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+))) +NUMBER (?:%{BASE10NUM}) +BASE16NUM (?<![0-9A-Fa-f])(?:[+-]?(?:0x)?(?:[0-9A-Fa-f]+)) +BASE16FLOAT \b(?<![0-9A-Fa-f.])(?:[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+)))\b + +POSINT \b(?:[1-9][0-9]*)\b +NONNEGINT \b(?:[0-9]+)\b +WORD \b\w+\b +NOTSPACE \S+ +SPACE \s* +DATA .*? 
+GREEDYDATA .* +QUOTEDSTRING (?>(?<!\\)(?>"(?>\\.|[^\\"]+)+"|""|(?>'(?>\\.|[^\\']+)+')|''|(?>`(?>\\.|[^\\`]+)+`)|``)) +UUID [A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12} +# URN, allowing use of RFC 2141 section 2.3 reserved characters +URN urn:[0-9A-Za-z][0-9A-Za-z-]{0,31}:(?:%[0-9a-fA-F]{2}|[0-9A-Za-z()+,.:=@;$_!*'/?#-])+ + +# Networking +MAC (?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC}) +CISCOMAC (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4}) +WINDOWSMAC (?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2}) +COMMONMAC (?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2}) +IPV6 ((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5 ]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)? 
+IPV4 (?<![0-9])(?:(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5]))(?![0-9]) +IP (?:%{IPV6}|%{IPV4}) +HOSTNAME \b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b) +IPORHOST (?:%{IP}|%{HOSTNAME}) +HOSTPORT %{IPORHOST}:%{POSINT} + +# paths +PATH (?:%{UNIXPATH}|%{WINPATH}) +UNIXPATH (/([\w_%!$@:.,+~-]+|\\.)*)+ +TTY (?:/dev/(pts|tty([pq])?)(\w+)?/?(?:[0-9]+)) +WINPATH (?>[A-Za-z]+:|\\)(?:\\[^\\?*]*)+ +URIPROTO [A-Za-z]([A-Za-z0-9+\-.]+)+ +URIHOST %{IPORHOST}(?::%{POSINT:port})? +# uripath comes loosely from RFC1738, but mostly from what Firefox +# doesn't turn into %XX +URIPATH (?:/[A-Za-z0-9$.+!*'(){},~:;=@#%&_\-]*)+ +#URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)? +URIPARAM \?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\-\[\]<>]* +URIPATHPARAM %{URIPATH}(?:%{URIPARAM})? +URI %{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})? + +# Months: January, Feb, 3, 03, 12, December +MONTH \b(?:[Jj]an(?:uary|uar)?|[Ff]eb(?:ruary|ruar)?|[Mm](?:a|ä)?r(?:ch|z)?|[Aa]pr(?:il)?|[Mm]a(?:y|i)?|[Jj]un(?:e|i)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ep(?:tember)?|[Oo](?:c|k)?t(?:ober)?|[Nn]ov(?:ember)?|[Dd]e(?:c|z)(?:ember)?)\b +MONTHNUM (?:0?[1-9]|1[0-2]) +MONTHNUM2 (?:0[1-9]|1[0-2]) +MONTHDAY (?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9]) + +# Days: Monday, Tue, Thu, etc... +DAY (?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?) + +# Years? +YEAR (?>\d\d){1,2} +HOUR (?:2[0123]|[01]?[0-9]) +MINUTE (?:[0-5][0-9]) +# '60' is a leap second in most time standards and thus is valid. +SECOND (?:(?:[0-5]?[0-9]|60)(?:[:.,][0-9]+)?) 
+TIME (?!<[0-9])%{HOUR}:%{MINUTE}(?::%{SECOND})(?![0-9]) +# datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it) +DATE_US %{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR} +DATE_EU %{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR} +ISO8601_TIMEZONE (?:Z|[+-]%{HOUR}(?::?%{MINUTE})) +ISO8601_SECOND (?:%{SECOND}|60) +TIMESTAMP_ISO8601 %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}? +DATE %{DATE_US}|%{DATE_EU} +DATESTAMP %{DATE}[- ]%{TIME} +TZ (?:[APMCE][SD]T|UTC) +DATESTAMP_RFC822 %{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ} +DATESTAMP_RFC2822 %{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{TIME} %{ISO8601_TIMEZONE} +DATESTAMP_OTHER %{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR} +DATESTAMP_EVENTLOG %{YEAR}%{MONTHNUM2}%{MONTHDAY}%{HOUR}%{MINUTE}%{SECOND} + +# Syslog Dates: Month Day HH:MM:SS +SYSLOGTIMESTAMP %{MONTH} +%{MONTHDAY} %{TIME} +PROG [\x21-\x5a\x5c\x5e-\x7e]+ +SYSLOGPROG %{PROG:program}(?:\[%{POSINT:pid}\])? +SYSLOGHOST %{IPORHOST} +SYSLOGFACILITY <%{NONNEGINT:facility}.%{NONNEGINT:priority}> +HTTPDATE %{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{INT} + +# Shortcuts +QS %{QUOTEDSTRING} + +# Log formats +SYSLOGBASE %{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}: + +# Log Levels +LOGLEVEL ([Aa]lert|ALERT|[Tt]race|TRACE|[Dd]ebug|DEBUG|[Nn]otice|NOTICE|[Ii]nfo|INFO|[Ww]arn?(?:ing)?|WARN?(?:ING)?|[Ee]rr?(?:or)?|ERR?(?:OR)?|[Cc]rit?(?:ical)?|CRIT?(?:ICAL)?|[Ff]atal|FATAL|[Ss]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?) 
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-modules/gobblin-grok/src/test/java/org/apache/gobblin/converter/grok/GrokToJsonConverterTest.java ---------------------------------------------------------------------- diff --git a/gobblin-modules/gobblin-grok/src/test/java/org/apache/gobblin/converter/grok/GrokToJsonConverterTest.java b/gobblin-modules/gobblin-grok/src/test/java/org/apache/gobblin/converter/grok/GrokToJsonConverterTest.java new file mode 100644 index 0000000..3a4b78b --- /dev/null +++ b/gobblin-modules/gobblin-grok/src/test/java/org/apache/gobblin/converter/grok/GrokToJsonConverterTest.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.gobblin.converter.grok; + +import java.io.InputStreamReader; + +import org.testng.Assert; +import org.testng.annotations.Test; + +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; + +import gobblin.configuration.WorkUnitState; + +import org.apache.gobblin.converter.DataConversionException; + + +@Test(groups = {"gobblin.converter"}) +public class GrokToJsonConverterTest { + @Test + public void convertOutputWithNullableFields() + throws Exception { + JsonParser parser = new JsonParser(); + + String inputRecord = + "10.121.123.104 - - [01/Nov/2012:21:01:17 +0100] \"GET /cpc/auth.do?loginsetup=true&targetPage=%2Fcpc%2F HTTP/1.1\" 302 466"; + + JsonElement jsonElement = parser + .parse(new InputStreamReader(getClass().getResourceAsStream("/converter/grok/schemaWithNullableFields.json"))); + JsonArray outputSchema = jsonElement.getAsJsonArray(); + + GrokToJsonConverter grokToJsonConverter = new GrokToJsonConverter(); + WorkUnitState workUnitState = new WorkUnitState(); + workUnitState.setProp(GrokToJsonConverter.GROK_PATTERN, + "^%{IPORHOST:clientip} (?:-|%{USER:ident}) (?:-|%{USER:auth}) \\[%{HTTPDATE:timestamp}\\] \\\"(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|-)\\\" %{NUMBER:response} (?:-|%{NUMBER:bytes})"); + + grokToJsonConverter.init(workUnitState); + JsonObject actual = grokToJsonConverter.convertRecord(outputSchema, inputRecord, workUnitState).iterator().next(); + + JsonObject expected = + parser.parse(new InputStreamReader(getClass().getResourceAsStream("/converter/grok/convertedRecord.json"))) + .getAsJsonObject(); + Assert.assertEquals(actual, expected); + grokToJsonConverter.close(); + } + + @Test(expectedExceptions = DataConversionException.class) + public void convertOutputWithNonNullableFieldsShouldThrowDataConversionException() + throws Exception { + JsonParser parser = new JsonParser(); + + String inputRecord = 
+ "10.121.123.104 - - [01/Nov/2012:21:01:17 +0100] \"GET /cpc/auth.do?loginsetup=true&targetPage=%2Fcpc%2F HTTP/1.1\" 302 466"; + + JsonElement jsonElement = parser.parse( + new InputStreamReader(getClass().getResourceAsStream("/converter/grok/schemaWithNonNullableFields.json"))); + JsonArray outputSchema = jsonElement.getAsJsonArray(); + + GrokToJsonConverter grokToJsonConverter = new GrokToJsonConverter(); + WorkUnitState workUnitState = new WorkUnitState(); + workUnitState.setProp(GrokToJsonConverter.GROK_PATTERN, + "^%{IPORHOST:clientip} (?:-|%{USER:ident}) (?:-|%{USER:auth}) \\[%{HTTPDATE:timestamp}\\] \\\"(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|-)\\\" %{NUMBER:response} (?:-|%{NUMBER:bytes})"); + + grokToJsonConverter.init(workUnitState); + JsonObject actual = grokToJsonConverter.convertRecord(outputSchema, inputRecord, workUnitState).iterator().next(); + + JsonObject expected = + parser.parse(new InputStreamReader(getClass().getResourceAsStream("/converter/grok/convertedRecord.json"))) + .getAsJsonObject(); + grokToJsonConverter.close(); + } + + @Test + public void convertWithNullStringSet() + throws Exception { + JsonParser parser = new JsonParser(); + + String inputRecord = + "79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be mybucket [06/Feb/2014:00:00:38 +0000] 192.0.2.3 79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be 3E57427F3EXAMPLE REST.GET.VERSIONING - \"GET /mybucket?versioning HTTP/1.1\" 200 - 113 - 7 - \"-\" \"S3Console/0.4\" -"; + + JsonElement jsonElement = + parser.parse(new InputStreamReader(getClass().getResourceAsStream("/converter/grok/s3AccessLogSchema.json"))); + JsonArray outputSchema = jsonElement.getAsJsonArray(); + + GrokToJsonConverter grokToJsonConverter = new GrokToJsonConverter(); + WorkUnitState workUnitState = new WorkUnitState(); + //Grok expression was taken from https://github.com/logstash-plugins/logstash-patterns-core/blob/master/patterns/aws + 
workUnitState.setProp(GrokToJsonConverter.GROK_PATTERN, + "%{WORD:owner} %{NOTSPACE:bucket} \\[%{HTTPDATE:timestamp}\\] %{IP:clientip} %{NOTSPACE:requester} %{NOTSPACE:request_id} %{NOTSPACE:operation} %{NOTSPACE:key} (?:\"(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})\"|-) (?:%{INT:response:int}|-) (?:-|%{NOTSPACE:error_code}) (?:%{INT:bytes:int}|-) (?:%{INT:object_size:int}|-) (?:%{INT:request_time_ms:int}|-) (?:%{INT:turnaround_time_ms:int}|-) (?:%{QS:referrer}|-) (?:\"?%{QS:agent}\"?|-) (?:-|%{NOTSPACE:version_id})"); + workUnitState.setProp(GrokToJsonConverter.NULLSTRING_REGEXES, "[\\s-]"); + + grokToJsonConverter.init(workUnitState); + JsonObject actual = grokToJsonConverter.convertRecord(outputSchema, inputRecord, workUnitState).iterator().next(); + + JsonObject expected = parser + .parse(new InputStreamReader(getClass().getResourceAsStream("/converter/grok/convertedS3AccessLogRecord.json"))) + .getAsJsonObject(); + Assert.assertEquals(actual, expected); + grokToJsonConverter.close(); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/convertedRecord.json ---------------------------------------------------------------------- diff --git a/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/convertedRecord.json b/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/convertedRecord.json new file mode 100644 index 0000000..69f45f1 --- /dev/null +++ b/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/convertedRecord.json @@ -0,0 +1 @@ +{"clientip":"10.121.123.104","ident":null,"auth":null,"timestamp":"01/Nov/2012:21:01:17 +0100","request":"/cpc/auth.do?loginsetup=true&targetPage=%2Fcpc%2F","httpversion":1.1,"response":302,"bytes":466} \ No newline at end of file 
http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/convertedS3AccessLogRecord.json ---------------------------------------------------------------------- diff --git a/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/convertedS3AccessLogRecord.json b/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/convertedS3AccessLogRecord.json new file mode 100644 index 0000000..29da948 --- /dev/null +++ b/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/convertedS3AccessLogRecord.json @@ -0,0 +1 @@ +{"owner":"79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be","bucket":"mybucket","timestamp":"06/Feb/2014:00:00:38 +0000","clientip":"192.0.2.3","requester":"79a59df900b949e55d96a1e698fbacedfd6e09d98eacf8f8d5218e7cd47ef2be","request_id":"3E57427F3EXAMPLE","operation":"REST.GET.VERSIONING","key":null,"response":200,"error_code":null,"bytes":113,"object_size":null,"request_time_ms":7,"turnaround_time_ms":null,"referrer":null,"agent":"S3Console/0.4","version_id":null} http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/s3AccessLogSchema.json ---------------------------------------------------------------------- diff --git a/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/s3AccessLogSchema.json b/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/s3AccessLogSchema.json new file mode 100644 index 0000000..934399c --- /dev/null +++ b/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/s3AccessLogSchema.json @@ -0,0 +1,138 @@ +[ + { + "columnName": "owner", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "string" + } + }, + { + "columnName": "bucket", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "string" + } + }, + { + "columnName": "timestamp", + "comment": "", + "isNullable": "true", + 
"dataType": { + "type": "string" + } + }, + { + "columnName": "clientip", + "comment": "", + "isNullable": "false", + "dataType": { + "type": "string" + } + }, + { + "columnName": "requester", + "comment": "", + "isNullable": "false", + "dataType": { + "type": "string" + } + }, + { + "columnName": "request_id", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "string" + } + }, + { + "columnName": "operation", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "string" + } + }, + { + "columnName": "key", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "string" + } + }, + { + "columnName": "response", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "int" + } + }, + { + "columnName": "error_code", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "int" + } + }, + { + "columnName": "bytes", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "int" + } + }, + { + "columnName": "object_size", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "int" + } + }, + { + "columnName": "request_time_ms", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "int" + } + }, + { + "columnName": "turnaround_time_ms", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "int" + } + }, + { + "columnName": "referrer", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "string" + } + }, + { + "columnName": "agent", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "string" + } + }, + { + "columnName": "version_id", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "string" + } + } +] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/schemaWithNonNullableFields.json ---------------------------------------------------------------------- diff --git 
a/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/schemaWithNonNullableFields.json b/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/schemaWithNonNullableFields.json new file mode 100644 index 0000000..a1ce180 --- /dev/null +++ b/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/schemaWithNonNullableFields.json @@ -0,0 +1,66 @@ +[ + { + "columnName": "clientip", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "string" + } + }, + { + "columnName": "ident", + "comment": "", + "isNullable": "false", + "dataType": { + "type": "string" + } + }, + { + "columnName": "auth", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "string" + } + }, + { + "columnName": "timestamp", + "comment": "", + "isNullable": "false", + "dataType": { + "type": "string" + } + }, + { + "columnName": "request", + "comment": "", + "isNullable": "false", + "dataType": { + "type": "string" + } + }, + { + "columnName": "httpversion", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "double" + } + }, + { + "columnName": "response", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "int" + } + }, + { + "columnName": "bytes", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "int" + } + } +] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/schemaWithNullableFields.json ---------------------------------------------------------------------- diff --git a/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/schemaWithNullableFields.json b/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/schemaWithNullableFields.json new file mode 100644 index 0000000..b8b0536 --- /dev/null +++ b/gobblin-modules/gobblin-grok/src/test/resources/converter/grok/schemaWithNullableFields.json @@ -0,0 +1,66 @@ +[ + { + "columnName": "clientip", + "comment": "", + 
"isNullable": "true", + "dataType": { + "type": "string" + } + }, + { + "columnName": "ident", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "string" + } + }, + { + "columnName": "auth", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "string" + } + }, + { + "columnName": "timestamp", + "comment": "", + "isNullable": "false", + "dataType": { + "type": "string" + } + }, + { + "columnName": "request", + "comment": "", + "isNullable": "false", + "dataType": { + "type": "string" + } + }, + { + "columnName": "httpversion", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "double" + } + }, + { + "columnName": "response", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "int" + } + }, + { + "columnName": "bytes", + "comment": "", + "isNullable": "true", + "dataType": { + "type": "int" + } + } +] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/791306b8/gobblin-modules/gobblin-grok/src/test/resources/grok/grok-patterns ---------------------------------------------------------------------- diff --git a/gobblin-modules/gobblin-grok/src/test/resources/grok/grok-patterns b/gobblin-modules/gobblin-grok/src/test/resources/grok/grok-patterns new file mode 100644 index 0000000..3793e02 --- /dev/null +++ b/gobblin-modules/gobblin-grok/src/test/resources/grok/grok-patterns @@ -0,0 +1,97 @@ +#Forked from https://github.com/logstash-plugins/logstash-patterns-core/blob/master/patterns/grok-patterns + +USERNAME [a-zA-Z0-9._-]+ +USER %{USERNAME} +EMAILLOCALPART [a-zA-Z][a-zA-Z0-9_.+-=:]+ +EMAILADDRESS %{EMAILLOCALPART}@%{HOSTNAME} +INT (?:[+-]?(?:[0-9]+)) +BASE10NUM (?<![0-9.+-])(?>[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+))) +NUMBER (?:%{BASE10NUM}) +BASE16NUM (?<![0-9A-Fa-f])(?:[+-]?(?:0x)?(?:[0-9A-Fa-f]+)) +BASE16FLOAT \b(?<![0-9A-Fa-f.])(?:[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+)))\b + +POSINT \b(?:[1-9][0-9]*)\b +NONNEGINT \b(?:[0-9]+)\b 
+WORD \b\w+\b +NOTSPACE \S+ +SPACE \s* +DATA .*? +GREEDYDATA .* +QUOTEDSTRING (?>(?<!\\)(?>"(?>\\.|[^\\"]+)+"|""|(?>'(?>\\.|[^\\']+)+')|''|(?>`(?>\\.|[^\\`]+)+`)|``)) +UUID [A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12} +# URN, allowing use of RFC 2141 section 2.3 reserved characters +URN urn:[0-9A-Za-z][0-9A-Za-z-]{0,31}:(?:%[0-9a-fA-F]{2}|[0-9A-Za-z()+,.:=@;$_!*'/?#-])+ + +# Networking +MAC (?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC}) +CISCOMAC (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4}) +WINDOWSMAC (?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2}) +COMMONMAC (?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2}) +IPV6 ((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)? 
+IPV4 (?<![0-9])(?:(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5]))(?![0-9]) +IP (?:%{IPV6}|%{IPV4}) +HOSTNAME \b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b) +IPORHOST (?:%{IP}|%{HOSTNAME}) +HOSTPORT %{IPORHOST}:%{POSINT} + +# paths +PATH (?:%{UNIXPATH}|%{WINPATH}) +UNIXPATH (/([\w_%!$@:.,+~-]+|\\.)*)+ +TTY (?:/dev/(pts|tty([pq])?)(\w+)?/?(?:[0-9]+)) +WINPATH (?>[A-Za-z]+:|\\)(?:\\[^\\?*]*)+ +URIPROTO [A-Za-z]([A-Za-z0-9+\-.]+)+ +URIHOST %{IPORHOST}(?::%{POSINT:port})? +# uripath comes loosely from RFC1738, but mostly from what Firefox +# doesn't turn into %XX +URIPATH (?:/[A-Za-z0-9$.+!*'(){},~:;=@#%&_\-]*)+ +#URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)? +URIPARAM \?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\-\[\]<>]* +URIPATHPARAM %{URIPATH}(?:%{URIPARAM})? +URI %{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})? + +# Months: January, Feb, 3, 03, 12, December +MONTH \b(?:[Jj]an(?:uary|uar)?|[Ff]eb(?:ruary|ruar)?|[Mm](?:a|ä)?r(?:ch|z)?|[Aa]pr(?:il)?|[Mm]a(?:y|i)?|[Jj]un(?:e|i)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ep(?:tember)?|[Oo](?:c|k)?t(?:ober)?|[Nn]ov(?:ember)?|[Dd]e(?:c|z)(?:ember)?)\b +MONTHNUM (?:0?[1-9]|1[0-2]) +MONTHNUM2 (?:0[1-9]|1[0-2]) +MONTHDAY (?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9]) + +# Days: Monday, Tue, Thu, etc... +DAY (?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?) + +# Years? +YEAR (?>\d\d){1,2} +HOUR (?:2[0123]|[01]?[0-9]) +MINUTE (?:[0-5][0-9]) +# '60' is a leap second in most time standards and thus is valid. +SECOND (?:(?:[0-5]?[0-9]|60)(?:[:.,][0-9]+)?) 
+TIME (?!<[0-9])%{HOUR}:%{MINUTE}(?::%{SECOND})(?![0-9]) +# datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it) +DATE_US %{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR} +DATE_EU %{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR} +ISO8601_TIMEZONE (?:Z|[+-]%{HOUR}(?::?%{MINUTE})) +ISO8601_SECOND (?:%{SECOND}|60) +TIMESTAMP_ISO8601 %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}? +DATE %{DATE_US}|%{DATE_EU} +DATESTAMP %{DATE}[- ]%{TIME} +TZ (?:[APMCE][SD]T|UTC) +DATESTAMP_RFC822 %{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ} +DATESTAMP_RFC2822 %{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{TIME} %{ISO8601_TIMEZONE} +DATESTAMP_OTHER %{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR} +DATESTAMP_EVENTLOG %{YEAR}%{MONTHNUM2}%{MONTHDAY}%{HOUR}%{MINUTE}%{SECOND} + +# Syslog Dates: Month Day HH:MM:SS +SYSLOGTIMESTAMP %{MONTH} +%{MONTHDAY} %{TIME} +PROG [\x21-\x5a\x5c\x5e-\x7e]+ +SYSLOGPROG %{PROG:program}(?:\[%{POSINT:pid}\])? +SYSLOGHOST %{IPORHOST} +SYSLOGFACILITY <%{NONNEGINT:facility}.%{NONNEGINT:priority}> +HTTPDATE %{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{INT} + +# Shortcuts +QS %{QUOTEDSTRING} + +# Log formats +SYSLOGBASE %{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}: + +# Log Levels +LOGLEVEL ([Aa]lert|ALERT|[Tt]race|TRACE|[Dd]ebug|DEBUG|[Nn]otice|NOTICE|[Ii]nfo|INFO|[Ww]arn?(?:ing)?|WARN?(?:ING)?|[Ee]rr?(?:or)?|ERR?(?:OR)?|[Cc]rit?(?:ical)?|CRIT?(?:ICAL)?|[Ff]atal|FATAL|[Ss]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?) \ No newline at end of file
