[
https://issues.apache.org/jira/browse/TAJO-2179?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15443491#comment-15443491
]
ASF GitHub Bot commented on TAJO-2179:
--------------------------------------
Github user jihoonson commented on a diff in the pull request:
https://github.com/apache/tajo/pull/1046#discussion_r76532365
--- Diff:
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/regex/RegexLineDeserializer.java
---
@@ -0,0 +1,167 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.storage.regex;
+
+
+import io.netty.buffer.ByteBuf;
+import io.netty.util.CharsetUtil;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tajo.catalog.Column;
+import org.apache.tajo.catalog.Schema;
+import org.apache.tajo.catalog.TableMeta;
+import org.apache.tajo.datum.Datum;
+import org.apache.tajo.datum.NullDatum;
+import org.apache.tajo.exception.InvalidTablePropertyException;
+import org.apache.tajo.exception.TajoRuntimeException;
+import org.apache.tajo.plan.util.PlannerUtil;
+import org.apache.tajo.storage.FieldSerializerDeserializer;
+import org.apache.tajo.storage.StorageConstants;
+import org.apache.tajo.storage.Tuple;
+import org.apache.tajo.storage.text.TextFieldSerializerDeserializer;
+import org.apache.tajo.storage.text.TextLineDeserializer;
+import org.apache.tajo.storage.text.TextLineParsingError;
+import org.apache.tajo.storage.text.TextLineSerDe;
+
+import java.io.IOException;
+import java.nio.charset.CharsetDecoder;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class RegexLineDeserializer extends TextLineDeserializer {
+ private static final Log LOG =
LogFactory.getLog(RegexLineDeserializer.class);
+
+ private final CharsetDecoder decoder =
CharsetUtil.getDecoder(CharsetUtil.UTF_8);
+ private FieldSerializerDeserializer fieldSerDer;
+ private ByteBuf nullChars;
+
+ private int[] targetColumnIndexes;
+ private String inputRegex;
+ private Pattern inputPattern;
+ // Number of rows not matching the regex
+ private long unmatchedRows = 0;
+ private long nextUnmatchedRows = 1;
+ // Number of rows that match the regex but have missing groups.
+ private long partialMatchedRows = 0;
+ private long nextPartialMatchedRows = 1;
+
+ public RegexLineDeserializer(Schema schema, TableMeta meta, Column[]
projected) {
+ super(schema, meta);
+ targetColumnIndexes = PlannerUtil.getTargetIds(schema, projected);
+ }
+
+ @Override
+ public void init() {
+ fieldSerDer = new TextFieldSerializerDeserializer(meta);
+ fieldSerDer.init(schema);
+
+ // Read the configuration parameters
+ inputRegex = meta.getProperty(StorageConstants.TEXT_REGEX);
+ boolean inputRegexIgnoreCase = "true".equalsIgnoreCase(
+ meta.getProperty(StorageConstants.TEXT_REGEX_CASE_INSENSITIVE,
"false"));
+
+ // Parse the configuration parameters
+ if (inputRegex != null) {
+ inputPattern = Pattern.compile(inputRegex, Pattern.DOTALL
+ + (inputRegexIgnoreCase ? Pattern.CASE_INSENSITIVE : 0));
+ } else {
+ throw new TajoRuntimeException(new
InvalidTablePropertyException(StorageConstants.TEXT_REGEX,
+ "This table does not have serde property \"" +
StorageConstants.TEXT_REGEX + "\"!"));
+ }
+
+ if (nullChars != null) {
+ nullChars.release();
+ }
+ nullChars = TextLineSerDe.getNullChars(meta);
+ }
+
+
+ @Override
+ public void deserialize(final ByteBuf lineBuf, Tuple output) throws
IOException, TextLineParsingError {
+
+ if (lineBuf == null || targetColumnIndexes.length == 0) {
+ return;
+ }
+
+ String line = decoder.decode(lineBuf.nioBuffer(lineBuf.readerIndex(),
lineBuf.readableBytes())).toString();
+ int[] projection = targetColumnIndexes;
+
+ // Projection
+ int currentTarget = 0;
+ int currentIndex = 0;
+ Matcher m = inputPattern.matcher(line);
+
+ if (!m.matches()) {
+ unmatchedRows++;
+ if (unmatchedRows >= nextUnmatchedRows) {
+ nextUnmatchedRows *= 100;
+ // Report the row
+ LOG.warn("" + unmatchedRows + " unmatched rows are found: " +
line);
--- End diff --
Never mind. Printing which line did not match will be very helpful.
> Add a regular expression scanner and appender
> ---------------------------------------------
>
> Key: TAJO-2179
> URL: https://issues.apache.org/jira/browse/TAJO-2179
> Project: Tajo
> Issue Type: New Feature
> Components: Storage
> Reporter: Jinho Kim
> Assignee: Jinho Kim
> Fix For: 0.12.0
>
>
> Regex deserializer support is a highly desired feature for users who want to
> analyze log data. We also need to support this feature.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)