paul-rogers commented on a change in pull request #1114: Drill-6104: Added Logfile Reader URL: https://github.com/apache/drill/pull/1114#discussion_r200862441
########## File path: exec/java-exec/src/test/java/org/apache/drill/exec/store/log/TestLogReader.java ########## @@ -0,0 +1,368 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.drill.exec.store.log; + +import org.apache.drill.common.exceptions.ExecutionSetupException; +import org.apache.drill.common.types.TypeProtos.MinorType; +import org.apache.drill.exec.record.BatchSchema; +import org.apache.drill.exec.rpc.RpcException; +import org.apache.drill.exec.server.Drillbit; +import org.apache.drill.exec.store.StoragePluginRegistry; +import org.apache.drill.exec.store.dfs.FileSystemConfig; +import org.apache.drill.exec.store.dfs.FileSystemPlugin; +import org.apache.drill.test.BaseDirTestWatcher; +import org.apache.drill.test.ClusterFixture; +import org.apache.drill.test.ClusterTest; +import org.apache.drill.test.rowSet.RowSet; +import org.apache.drill.test.rowSet.RowSetComparison; +import org.apache.drill.test.rowSet.schema.SchemaBuilder; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; + +import java.util.ArrayList; + +import static org.junit.Assert.assertEquals; + +public class TestLogReader extends ClusterTest { + + public static final String 
DATE_ONLY_PATTERN = "(\\d\\d\\d\\d)-(\\d\\d)-(\\d\\d) .*"; + + @ClassRule + public static final BaseDirTestWatcher dirTestWatcher = new BaseDirTestWatcher(); + + @BeforeClass + public static void setup() throws Exception { + ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher)); + + // Define a regex format config for testing. + + defineRegexPlugin(); + } + + private static void defineRegexPlugin() throws ExecutionSetupException { + + // Create an instance of the regex config. + // Note: we can't use the ".log" extension; the Drill .gitignore + // file ignores such files, so they'll never get committed. Instead, + // make up a fake suffix. + + LogFormatConfig sampleConfig = new LogFormatConfig(); + sampleConfig.extension = "log1"; + sampleConfig.regex = DATE_ONLY_PATTERN; + + sampleConfig.schema = new ArrayList<LogFormatField>(); + sampleConfig.schema.add( new LogFormatField("year","INT")); + sampleConfig.schema.add( new LogFormatField("month", "INT")); + sampleConfig.schema.add( new LogFormatField("day", "INT")); + + // Full Drill log parser definition. 
+ + LogFormatConfig logConfig = new LogFormatConfig(); + logConfig.extension = "log1"; + logConfig.regex = "(\\d\\d\\d\\d)-(\\d\\d)-(\\d\\d) " + + "(\\d\\d):(\\d\\d):(\\d\\d),\\d+ " + + "\\[([^]]*)] (\\w+)\\s+(\\S+) - (.*)"; + + logConfig.schema = new ArrayList<LogFormatField>(); + logConfig.schema.add( new LogFormatField("year","INT")); + logConfig.schema.add( new LogFormatField("month","INT")); + logConfig.schema.add( new LogFormatField("day","INT")); + logConfig.schema.add( new LogFormatField("hour","INT")); + logConfig.schema.add( new LogFormatField("minute","INT")); + logConfig.schema.add( new LogFormatField("second","INT")); + logConfig.schema.add( new LogFormatField("thread")); + logConfig.schema.add( new LogFormatField("level")); + logConfig.schema.add( new LogFormatField("module")); + logConfig.schema.add( new LogFormatField("message")); + + + //Set up additional configs to check the time/date formats + LogFormatConfig logDateConfig = new LogFormatConfig(); + logDateConfig.extension = "log2"; + logDateConfig.regex = "(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}),(\\d+)\\s\\[(\\w+)\\]\\s([A-Z]+)\\s(.+)"; + + logDateConfig.schema = new ArrayList<LogFormatField>(); + logDateConfig.schema.add( new LogFormatField( "entry_date", "TIMESTAMP", "yy-MM-dd hh:mm:ss")); + logDateConfig.schema.add( new LogFormatField( "pid", "INT")); + logDateConfig.schema.add( new LogFormatField( "location")); + logDateConfig.schema.add( new LogFormatField( "message_type")); + logDateConfig.schema.add( new LogFormatField( "message")); + + logDateConfig.maxErrors = 3; + + LogFormatConfig mysqlLogConfig = new LogFormatConfig(); + mysqlLogConfig.extension = "sqllog"; + mysqlLogConfig.regex = "(\\d{6})\\s(\\d{2}:\\d{2}:\\d{2})\\s+(\\d+)\\s(\\w+)\\s+(.+)"; + + + // Define a temporary format plugin for the "cp" storage plugin. 
+ Drillbit drillbit = cluster.drillbit(); + final StoragePluginRegistry pluginRegistry = drillbit.getContext().getStorage(); + final FileSystemPlugin plugin = (FileSystemPlugin) pluginRegistry.getPlugin("cp"); + final FileSystemConfig pluginConfig = (FileSystemConfig) plugin.getConfig(); + pluginConfig.formats.put("sample", sampleConfig); + pluginConfig.formats.put("drill-log", logConfig); + pluginConfig.formats.put("date-log",logDateConfig); + pluginConfig.formats.put( "mysql-log", mysqlLogConfig); + pluginRegistry.createOrUpdate("cp", pluginConfig, false); + + } + + @Test + public void testWildcard() throws RpcException { + String sql = "SELECT * FROM cp.`regex/simple.log1`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + + BatchSchema expectedSchema = new SchemaBuilder() + .addNullable("year", MinorType.INT) + .addNullable("month", MinorType.INT) + .addNullable("day", MinorType.INT) + .build(); + + RowSet expected = client.rowSetBuilder(expectedSchema) + .addRow(2017, 12, 17) + .addRow(2017, 12, 18) + .addRow(2017, 12, 19) + .build(); + + new RowSetComparison(expected).verifyAndClearAll(results); + } + + @Test + public void testExplicit() throws RpcException { + String sql = "SELECT `day`, `month` FROM cp.`regex/simple.log1`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + + BatchSchema expectedSchema = new SchemaBuilder() + .addNullable("day", MinorType.INT) + .addNullable("month", MinorType.INT) + .build(); + + RowSet expected = client.rowSetBuilder(expectedSchema) + .addRow(17, 12) + .addRow(18, 12) + .addRow(19, 12) + .build(); + +// results.print(); +// expected.print(); + new RowSetComparison(expected).verifyAndClearAll(results); + } + + @Test + public void testMissing() throws RpcException { + String sql = "SELECT `day`, `missing`, `month` FROM cp.`regex/simple.log1`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + + BatchSchema expectedSchema = new SchemaBuilder() + .addNullable("day", MinorType.INT) + 
.addNullable("missing", MinorType.VARCHAR) + .addNullable("month", MinorType.INT) + .build(); + + RowSet expected = client.rowSetBuilder(expectedSchema) + .addRow(17, null, 12) + .addRow(18, null, 12) + .addRow(19, null, 12) + .build(); + +// results.print(); +// expected.print(); + new RowSetComparison(expected).verifyAndClearAll(results); + } + + @Test + public void testRaw() throws RpcException { + String sql = "SELECT `_raw` FROM cp.`regex/simple.log1`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + + BatchSchema expectedSchema = new SchemaBuilder() + .addNullable("_raw", MinorType.VARCHAR) + .build(); + + RowSet expected = client.rowSetBuilder(expectedSchema) + .addRow("2017-12-17 10:52:41,820 [main] INFO o.a.d.e.e.f.FunctionImplementationRegistry - Function registry loaded. 459 functions loaded in 1396 ms.") + .addRow("2017-12-18 10:52:37,652 [main] INFO o.a.drill.common.config.DrillConfig - Configuration and plugin file(s) identified in 115ms.") + .addRow("2017-12-19 11:12:27,278 [main] ERROR o.apache.drill.exec.server.Drillbit - Failure during initial startup of Drillbit.") + .build(); + new RowSetComparison(expected).verifyAndClearAll(results); + } + + + @Test + public void testDate() throws RpcException { + String sql = "SELECT TYPEOF(`entry_date`) AS entry_date FROM cp.`regex/simple.log2` LIMIT 1"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + + BatchSchema expectedSchema = new SchemaBuilder() + .add("entry_date", MinorType.VARCHAR) + .build(); + + RowSet expected = client.rowSetBuilder(expectedSchema) + .addRow("TIMESTAMP") + .build(); + + new RowSetComparison(expected).verifyAndClearAll(results); + + } + + @Test + public void testCount() throws RpcException { + String sql = "SELECT COUNT(*) FROM cp.`regex/simple.log1`"; + long result = client.queryBuilder().sql(sql).singletonLong(); + assertEquals(3, result); + } + + @Test + public void testFull() throws RpcException { + String sql = "SELECT * FROM 
cp.`regex/simple.log1`"; + client.queryBuilder().sql(sql).printCsv(); + } + + //This section tests log queries without a defined schema + @Test + public void testStarQueryNoSchema() throws RpcException { + String sql = "SELECT * FROM cp.`regex/mysql.sqllog`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + + BatchSchema expectedSchema = new SchemaBuilder() + .addNullable("field_0", MinorType.VARCHAR) + .addNullable("field_1", MinorType.VARCHAR) + .addNullable("field_2", MinorType.VARCHAR) + .addNullable("field_3", MinorType.VARCHAR) + .addNullable("field_4", MinorType.VARCHAR) + .build(); + + RowSet expected = client.rowSetBuilder(expectedSchema) + .addRow("070823", "21:00:32", "1", "Connect", "root@localhost on test1") + .addRow("070823", "21:00:48", "1", "Query", "show tables") + .addRow("070823", "21:00:56", "1", "Query", "select * from category" ) + .addRow("070917", "16:29:01", "21", "Query","select * from location" ) + .addRow("070917", "16:29:12", "21", "Query","select * from location where id = 1 LIMIT 1" ) + .build(); + + //results.print(); + //expected.print(); + new RowSetComparison(expected).verifyAndClearAll(results); + } + + @Test + public void testAllFieldsQueryNoSchema() throws RpcException { + String sql = "SELECT field_0, field_1, field_2, field_3, field_4 FROM cp.`regex/mysql.sqllog`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + + BatchSchema expectedSchema = new SchemaBuilder() + .addNullable("field_0", MinorType.VARCHAR) + .addNullable("field_1", MinorType.VARCHAR) + .addNullable("field_2", MinorType.VARCHAR) + .addNullable("field_3", MinorType.VARCHAR) + .addNullable("field_4", MinorType.VARCHAR) + .build(); + + RowSet expected = client.rowSetBuilder(expectedSchema) + .addRow("070823", "21:00:32", "1", "Connect", "root@localhost on test1") + .addRow("070823", "21:00:48", "1", "Query", "show tables") + .addRow("070823", "21:00:56", "1", "Query", "select * from category" ) + .addRow("070917", "16:29:01", "21", 
"Query","select * from location" ) + .addRow("070917", "16:29:12", "21", "Query","select * from location where id = 1 LIMIT 1" ) + .build(); + + new RowSetComparison(expected).verifyAndClearAll(results); + } + + @Test + public void testSomeFieldsQueryNoSchema() throws RpcException { + String sql = "SELECT field_0, field_4 FROM cp.`regex/mysql.sqllog`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + + BatchSchema expectedSchema = new SchemaBuilder() + .addNullable("field_0", MinorType.VARCHAR) + .addNullable("field_4", MinorType.VARCHAR) + .build(); + + RowSet expected = client.rowSetBuilder(expectedSchema) + .addRow("070823", "root@localhost on test1") + .addRow("070823", "show tables") + .addRow("070823", "select * from category" ) + .addRow("070917", "select * from location" ) + .addRow("070917", "select * from location where id = 1 LIMIT 1" ) + .build(); + + new RowSetComparison(expected).verifyAndClearAll(results); + } + + @Test + public void testRawNoSchema() throws RpcException { + String sql = "SELECT _raw FROM cp.`regex/mysql.sqllog`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + + BatchSchema expectedSchema = new SchemaBuilder() + .addNullable("_raw", MinorType.VARCHAR) + .build(); + + RowSet expected = client.rowSetBuilder(expectedSchema) + .addRow("070823 21:00:32 1 Connect root@localhost on test1") + .addRow("070823 21:00:48 1 Query show tables") + .addRow("070823 21:00:56 1 Query select * from category" ) + .addRow("070917 16:29:01 21 Query select * from location" ) + .addRow("070917 16:29:12 21 Query select * from location where id = 1 LIMIT 1" ) + .build(); + + new RowSetComparison(expected).verifyAndClearAll(results); + } + + @Test + public void testUMNoSchema() throws RpcException { + String sql = "SELECT _unmatched_rows FROM cp.`regex/mysql.sqllog`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + + BatchSchema expectedSchema = new SchemaBuilder() + .addNullable("_unmatched_rows", MinorType.VARCHAR) + 
.build(); + + RowSet expected = client.rowSetBuilder(expectedSchema) + .addRow("dfadkfjaldkjafsdfjlksdjflksjdlkfjsldkfjslkjl") + .build(); + + new RowSetComparison(expected).verifyAndClearAll(results); + } + + @Test + public void testRawUMNoSchema() throws RpcException { + String sql = "SELECT _raw, _unmatched_rows FROM cp.`regex/mysql.sqllog`"; + RowSet results = client.queryBuilder().sql(sql).rowSet(); + + BatchSchema expectedSchema = new SchemaBuilder() + .addNullable("_raw", MinorType.VARCHAR) + .addNullable("_unmatched_rows", MinorType.VARCHAR) + .build(); + + RowSet expected = client.rowSetBuilder(expectedSchema) + .addRow("070823 21:00:32 1 Connect root@localhost on test1", null) + .addRow("070823 21:00:48 1 Query show tables", null) + .addRow("070823 21:00:56 1 Query select * from category", null ) + .addRow("070917 16:29:01 21 Query select * from location", null ) + .addRow("070917 16:29:12 21 Query select * from location where id = 1 LIMIT 1", null ) + .addRow( null, "dfadkfjaldkjafsdfjlksdjflksjdlkfjsldkfjslkjl") + .build(); + + new RowSetComparison(expected).verifyAndClearAll(results); + } + +} Review comment: Missing a test for a bogus type name, repeated field names and other similar error conditions. What else should be checked? Null or empty file extension? Negative minimum errors? min/max errors out of sync? What else? The way to think about this is: we only need to test those cases that we don't want the Drill team to have to fix in response to an escalation in a production system. If it is OK to wait to fix the issue until an escalation, we don't need to test it. (Now, I'm sure the Drill team will suggest that there are no such cases that are OK to fix in an escalation...) Here's how I think about this. For every field that the user might set, think of all possible values, write code to test for, and fail on, bad values, then write tests to prove that the bad values are rejected.
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
