[
https://issues.apache.org/jira/browse/BEAM-4626?focusedWorklogId=115617&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-115617
]
ASF GitHub Bot logged work on BEAM-4626:
----------------------------------------
Author: ASF GitHub Bot
Created on: 25/Jun/18 20:06
Start Date: 25/Jun/18 20:06
Worklog Time Spent: 10m
Work Description: kennknowles commented on a change in pull request
#5748: [BEAM-4626] SQL text tables of raw lines
URL: https://github.com/apache/beam/pull/5748#discussion_r197924869
##########
File path:
sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/text/TextTableProviderTest.java
##########
@@ -17,69 +17,229 @@
*/
package org.apache.beam.sdk.extensions.sql.meta.provider.text;
-import static org.apache.beam.sdk.schemas.Schema.toSchema;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-
-import com.alibaba.fastjson.JSONObject;
-import java.util.stream.Stream;
-import org.apache.beam.sdk.extensions.sql.BeamSqlTable;
-import org.apache.beam.sdk.extensions.sql.meta.Table;
+import static org.hamcrest.Matchers.containsInAnyOrder;
+import static org.junit.Assert.assertThat;
+
+import com.google.common.base.Charsets;
+import java.io.File;
+import java.nio.file.Files;
+import org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv;
+import org.apache.beam.sdk.extensions.sql.impl.rel.BeamSqlRelUtils;
import org.apache.beam.sdk.schemas.Schema;
-import org.apache.commons.csv.CSVFormat;
+import org.apache.beam.sdk.testing.PAssert;
+import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.transforms.MapElements;
+import org.apache.beam.sdk.util.BackOff;
+import org.apache.beam.sdk.util.NumberedShardedFile;
+import org.apache.beam.sdk.util.Sleeper;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.Row;
+import org.apache.beam.sdk.values.TypeDescriptors;
+import org.junit.Rule;
import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
-/** UnitTest for {@link TextTableProvider}. */
+/** Tests for {@link TextTableProvider}. */
public class TextTableProviderTest {
- private TextTableProvider provider = new TextTableProvider();
+ @Rule public TestPipeline pipeline = TestPipeline.create();
+
+ @Rule
+ public TemporaryFolder tempFolder =
+ new TemporaryFolder() {
+ @Override
+ protected void after() {}
+ };
+
+ private static final String SQL_CSV_SCHEMA = "(f_string VARCHAR, f_int INT)";
+ private static final Schema CSV_SCHEMA =
+ Schema.builder()
+ .addNullableField("f_string", Schema.FieldType.STRING)
+ .addNullableField("f_int", Schema.FieldType.INT32)
+ .build();
+
+ private static final Schema LINES_SCHEMA =
Schema.builder().addStringField("f_string").build();
+ private static final String SQL_LINES_SCHEMA = "(f_string VARCHAR)";
+
+ // Even though these have the same schema as LINES_SCHEMA, that is accidental; they exist for a
+ // different purpose, to test Excel CSV format that does not ignore empty lines
+ private static final Schema SINGLE_STRING_CSV_SCHEMA =
+ Schema.builder().addStringField("f_string").build();
+ private static final String SINGLE_STRING_SQL_SCHEMA = "(f_string VARCHAR)";
+
+ /**
+ * Tests {@code CREATE TABLE TYPE text} with no format reads a default CSV.
+ *
+ * <p>The default format ignores empty lines, so that is an important part of this test.
+ */
@Test
- public void testGetTableType() throws Exception {
- assertEquals("text", provider.getTableType());
+ public void testLegacyDefaultCsv() throws Exception {
+ Files.write(
+ tempFolder.newFile("test.csv").toPath(),
+ "hello,13\n\ngoodbye,42\n".getBytes(Charsets.UTF_8));
+
+ BeamSqlEnv env = BeamSqlEnv.inMemory(new TextTableProvider());
+ env.executeDdl(
+ String.format(
+ "CREATE TABLE test %s TYPE text LOCATION '%s/*'",
+ SQL_CSV_SCHEMA, tempFolder.getRoot()));
+
+ PCollection<Row> rows =
+ BeamSqlRelUtils.toPCollection(pipeline, env.parseQuery("SELECT * FROM
test"));
+
+ PAssert.that(rows)
+ .containsInAnyOrder(
+ Row.withSchema(CSV_SCHEMA).addValues("hello", 13).build(),
+ Row.withSchema(CSV_SCHEMA).addValues("goodbye", 42).build());
+ pipeline.run();
}
+ /**
+ * Tests {@code CREATE TABLE TYPE text} with a format other than "csv" or "lines" results in a
+ * CSV read of that format.
+ */
@Test
- public void testBuildBeamSqlTable() throws Exception {
- Table table = mockTable("hello", null);
- BeamSqlTable sqlTable = provider.buildBeamSqlTable(table);
+ public void testLegacyTdfCsv() throws Exception {
+ Files.write(
+ tempFolder.newFile("test.csv").toPath(),
+ "hello\t13\n\ngoodbye\t42\n".getBytes(Charsets.UTF_8));
- assertNotNull(sqlTable);
- assertTrue(sqlTable instanceof BeamTextCSVTable);
+ BeamSqlEnv env = BeamSqlEnv.inMemory(new TextTableProvider());
+ env.executeDdl(
+ String.format(
+ "CREATE TABLE test %s TYPE text LOCATION '%s/*' TBLPROPERTIES
'{\"format\":\"TDF\"}'",
Review comment:
@XuMingmin This is a test of TSV actually. We could register `TSV` as an
alias for `TDF` once we move to our own enum.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
Issue Time Tracking
-------------------
Worklog Id: (was: 115617)
Time Spent: 50m (was: 40m)
> Support text table format with a single column of the lines of the files
> ------------------------------------------------------------------------
>
> Key: BEAM-4626
> URL: https://issues.apache.org/jira/browse/BEAM-4626
> Project: Beam
> Issue Type: New Feature
> Components: dsl-sql
> Reporter: Kenneth Knowles
> Assignee: Kenneth Knowles
> Priority: Major
> Time Spent: 50m
> Remaining Estimate: 0h
>
> Today, SQL can read CSV and allows a {{format}} flag to control what CSV
> variant is used. But to do easy things and write pure SQL jobs it would be
> nice to just read the text file as a one-column table and do transformations
> in SQL.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)