This is an automated email from the ASF dual-hosted git repository.
mblow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git
The following commit(s) were added to refs/heads/master by this push:
new c503ef8 [ASTERIXDB-3005][EXT]: Ignore byte order mark when reading
ext data
new 574595e Merge branch 'gerrit/neo'
c503ef8 is described below
commit c503ef8028d5786fee8031c4728d18ee081dcbd3
Author: Hussain Towaileb <[email protected]>
AuthorDate: Thu Jan 13 23:34:51 2022 +0300
[ASTERIXDB-3005][EXT]: Ignore byte order mark when reading ext data
Change-Id: Ic7a863097ec4a6adad018785011f0d26d540f2a5
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/14785
Tested-by: Hussain Towaileb <[email protected]>
Integration-Tests: Hussain Towaileb <[email protected]>
Reviewed-by: Hussain Towaileb <[email protected]>
Reviewed-by: Michael Blow <[email protected]>
---
.../external_dataset/ExternalDatasetTestUtils.java | 30 ++++++++++++++++++-
.../aws/AwsS3ExternalDatasetOnePartitionTest.java | 4 +++
.../aws/AwsS3ExternalDatasetTest.java | 14 ++++++++-
.../common/byte_order_mark/csv/test.000.ddl.sqlpp | 35 ++++++++++++++++++++++
.../byte_order_mark/csv/test.001.query.sqlpp | 23 ++++++++++++++
.../common/byte_order_mark/csv/test.099.ddl.sqlpp | 20 +++++++++++++
.../common/byte_order_mark/json/test.000.ddl.sqlpp | 34 +++++++++++++++++++++
.../byte_order_mark/json/test.001.query.sqlpp | 23 ++++++++++++++
.../common/byte_order_mark/json/test.099.ddl.sqlpp | 20 +++++++++++++
.../common/byte_order_mark/tsv/test.000.ddl.sqlpp | 35 ++++++++++++++++++++++
.../byte_order_mark/tsv/test.001.query.sqlpp | 23 ++++++++++++++
.../common/byte_order_mark/tsv/test.099.ddl.sqlpp | 20 +++++++++++++
.../common/byte_order_mark/csv/result.001.adm | 5 ++++
.../common/byte_order_mark/json/result.001.adm | 5 ++++
.../common/byte_order_mark/tsv/result.001.adm | 5 ++++
...stsuite_external_dataset_azure_blob_storage.xml | 20 +++++++++++++
.../runtimets/testsuite_external_dataset_s3.xml | 20 +++++++++++++
.../record/reader/stream/LineRecordReader.java | 6 ++++
.../reader/stream/QuotedLineRecordReader.java | 5 ++++
.../reader/stream/SemiStructuredRecordReader.java | 3 +-
.../external/util/ExternalDataConstants.java | 1 +
21 files changed, 348 insertions(+), 3 deletions(-)
diff --git
a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java
b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java
index f6b501a..094c1db 100644
---
a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java
+++
b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java
@@ -19,6 +19,7 @@
package org.apache.asterix.test.external_dataset;
import static
org.apache.asterix.test.external_dataset.BinaryFileConverterUtil.BINARY_GEN_BASEDIR;
+import static
org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.BOM_FILE_CONTAINER;
import static
org.apache.asterix.test.external_dataset.aws.AwsS3ExternalDatasetTest.FIXED_DATA_CONTAINER;
import java.io.BufferedWriter;
@@ -62,6 +63,7 @@ public class ExternalDatasetTestUtils {
private static Uploader playgroundDataLoader;
private static Uploader fixedDataLoader;
private static Uploader mixedDataLoader;
+ private static Uploader bomFileLoader;
protected TestCaseContext tcCtx;
@@ -95,10 +97,12 @@ public class ExternalDatasetTestUtils {
TSV_DATA_PATH = tsvDataPath;
}
- public static void setUploaders(Uploader playgroundDataLoader, Uploader
fixedDataLoader, Uploader mixedDataLoader) {
+ public static void setUploaders(Uploader playgroundDataLoader, Uploader
fixedDataLoader, Uploader mixedDataLoader,
+ Uploader bomFileLoader) {
ExternalDatasetTestUtils.playgroundDataLoader = playgroundDataLoader;
ExternalDatasetTestUtils.fixedDataLoader = fixedDataLoader;
ExternalDatasetTestUtils.mixedDataLoader = mixedDataLoader;
+ ExternalDatasetTestUtils.bomFileLoader = bomFileLoader;
}
/**
@@ -148,6 +152,30 @@ public class ExternalDatasetTestUtils {
fixedDataLoader.upload("lvl1/lvl2/5.json", path, true, false);
}
+ /**
+ * This bucket contains files that start with a byte order mark (BOM): U+FEFF
+ */
+ public static void prepareBomFileContainer() {
+ LOGGER.info("Loading bom files data to " + BOM_FILE_CONTAINER);
+
+ // Files data
+ bomFileLoader.upload("1.json", "\uFEFF{\"id\": 1, \"age\": 1}", false,
false);
+ bomFileLoader.upload("2.json", "\uFEFF{\"id\": 2, \"age\": 2}", false,
false);
+ bomFileLoader.upload("3.json", "\uFEFF{\"id\": 3, \"age\": 3}", false,
false);
+ bomFileLoader.upload("4.json", "\uFEFF{\"id\": 4, \"age\": 4}", false,
false);
+ bomFileLoader.upload("5.json", "\uFEFF{\"id\": 5, \"age\": 5}", false,
false);
+ bomFileLoader.upload("1.csv", "\uFEFF1,1", false, false);
+ bomFileLoader.upload("2.csv", "\uFEFF2,2", false, false);
+ bomFileLoader.upload("3.csv", "\uFEFF3,3", false, false);
+ bomFileLoader.upload("4.csv", "\uFEFF4,4", false, false);
+ bomFileLoader.upload("5.csv", "\uFEFF5,5", false, false);
+ bomFileLoader.upload("1.tsv", "\uFEFF1\t1", false, false);
+ bomFileLoader.upload("2.tsv", "\uFEFF2\t2", false, false);
+ bomFileLoader.upload("3.tsv", "\uFEFF3\t3", false, false);
+ bomFileLoader.upload("4.tsv", "\uFEFF4\t4", false, false);
+ bomFileLoader.upload("5.tsv", "\uFEFF5\t5", false, false);
+ }
+
public static void loadJsonFiles() {
String dataBasePath = JSON_DATA_PATH;
String definition = JSON_DEFINITION;
diff --git
a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetOnePartitionTest.java
b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetOnePartitionTest.java
index 8114873..6c07fab 100644
---
a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetOnePartitionTest.java
+++
b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetOnePartitionTest.java
@@ -46,6 +46,7 @@ public class AwsS3ExternalDatasetOnePartitionTest extends
AwsS3ExternalDatasetTe
PREPARE_BUCKET = AwsS3ExternalDatasetOnePartitionTest::prepareS3Bucket;
PREPARE_FIXED_DATA_BUCKET =
AwsS3ExternalDatasetOnePartitionTest::prepareFixedDataBucket;
PREPARE_MIXED_DATA_BUCKET =
AwsS3ExternalDatasetOnePartitionTest::prepareMixedDataBucket;
+ PREPARE_BOM_FILE_BUCKET =
AwsS3ExternalDatasetOnePartitionTest::prepareBomDataBucket;
return LangExecutionUtil.tests(ONLY_TESTS, SUITE_TESTS);
}
@@ -57,4 +58,7 @@ public class AwsS3ExternalDatasetOnePartitionTest extends
AwsS3ExternalDatasetTe
private static void prepareMixedDataBucket() {
}
+
+ private static void prepareBomDataBucket() {
+ }
}
diff --git
a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java
b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java
index 8035f5a..05b0d0b 100644
---
a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java
+++
b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java
@@ -89,6 +89,7 @@ public class AwsS3ExternalDatasetTest {
static Runnable PREPARE_BUCKET;
static Runnable PREPARE_FIXED_DATA_BUCKET;
static Runnable PREPARE_MIXED_DATA_BUCKET;
+ static Runnable PREPARE_BOM_FILE_BUCKET;
// Base directory paths for data files
private static final String JSON_DATA_PATH = joinPath("data", "json");
@@ -115,12 +116,15 @@ public class AwsS3ExternalDatasetTest {
public static final String PLAYGROUND_CONTAINER = "playground";
public static final String FIXED_DATA_CONTAINER = "fixed-data"; // Do not
use, has fixed data
public static final String INCLUDE_EXCLUDE_CONTAINER = "include-exclude";
+ public static final String BOM_FILE_CONTAINER = "bom-file-container";
public static final PutObjectRequest.Builder playgroundBuilder =
PutObjectRequest.builder().bucket(PLAYGROUND_CONTAINER);
public static final PutObjectRequest.Builder fixedDataBuilder =
PutObjectRequest.builder().bucket(FIXED_DATA_CONTAINER);
public static final PutObjectRequest.Builder includeExcludeBuilder =
PutObjectRequest.builder().bucket(INCLUDE_EXCLUDE_CONTAINER);
+ public static final PutObjectRequest.Builder bomFileContainerBuilder =
+ PutObjectRequest.builder().bucket(BOM_FILE_CONTAINER);
public AwsS3ExternalDatasetTest(TestCaseContext tcCtx) {
this.tcCtx = tcCtx;
@@ -158,6 +162,8 @@ public class AwsS3ExternalDatasetTest {
PREPARE_BUCKET = ExternalDatasetTestUtils::preparePlaygroundContainer;
PREPARE_FIXED_DATA_BUCKET =
ExternalDatasetTestUtils::prepareFixedDataContainer;
PREPARE_MIXED_DATA_BUCKET =
ExternalDatasetTestUtils::prepareMixedDataContainer;
+ PREPARE_BOM_FILE_BUCKET =
ExternalDatasetTestUtils::prepareBomFileContainer;
+
return LangExecutionUtil.tests(ONLY_TESTS, SUITE_TESTS);
}
@@ -199,15 +205,17 @@ public class AwsS3ExternalDatasetTest {
client.createBucket(CreateBucketRequest.builder().bucket(PLAYGROUND_CONTAINER).build());
client.createBucket(CreateBucketRequest.builder().bucket(FIXED_DATA_CONTAINER).build());
client.createBucket(CreateBucketRequest.builder().bucket(INCLUDE_EXCLUDE_CONTAINER).build());
+
client.createBucket(CreateBucketRequest.builder().bucket(BOM_FILE_CONTAINER).build());
LOGGER.info("Client created successfully");
// Create the bucket and upload some json files
setDataPaths(JSON_DATA_PATH, CSV_DATA_PATH, TSV_DATA_PATH);
setUploaders(AwsS3ExternalDatasetTest::loadPlaygroundData,
AwsS3ExternalDatasetTest::loadFixedData,
- AwsS3ExternalDatasetTest::loadMixedData);
+ AwsS3ExternalDatasetTest::loadMixedData,
AwsS3ExternalDatasetTest::loadBomData);
PREPARE_BUCKET.run();
PREPARE_FIXED_DATA_BUCKET.run();
PREPARE_MIXED_DATA_BUCKET.run();
+ PREPARE_BOM_FILE_BUCKET.run();
}
private static void loadPlaygroundData(String key, String content, boolean
fromFile, boolean gzipped) {
@@ -222,6 +230,10 @@ public class AwsS3ExternalDatasetTest {
client.putObject(includeExcludeBuilder.key(key).build(),
getRequestBody(content, fromFile, gzipped));
}
+ private static void loadBomData(String key, String content, boolean
fromFile, boolean gzipped) {
+ client.putObject(bomFileContainerBuilder.key(key).build(),
getRequestBody(content, fromFile, gzipped));
+ }
+
private static RequestBody getRequestBody(String content, boolean
fromFile, boolean gzipped) {
RequestBody body;
// Content is string
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.000.ddl.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.000.ddl.sqlpp
new file mode 100644
index 0000000..69e42c1
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.000.ddl.sqlpp
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+drop dataverse test if exists;
+create dataverse test;
+use test;
+
+drop type test if exists;
+create type test as { id: int, age: int };
+
+drop dataset test1 if exists;
+CREATE EXTERNAL DATASET test1(test) USING %adapter% (
+%template%,
+("container"="bom-file-container"),
+("format"="csv"),
+("include"="*.csv"),
+("header"=False),
+("null"="")
+);
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.001.query.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.001.query.sqlpp
new file mode 100644
index 0000000..5aa5580
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.001.query.sqlpp
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use test;
+
+select value test1 from test1 order by id asc;
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.099.ddl.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.099.ddl.sqlpp
new file mode 100644
index 0000000..548e632
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/csv/test.099.ddl.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+drop dataverse test if exists;
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.000.ddl.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.000.ddl.sqlpp
new file mode 100644
index 0000000..ad6513f
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.000.ddl.sqlpp
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+drop dataverse test if exists;
+create dataverse test;
+use test;
+
+drop type test if exists;
+create type test as open {
+};
+
+drop dataset test1 if exists;
+CREATE EXTERNAL DATASET test1(test) USING %adapter% (
+%template%,
+("container"="bom-file-container"),
+("format"="json"),
+("include"="*.json")
+);
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.001.query.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.001.query.sqlpp
new file mode 100644
index 0000000..5aa5580
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.001.query.sqlpp
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use test;
+
+select value test1 from test1 order by id asc;
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.099.ddl.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.099.ddl.sqlpp
new file mode 100644
index 0000000..548e632
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/json/test.099.ddl.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+drop dataverse test if exists;
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.000.ddl.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.000.ddl.sqlpp
new file mode 100644
index 0000000..956e835
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.000.ddl.sqlpp
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+drop dataverse test if exists;
+create dataverse test;
+use test;
+
+drop type test if exists;
+create type test as { id: int, age: int };
+
+drop dataset test1 if exists;
+CREATE EXTERNAL DATASET test1(test) USING %adapter% (
+%template%,
+("container"="bom-file-container"),
+("format"="tsv"),
+("include"="*.tsv"),
+("header"=False),
+("null"="")
+);
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.001.query.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.001.query.sqlpp
new file mode 100644
index 0000000..5aa5580
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.001.query.sqlpp
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use test;
+
+select value test1 from test1 order by id asc;
+
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.099.ddl.sqlpp
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.099.ddl.sqlpp
new file mode 100644
index 0000000..548e632
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/byte_order_mark/tsv/test.099.ddl.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+drop dataverse test if exists;
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/csv/result.001.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/csv/result.001.adm
new file mode 100644
index 0000000..19d10f6
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/csv/result.001.adm
@@ -0,0 +1,5 @@
+{ "id": 1, "age": 1 }
+{ "id": 2, "age": 2 }
+{ "id": 3, "age": 3 }
+{ "id": 4, "age": 4 }
+{ "id": 5, "age": 5 }
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/json/result.001.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/json/result.001.adm
new file mode 100644
index 0000000..19d10f6
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/json/result.001.adm
@@ -0,0 +1,5 @@
+{ "id": 1, "age": 1 }
+{ "id": 2, "age": 2 }
+{ "id": 3, "age": 3 }
+{ "id": 4, "age": 4 }
+{ "id": 5, "age": 5 }
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/tsv/result.001.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/tsv/result.001.adm
new file mode 100644
index 0000000..19d10f6
--- /dev/null
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/byte_order_mark/tsv/result.001.adm
@@ -0,0 +1,5 @@
+{ "id": 1, "age": 1 }
+{ "id": 2, "age": 2 }
+{ "id": 3, "age": 3 }
+{ "id": 4, "age": 4 }
+{ "id": 5, "age": 5 }
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_azure_blob_storage.xml
b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_azure_blob_storage.xml
index 8844368..2e1a6bf 100644
---
a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_azure_blob_storage.xml
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_azure_blob_storage.xml
@@ -364,4 +364,24 @@
</compilation-unit>
</test-case>
</test-group>
+ <test-group name="bom">
+ <test-case FilePath="external-dataset">
+ <compilation-unit name="common/byte_order_mark/json">
+ <placeholder name="adapter" value="AZUREBLOB" />
+ <output-dir compare="Text">common/byte_order_mark/json</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="external-dataset">
+ <compilation-unit name="common/byte_order_mark/csv">
+ <placeholder name="adapter" value="AZUREBLOB" />
+ <output-dir compare="Text">common/byte_order_mark/csv</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="external-dataset">
+ <compilation-unit name="common/byte_order_mark/tsv">
+ <placeholder name="adapter" value="AZUREBLOB" />
+ <output-dir compare="Text">common/byte_order_mark/tsv</output-dir>
+ </compilation-unit>
+ </test-case>
+ </test-group>
</test-suite>
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml
b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml
index 11a2fe7..bacc23b 100644
---
a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml
@@ -411,4 +411,24 @@
</compilation-unit>
</test-case>
</test-group>
+ <test-group name="bom">
+ <test-case FilePath="external-dataset">
+ <compilation-unit name="common/byte_order_mark/json">
+ <placeholder name="adapter" value="S3" />
+ <output-dir compare="Text">common/byte_order_mark/json</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="external-dataset">
+ <compilation-unit name="common/byte_order_mark/csv">
+ <placeholder name="adapter" value="S3" />
+ <output-dir compare="Text">common/byte_order_mark/csv</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="external-dataset">
+ <compilation-unit name="common/byte_order_mark/tsv">
+ <placeholder name="adapter" value="S3" />
+ <output-dir compare="Text">common/byte_order_mark/tsv</output-dir>
+ </compilation-unit>
+ </test-case>
+ </test-group>
</test-suite>
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/LineRecordReader.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/LineRecordReader.java
index 4b86142..db20d31 100644
---
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/LineRecordReader.java
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/LineRecordReader.java
@@ -18,6 +18,8 @@
*/
package org.apache.asterix.external.input.record.reader.stream;
+import static
org.apache.asterix.external.util.ExternalDataConstants.BYTE_ORDER_MARK;
+
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
@@ -121,6 +123,10 @@ public class LineRecordReader extends StreamRecordReader {
}
}
for (; bufferPosn < bufferLength; ++bufferPosn) { //search for
newline
+ if (inputBuffer[bufferPosn] == BYTE_ORDER_MARK) {
+ startPosn++;
+ continue;
+ }
if (inputBuffer[bufferPosn] == ExternalDataConstants.LF) {
newlineLength = (prevCharCR) ? 2 : 1;
++bufferPosn; // at next invocation proceed from
following byte
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
index 4c253bc..4433b49 100644
---
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
@@ -18,6 +18,7 @@
*/
package org.apache.asterix.external.input.record.reader.stream;
+import static
org.apache.asterix.external.util.ExternalDataConstants.BYTE_ORDER_MARK;
import static
org.apache.asterix.external.util.ExternalDataConstants.REC_ENDED_AT_EOF;
import java.io.IOException;
@@ -119,6 +120,10 @@ public class QuotedLineRecordReader extends
LineRecordReader {
boolean maybeInQuote = false;
for (; bufferPosn < bufferLength; ++bufferPosn) {
char ch = inputBuffer[bufferPosn];
+ if (ch == BYTE_ORDER_MARK) {
+ startPosn++;
+ continue;
+ }
// count lines here since we need to also count the lines
inside quotes
if (ch == ExternalDataConstants.LF || prevCharCR) {
lineNumber++;
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/SemiStructuredRecordReader.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/SemiStructuredRecordReader.java
index 0e23e46..2c31a0a 100644
---
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/SemiStructuredRecordReader.java
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/SemiStructuredRecordReader.java
@@ -18,6 +18,7 @@
*/
package org.apache.asterix.external.input.record.reader.stream;
+import static
org.apache.asterix.external.util.ExternalDataConstants.BYTE_ORDER_MARK;
import static
org.apache.asterix.external.util.ExternalDataConstants.CLOSING_BRACKET;
import static org.apache.asterix.external.util.ExternalDataConstants.COMMA;
import static org.apache.asterix.external.util.ExternalDataConstants.CR;
@@ -134,7 +135,7 @@ public class SemiStructuredRecordReader extends
StreamRecordReader {
lineNumber++;
}
isLastCharCR = c == CR;
- if (c == SPACE || c == TAB || c == LF || c == CR) {
+ if (c == SPACE || c == TAB || c == LF || c == CR || c ==
BYTE_ORDER_MARK) {
continue;
}
if (c == recordStart && state != State.NESTED_OBJECT) {
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
index b462bd9..89d1132 100644
---
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
@@ -255,6 +255,7 @@ public class ExternalDataConstants {
public static final char OPEN_BRACKET = '[';
public static final char CLOSING_BRACKET = ']';
public static final char COMMA = ',';
+ public static final char BYTE_ORDER_MARK = '\uFEFF';
/**
* Constant byte characters