[
https://issues.apache.org/jira/browse/HADOOP-16942?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
vijayant soni updated HADOOP-16942:
-----------------------------------
Description:
Using the S3A URL scheme while writing out data from Spark to S3 creates many
folder-level delete markers.
Writing the same data with the S3 URL scheme does not create any delete markers at all.
Spark - 2.4.4
Hadoop - 3.2.1
EMR version - 6.0.0
Write Mode - Append
{code:scala}
[hadoop@ip-192-0-161-212 ~]$ spark-shell
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use
setLogLevel(newLevel).
20/03/27 07:37:19 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive
is set, falling back to uploading libraries under SPARK_HOME.
Spark context Web UI available at http://ip-192-0-161-212.ec2.internal:4040
Spark context available as 'sc' (master = yarn, app id =
application_1585294390030_0003).
Spark session available as 'spark'.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.4.4
      /_/
Using Scala version 2.12.10 (OpenJDK 64-Bit Server VM, Java 1.8.0_242)
Type in expressions to have them evaluated.
Type :help for more information.
scala> val df = spark.sql("select 1 as a")
df: org.apache.spark.sql.DataFrame = [a: int]
scala> df.write.mode(org.apache.spark.sql.SaveMode.Append).save("s3://my-bucket/tmp/vijayant/test/s3/")
scala> df.write.mode(org.apache.spark.sql.SaveMode.Append).save("s3a://my-bucket/tmp/vijayant/test/s3a/")
scala>
{code}
Listing object versions and delete markers after the `s3` write (no delete markers are present):
{code:bash}
aws s3api list-object-versions --bucket my-bucket --prefix tmp/vijayant/test/s3/
{
"Versions": [
{
"LastModified": "2020-03-27T07:38:17.000Z",
"VersionId": "V06OzeE7j221Tq7keSGj8bveCYyJFIcf",
"ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"",
"StorageClass": "STANDARD",
"Key": "tmp/vijayant/test/s3/_SUCCESS",
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": true,
"Size": 0
},
{
"LastModified": "2020-03-27T07:38:16.000Z",
"VersionId": "dLYtHDugLhFIdw2YHLFmoFOxXkm.21Wo",
"ETag": "\"26e70a1e26c709e3e8498acd49cfaaa3-1\"",
"StorageClass": "STANDARD",
"Key":
"tmp/vijayant/test/s3/part-00000-9d9a8925-f119-415d-b547-b742396e2ca7-c000.snappy.parquet",
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": true,
"Size": 384
}
]
}
{code}
Listing object versions and delete markers after the `s3a` write:
{code:bash}
aws s3api list-object-versions --bucket my-bucket --prefix tmp/vijayant/test/s3a/
{
"DeleteMarkers": [
{
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": true,
"VersionId": "NJWRZMcb_eYYwCJh_isX4H1Ox6W362Wb",
"Key": "tmp/vijayant/test/s3a/",
"LastModified": "2020-03-27T07:39:11.000Z"
},
{
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": false,
"VersionId": "F0h0mLcVVwkMtcHxd95Hj7BACL4Up_Q9",
"Key": "tmp/vijayant/test/s3a/",
"LastModified": "2020-03-27T07:39:10.000Z"
},
{
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": false,
"VersionId": ".sBcE6cXeggekOnSgZ4n7pyCDHnsLERK",
"Key": "tmp/vijayant/test/s3a/",
"LastModified": "2020-03-27T07:39:10.000Z"
},
{
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": false,
"VersionId": "nzm39jiUPC4H0ZaS.5Shp0FYPnR8wNf9",
"Key": "tmp/vijayant/test/s3a/",
"LastModified": "2020-03-27T07:39:09.000Z"
},
{
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": false,
"VersionId": "BPM65R1HkZngPDYtDL3zPZYPw_G_m9Ic",
"Key": "tmp/vijayant/test/s3a/",
"LastModified": "2020-03-27T07:39:08.000Z"
},
{
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": true,
"VersionId": "LJt8_MVDOiD4UdgUqEMycxjvtinJlTNt",
"Key": "tmp/vijayant/test/s3a/_temporary/",
"LastModified": "2020-03-27T07:39:11.000Z"
},
{
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": false,
"VersionId": "RqunJTn8Od0PgFR4yu44PX4kL54k6EDv",
"Key": "tmp/vijayant/test/s3a/_temporary/",
"LastModified": "2020-03-27T07:39:09.000Z"
},
{
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": false,
"VersionId": "4vY8cnqUI5VJAk3VfEt_VD_KEczo3bmY",
"Key": "tmp/vijayant/test/s3a/_temporary/",
"LastModified": "2020-03-27T07:39:08.000Z"
},
{
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": true,
"VersionId": "ln47YYy.yiE.k70cvqvfgYCEQoYFnKQW",
"Key": "tmp/vijayant/test/s3a/_temporary/0/",
"LastModified": "2020-03-27T07:39:11.000Z"
},
{
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": false,
"VersionId": "5Bsrt7s1caM90mzGNgk0MsTU9q8UjTTA",
"Key": "tmp/vijayant/test/s3a/_temporary/0/",
"LastModified": "2020-03-27T07:39:09.000Z"
},
{
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": true,
"VersionId": "pN3HzDfnmqIqrMwAL2jqKEBkvoHZALor",
"Key": "tmp/vijayant/test/s3a/_temporary/0/_temporary/",
"LastModified": "2020-03-27T07:39:11.000Z"
},
{
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": false,
"VersionId": "wg91poO1KXReXxvsZHzZXrHR1IgIX8t2",
"Key": "tmp/vijayant/test/s3a/_temporary/0/_temporary/",
"LastModified": "2020-03-27T07:39:09.000Z"
},
{
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": true,
"VersionId": "cv5Noykq3sMilQqJXAH3E.N7qAWnIBx7",
"Key":
"tmp/vijayant/test/s3a/_temporary/0/_temporary/attempt_20200327073907_0001_m_000000_1/",
"LastModified": "2020-03-27T07:39:11.000Z"
},
{
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": false,
"VersionId": "6xzt9SxlCUJaOLD8krkE3yXfQU14rErX",
"Key":
"tmp/vijayant/test/s3a/_temporary/0/_temporary/attempt_20200327073907_0001_m_000000_1/",
"LastModified": "2020-03-27T07:39:09.000Z"
},
{
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": true,
"VersionId": "wGmJAo7x_gkLWAiHzxPGdPMVSus7Wcp1",
"Key":
"tmp/vijayant/test/s3a/_temporary/0/_temporary/attempt_20200327073907_0001_m_000000_1/part-00000-3923e1b1-406c-4202-b9a8-3bd7cb2d97b2-c000.snappy.parquet",
"LastModified": "2020-03-27T07:39:10.000Z"
}
],
"Versions": [
{
"LastModified": "2020-03-27T07:39:11.000Z",
"VersionId": "2py_ZXKl7yh6fwhzksAx8Os1BriDJCBb",
"ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"",
"StorageClass": "STANDARD",
"Key": "tmp/vijayant/test/s3a/_SUCCESS",
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": true,
"Size": 0
},
{
"LastModified": "2020-03-27T07:39:08.000Z",
"VersionId": "lDqTnLCqDYtjrOiY.V7E6AKTRQLKrqUT",
"ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"",
"StorageClass": "STANDARD",
"Key": "tmp/vijayant/test/s3a/_temporary/0/",
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": false,
"Size": 0
},
{
"LastModified": "2020-03-27T07:39:10.000Z",
"VersionId": "g.rGoTDdmrGrNjrLchvwz3jMmGePkgiD",
"ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"",
"StorageClass": "STANDARD",
"Key":
"tmp/vijayant/test/s3a/_temporary/0/_temporary/attempt_20200327073907_0001_m_000000_1/",
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": false,
"Size": 0
},
{
"LastModified": "2020-03-27T07:39:09.000Z",
"VersionId": ".ZCpY2UW4hRlbLL87dFUJRuk021Hyq8p",
"ETag": "\"3def7238a0858c17c62d7045290175cf\"",
"StorageClass": "STANDARD",
"Key":
"tmp/vijayant/test/s3a/_temporary/0/_temporary/attempt_20200327073907_0001_m_000000_1/part-00000-3923e1b1-406c-4202-b9a8-3bd7cb2d97b2-c000.snappy.parquet",
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": false,
"Size": 384
},
{
"LastModified": "2020-03-27T07:39:10.000Z",
"VersionId": "JSNjTDHSQqe9zSAV93bc6TXPuqA.vDJE",
"ETag": "\"3def7238a0858c17c62d7045290175cf\"",
"StorageClass": "STANDARD",
"Key":
"tmp/vijayant/test/s3a/part-00000-3923e1b1-406c-4202-b9a8-3bd7cb2d97b2-c000.snappy.parquet",
"Owner": {
"DisplayName": "sysops+stage",
"ID":
"08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
},
"IsLatest": true,
"Size": 384
}
]
}
{code}
This, in turn, makes listing objects slow, and we have even seen timeouts caused
by the large number of delete markers.
was:
Using S3A URL scheme while writing out data from Spark to S3 is creating many
folder level delete markers.
Writing the same with S3 URL scheme, does not create any delete markers at all.
Spark - 2.4.4
Hadoop - 3.2.1
EMR version - 6.0.0
{code:scala}
spark-shell
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 2.4.4
/_/
Using Scala version 2.12.10 (OpenJDK 64-Bit Server VM, Java 1.8.0_242)
Type in expressions to have them evaluated.
Type :help for more information.
scala> val df = spark.sql("select 1 as a")
df: org.apache.spark.sql.DataFrame = [a: int]
scala> df.show(false)
+---+
|a |
+---+
|1 |
+---+
scala> // Writing to S3 using s3
scala>
df.write.mode(org.apache.spark.sql.SaveMode.Overwrite).save("s3://my_bucket/tmp/vijayant/s3/")
scala> // Writing to S3 using s3a
scala>
df.write.mode(org.apache.spark.sql.SaveMode.Overwrite).save("s3a://my_bucket/tmp/vijayant/s3a/")
scala>
{code}
Getting delete markers from `s3` write
{code:bash}
aws s3api list-object-versions --bucket my_bucket --prefix tmp/vijayant/s3
{
"Versions": [
{
"LastModified": "2020-03-26T12:57:54.000Z",
"VersionId": "h7_SIsHYoC.1il2s4qporAFnVbLgiLN5",
"ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"",
"StorageClass": "STANDARD",
"Key": "tmp/vijayant/s3/_SUCCESS",
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": true,
"Size": 0
},
{
"LastModified": "2020-03-26T12:57:54.000Z",
"VersionId": "pOALzyzpBR7glCEk3cqPOR.u8QCIcLnC",
"ETag": "\"26e70a1e26c709e3e8498acd49cfaaa3-1\"",
"StorageClass": "STANDARD",
"Key":
"tmp/vijayant/s3/part-00000-9af16781-7944-497d-9b19-f31ab1e5f850-c000.snappy.parquet",
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": true,
"Size": 384
}
]
}
{code}
Getting delete markers from `s3a` write
{code:bash}
aws s3api list-object-versions --bucket my_bucket --prefix tmp/vijayant/s3a
{
"DeleteMarkers": [
{
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": true,
"VersionId": "Jd8PHlUK3TbVJY2RWQxi74a6.2Gp2mUL",
"Key": "tmp/vijayant/s3a/",
"LastModified": "2020-03-26T13:00:14.000Z"
},
{
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": false,
"VersionId": "j2llpkiXzEaomJr5xLhQ9xTmfoq_8dOy",
"Key": "tmp/vijayant/s3a/",
"LastModified": "2020-03-26T13:00:13.000Z"
},
{
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": false,
"VersionId": "ry6BqTUGvyY3U.eqFfgg2hJ2BBMxVcwH",
"Key": "tmp/vijayant/s3a/",
"LastModified": "2020-03-26T13:00:13.000Z"
},
{
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": false,
"VersionId": "xG2oNiCpovqWCSZxaWiqtL.E7znE7AmR",
"Key": "tmp/vijayant/s3a/",
"LastModified": "2020-03-26T13:00:12.000Z"
},
{
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": false,
"VersionId": "OedjkEU1VeWl0ZZouur.13dufhYa7JXm",
"Key": "tmp/vijayant/s3a/",
"LastModified": "2020-03-26T13:00:01.000Z"
},
{
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": true,
"VersionId": "Nn8O947GGwqCePelc9VL9O2sWsmSsy2i",
"Key": "tmp/vijayant/s3a/_temporary/",
"LastModified": "2020-03-26T13:00:13.000Z"
},
{
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": false,
"VersionId": "7DcZ4b3pmoIi_TuzoRsykdtzyUGDLUo9",
"Key": "tmp/vijayant/s3a/_temporary/",
"LastModified": "2020-03-26T13:00:12.000Z"
},
{
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": false,
"VersionId": "eFahQrYnglWeRHZHTod6IszSoNE3jPCH",
"Key": "tmp/vijayant/s3a/_temporary/",
"LastModified": "2020-03-26T13:00:01.000Z"
},
{
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": true,
"VersionId": "m4mGr.QA3sO0pQb_tuZEZX6OVIeprgwl",
"Key": "tmp/vijayant/s3a/_temporary/0/",
"LastModified": "2020-03-26T13:00:13.000Z"
},
{
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": false,
"VersionId": "2TBn8RzdMKzEqn6cP8O_CI9OdZkhvv53",
"Key": "tmp/vijayant/s3a/_temporary/0/",
"LastModified": "2020-03-26T13:00:12.000Z"
},
{
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": true,
"VersionId": "NXVubIX_eu9RYLDWpD4JH91VK08OmHwu",
"Key": "tmp/vijayant/s3a/_temporary/0/_temporary/",
"LastModified": "2020-03-26T13:00:13.000Z"
},
{
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": false,
"VersionId": "FtubGslxkfMiT5uxuuEorWsg0OIvXmzY",
"Key": "tmp/vijayant/s3a/_temporary/0/_temporary/",
"LastModified": "2020-03-26T13:00:12.000Z"
},
{
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": true,
"VersionId": "0tv9I0s1mvurxP4KX_Zgqr7P8OQ5bIs7",
"Key":
"tmp/vijayant/s3a/_temporary/0/_temporary/attempt_20200326130000_0002_m_000000_2/",
"LastModified": "2020-03-26T13:00:14.000Z"
},
{
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": false,
"VersionId": "y058RX4xXC.a_ltup_OxdI7S5o288h38",
"Key":
"tmp/vijayant/s3a/_temporary/0/_temporary/attempt_20200326130000_0002_m_000000_2/",
"LastModified": "2020-03-26T13:00:12.000Z"
},
{
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": true,
"VersionId": "T54GB8P1SVmWUu_9lXXogf970cYZOszE",
"Key":
"tmp/vijayant/s3a/_temporary/0/_temporary/attempt_20200326130000_0002_m_000000_2/part-00000-8ecb77ed-8279-4256-9ef1-5ea352318c1a-c000.snappy.parquet",
"LastModified": "2020-03-26T13:00:13.000Z"
}
],
"Versions": [
{
"LastModified": "2020-03-26T13:00:14.000Z",
"VersionId": "3HSpCqBQyrVoh9X1tTfskNEiQIet7f_0",
"ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"",
"StorageClass": "STANDARD",
"Key": "tmp/vijayant/s3a/_SUCCESS",
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": true,
"Size": 0
},
{
"LastModified": "2020-03-26T13:00:00.000Z",
"VersionId": "zB.ELKr2RcK9RgdSgx5wwj55YPlZTWD0",
"ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"",
"StorageClass": "STANDARD",
"Key": "tmp/vijayant/s3a/_temporary/0/",
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": false,
"Size": 0
},
{
"LastModified": "2020-03-26T13:00:13.000Z",
"VersionId": "XoIDfWRP0Y6DySn_FVkh3z.LCSCv1H4x",
"ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"",
"StorageClass": "STANDARD",
"Key":
"tmp/vijayant/s3a/_temporary/0/_temporary/attempt_20200326130000_0002_m_000000_2/",
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": false,
"Size": 0
},
{
"LastModified": "2020-03-26T13:00:12.000Z",
"VersionId": "OL24nTI4C0DJFur6ZfXeWFH1N_eo.SIl",
"ETag": "\"1c1179f44b770f1d661f06b9324c27da\"",
"StorageClass": "STANDARD",
"Key":
"tmp/vijayant/s3a/_temporary/0/_temporary/attempt_20200326130000_0002_m_000000_2/part-00000-8ecb77ed-8279-4256-9ef1-5ea352318c1a-c000.snappy.parquet",
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": false,
"Size": 384
},
{
"LastModified": "2020-03-26T13:00:13.000Z",
"VersionId": "EscpHQeMrYBhDGdmnH5TPEDLpLUwZzBS",
"ETag": "\"1c1179f44b770f1d661f06b9324c27da\"",
"StorageClass": "STANDARD",
"Key":
"tmp/vijayant/s3a/part-00000-8ecb77ed-8279-4256-9ef1-5ea352318c1a-c000.snappy.parquet",
"Owner": {
"DisplayName": "<display-name>",
"ID": "<owner-id>"
},
"IsLatest": true,
"Size": 384
}
]
}
{code}
This in turn makes listing objects slow and we have even noticed timeouts due
to too many delete markers.
> S3A creating folder level delete markers
> ----------------------------------------
>
> Key: HADOOP-16942
> URL: https://issues.apache.org/jira/browse/HADOOP-16942
> Project: Hadoop Common
> Issue Type: Task
> Components: fs/s3
> Affects Versions: 3.2.1
> Reporter: vijayant soni
> Priority: Major
>
> Using the S3A URL scheme while writing out data from Spark to S3 creates many
> folder-level delete markers.
> Writing the same data with the S3 URL scheme does not create any delete markers at
> all.
>
> Spark - 2.4.4
> Hadoop - 3.2.1
> EMR version - 6.0.0
> Write Mode - Append
> {code:scala}
> [hadoop@ip-192-0-161-212 ~]$ spark-shell
> Setting default log level to "WARN".
> To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use
> setLogLevel(newLevel).
> 20/03/27 07:37:19 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive
> is set, falling back to uploading libraries under SPARK_HOME.
> Spark context Web UI available at http://ip-192-0-161-212.ec2.internal:4040
> Spark context available as 'sc' (master = yarn, app id =
> application_1585294390030_0003).
> Spark session available as 'spark'.
> Welcome to
> ____ __
> / __/__ ___ _____/ /__
> _\ \/ _ \/ _ `/ __/ '_/
> /___/ .__/\_,_/_/ /_/\_\ version 2.4.4
> /_/
>
> Using Scala version 2.12.10 (OpenJDK 64-Bit Server VM, Java 1.8.0_242)
> Type in expressions to have them evaluated.
> Type :help for more information.
> scala> val df = spark.sql("select 1 as a")
> df: org.apache.spark.sql.DataFrame = [a: int]
> scala>
> df.write.mode(org.apache.spark.sql.SaveMode.Append).save("s3://my-bucket/tmp/vijayant/test/s3/")
>
>
> scala>
> df.write.mode(org.apache.spark.sql.SaveMode.Append).save("s3a://my-bucket/tmp/vijayant/test/s3a/")
>
>
> scala>
> {code}
> Getting delete markers from `s3` write
> {code:bash}
> aws s3api list-object-versions --bucket my-bucket --prefix
> tmp/vijayant/test/s3/
> {
> "Versions": [
> {
> "LastModified": "2020-03-27T07:38:17.000Z",
> "VersionId": "V06OzeE7j221Tq7keSGj8bveCYyJFIcf",
> "ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"",
> "StorageClass": "STANDARD",
> "Key": "tmp/vijayant/test/s3/_SUCCESS",
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": true,
> "Size": 0
> },
> {
> "LastModified": "2020-03-27T07:38:16.000Z",
> "VersionId": "dLYtHDugLhFIdw2YHLFmoFOxXkm.21Wo",
> "ETag": "\"26e70a1e26c709e3e8498acd49cfaaa3-1\"",
> "StorageClass": "STANDARD",
> "Key":
> "tmp/vijayant/test/s3/part-00000-9d9a8925-f119-415d-b547-b742396e2ca7-c000.snappy.parquet",
>
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": true,
> "Size": 384
> }
> ]
> }
> {code}
> Getting delete markers from `s3a` write
> {code:bash}
> aws s3api list-object-versions --bucket my-bucket --prefix
> tmp/vijayant/test/s3a/
> {
> "DeleteMarkers": [
> {
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": true,
> "VersionId": "NJWRZMcb_eYYwCJh_isX4H1Ox6W362Wb",
> "Key": "tmp/vijayant/test/s3a/",
> "LastModified": "2020-03-27T07:39:11.000Z"
> },
> {
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": false,
> "VersionId": "F0h0mLcVVwkMtcHxd95Hj7BACL4Up_Q9",
> "Key": "tmp/vijayant/test/s3a/",
> "LastModified": "2020-03-27T07:39:10.000Z"
> },
> {
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": false,
> "VersionId": ".sBcE6cXeggekOnSgZ4n7pyCDHnsLERK",
> "Key": "tmp/vijayant/test/s3a/",
> "LastModified": "2020-03-27T07:39:10.000Z"
> },
> {
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": false,
> "VersionId": "nzm39jiUPC4H0ZaS.5Shp0FYPnR8wNf9",
> "Key": "tmp/vijayant/test/s3a/",
> "LastModified": "2020-03-27T07:39:09.000Z"
> },
> {
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": false,
> "VersionId": "BPM65R1HkZngPDYtDL3zPZYPw_G_m9Ic",
> "Key": "tmp/vijayant/test/s3a/",
> "LastModified": "2020-03-27T07:39:08.000Z"
> },
> {
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": true,
> "VersionId": "LJt8_MVDOiD4UdgUqEMycxjvtinJlTNt",
> "Key": "tmp/vijayant/test/s3a/_temporary/",
> "LastModified": "2020-03-27T07:39:11.000Z"
> },
> {
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": false,
> "VersionId": "RqunJTn8Od0PgFR4yu44PX4kL54k6EDv",
> "Key": "tmp/vijayant/test/s3a/_temporary/",
> "LastModified": "2020-03-27T07:39:09.000Z"
> },
> {
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": false,
> "VersionId": "4vY8cnqUI5VJAk3VfEt_VD_KEczo3bmY",
> "Key": "tmp/vijayant/test/s3a/_temporary/",
> "LastModified": "2020-03-27T07:39:08.000Z"
> },
> {
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": true,
> "VersionId": "ln47YYy.yiE.k70cvqvfgYCEQoYFnKQW",
> "Key": "tmp/vijayant/test/s3a/_temporary/0/",
> "LastModified": "2020-03-27T07:39:11.000Z"
> },
> {
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": false,
> "VersionId": "5Bsrt7s1caM90mzGNgk0MsTU9q8UjTTA",
> "Key": "tmp/vijayant/test/s3a/_temporary/0/",
> "LastModified": "2020-03-27T07:39:09.000Z"
> },
> {
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": true,
> "VersionId": "pN3HzDfnmqIqrMwAL2jqKEBkvoHZALor",
> "Key": "tmp/vijayant/test/s3a/_temporary/0/_temporary/",
> "LastModified": "2020-03-27T07:39:11.000Z"
> },
> {
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": false,
> "VersionId": "wg91poO1KXReXxvsZHzZXrHR1IgIX8t2",
> "Key": "tmp/vijayant/test/s3a/_temporary/0/_temporary/",
> "LastModified": "2020-03-27T07:39:09.000Z"
> },
> {
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": true,
> "VersionId": "cv5Noykq3sMilQqJXAH3E.N7qAWnIBx7",
> "Key":
> "tmp/vijayant/test/s3a/_temporary/0/_temporary/attempt_20200327073907_0001_m_000000_1/",
>
> "LastModified": "2020-03-27T07:39:11.000Z"
> },
> {
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": false,
> "VersionId": "6xzt9SxlCUJaOLD8krkE3yXfQU14rErX",
> "Key":
> "tmp/vijayant/test/s3a/_temporary/0/_temporary/attempt_20200327073907_0001_m_000000_1/",
>
> "LastModified": "2020-03-27T07:39:09.000Z"
> },
> {
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": true,
> "VersionId": "wGmJAo7x_gkLWAiHzxPGdPMVSus7Wcp1",
> "Key":
> "tmp/vijayant/test/s3a/_temporary/0/_temporary/attempt_20200327073907_0001_m_000000_1/part-00000-3923e1b1-406c-4202-b9a8-3bd7cb2d97b2-c000.snappy.parquet",
>
> "LastModified": "2020-03-27T07:39:10.000Z"
> }
> ],
> "Versions": [
> {
> "LastModified": "2020-03-27T07:39:11.000Z",
> "VersionId": "2py_ZXKl7yh6fwhzksAx8Os1BriDJCBb",
> "ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"",
> "StorageClass": "STANDARD",
> "Key": "tmp/vijayant/test/s3a/_SUCCESS",
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": true,
> "Size": 0
> },
> {
> "LastModified": "2020-03-27T07:39:08.000Z",
> "VersionId": "lDqTnLCqDYtjrOiY.V7E6AKTRQLKrqUT",
> "ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"",
> "StorageClass": "STANDARD",
> "Key": "tmp/vijayant/test/s3a/_temporary/0/",
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": false,
> "Size": 0
> },
> {
> "LastModified": "2020-03-27T07:39:10.000Z",
> "VersionId": "g.rGoTDdmrGrNjrLchvwz3jMmGePkgiD",
> "ETag": "\"d41d8cd98f00b204e9800998ecf8427e\"",
> "StorageClass": "STANDARD",
> "Key":
> "tmp/vijayant/test/s3a/_temporary/0/_temporary/attempt_20200327073907_0001_m_000000_1/",
>
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": false,
> "Size": 0
> },
> {
> "LastModified": "2020-03-27T07:39:09.000Z",
> "VersionId": ".ZCpY2UW4hRlbLL87dFUJRuk021Hyq8p",
> "ETag": "\"3def7238a0858c17c62d7045290175cf\"",
> "StorageClass": "STANDARD",
> "Key":
> "tmp/vijayant/test/s3a/_temporary/0/_temporary/attempt_20200327073907_0001_m_000000_1/part-00000-3923e1b1-406c-4202-b9a8-3bd7cb2d97b2-c000.snappy.parquet",
>
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": false,
> "Size": 384
> },
> {
> "LastModified": "2020-03-27T07:39:10.000Z",
> "VersionId": "JSNjTDHSQqe9zSAV93bc6TXPuqA.vDJE",
> "ETag": "\"3def7238a0858c17c62d7045290175cf\"",
> "StorageClass": "STANDARD",
> "Key":
> "tmp/vijayant/test/s3a/part-00000-3923e1b1-406c-4202-b9a8-3bd7cb2d97b2-c000.snappy.parquet",
>
> "Owner": {
> "DisplayName": "sysops+stage",
> "ID":
> "08939105f417dc74b1fa237e211185ff2d9f528d54b1380501de07bd0657b5e1"
> },
> "IsLatest": true,
> "Size": 384
> }
> ]
> }
> {code}
> This, in turn, makes listing objects slow, and we have even seen timeouts caused
> by the large number of delete markers.
--
This message was sent by Atlassian Jira
(v8.3.4#803005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]