IMPALA-6549: Enable file handle cache by default

The file handle cache was disabled by default due to two HDFS issues: HDFS-12528 and HDFS-14872. Both have been fixed, and the CDH components in the toolchain include both fixes.
This reenables the file handle cache by default.

Change-Id: I6935825a1c4c7b2da0bb877f732027be1a57a8b7
Reviewed-on: http://gerrit.cloudera.org:8080/9371
Reviewed-by: Joe McDonnell <joemcdonn...@cloudera.com>
Tested-by: Impala Public Jenkins
Reviewed-on: http://gerrit.cloudera.org:8080/9426
Reviewed-by: Tim Armstrong <tarmstr...@cloudera.com>
Tested-by: Tim Armstrong <tarmstr...@cloudera.com>

Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/876f289f
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/876f289f
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/876f289f

Branch: refs/heads/2.x
Commit: 876f289fe005a5bb9084d6d3176dfaa11cfa7271
Parents: 74e7245
Author: Joe McDonnell <joemcdonn...@cloudera.com>
Authored: Tue Feb 20 16:37:29 2018 -0800
Committer: Tim Armstrong <tarmstr...@cloudera.com>
Committed: Sat Feb 24 01:58:46 2018 +0000

----------------------------------------------------------------------
 be/src/runtime/io/disk-io-mgr.cc | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/impala/blob/876f289f/be/src/runtime/io/disk-io-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/disk-io-mgr.cc b/be/src/runtime/io/disk-io-mgr.cc
index 6c7b9e6..0ac3669 100644
--- a/be/src/runtime/io/disk-io-mgr.cc
+++ b/be/src/runtime/io/disk-io-mgr.cc
@@ -98,10 +98,7 @@ DEFINE_int32(max_free_io_buffers, 128,
 // uses about 6kB of memory. 20k file handles will thus reserve ~120MB of memory.
 // The actual amount of memory that is associated with a file handle can be larger
 // or smaller, depending on the replication factor for this file or the path name.
-// TODO: This is currently disabled due to HDFS-12528, which can disable short circuit
-// reads when file handle caching is enabled. This should be reenabled by default
-// when that issue is fixed.
-DEFINE_uint64(max_cached_file_handles, 0, "Maximum number of HDFS file handles "
+DEFINE_uint64(max_cached_file_handles, 20000, "Maximum number of HDFS file handles "
     "that will be cached. Disabled if set to 0.");
 
 // The unused file handle timeout specifies how long a file handle will remain in the
@@ -112,11 +109,12 @@ DEFINE_uint64(max_cached_file_handles, 0, "Maximum number of HDFS file handles "
 // If a file is deleted through HDFS, this open file descriptor can keep the disk space
 // from being freed. When the metadata sees that a file has been deleted, the file handle
 // will no longer be used by future queries. Aging out this file handle allows the
-// disk space to be freed in an appropriate period of time.
-// TODO: HDFS-12528 (which can disable short circuit reads) is more likely to happen
-// if file handles are cached for longer than 5 minutes. Use a conservative value for
-// the unused file handle cache timeout until HDFS-12528 is fixed.
-DEFINE_uint64(unused_file_handle_timeout_sec, 270, "Maximum time, in seconds, that an "
+// disk space to be freed in an appropriate period of time. The default value is
+// 6 hours. This was chosen to be less than a typical value for HDFS's fs.trash.interval.
+// This means that when files are deleted via the trash, the file handle cache will
+// have evicted the file handle before the files are flushed from the trash. This
+// means that the file handle cache won't impact available disk space.
+DEFINE_uint64(unused_file_handle_timeout_sec, 21600, "Maximum time, in seconds, that an "
     "unused HDFS file handle will remain in the file handle cache. Disabled if set "
     "to 0.");
 