This is an automated email from the ASF dual-hosted git repository.

thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new cd8830b43f GH-36990: [R] Expose Parquet ReaderProperties (#36992)
cd8830b43f is described below

commit cd8830b43fa16d6017ff36282ebeb705a83eb709
Author: Nic Crane <[email protected]>
AuthorDate: Mon Aug 14 14:45:11 2023 +0100

    GH-36990: [R] Expose Parquet ReaderProperties (#36992)
    
    ### Rationale for this change
    
    Expose the ReaderProperties class in R so that the thrift size settings can 
be altered.
    
    ### What changes are included in this PR?
    
    Add R6 class, link it up to the C++ class, use it when reading Parquet 
files.
    
    ### Are these changes tested?
    
    Yes
    
    ### Are there any user-facing changes?
    
    Nope
    * Closes: #36990
    
    Authored-by: Nic Crane <[email protected]>
    Signed-off-by: Nic Crane <[email protected]>
---
 r/NAMESPACE                      |   1 +
 r/R/arrowExports.R               |  28 +++++++++--
 r/R/dataset-format.R             |  16 +++++-
 r/R/parquet.R                    |  48 +++++++++++++++++-
 r/_pkgdown.yml                   |   1 +
 r/man/FragmentScanOptions.Rd     |   7 +++
 r/man/ParquetFileReader.Rd       |   1 +
 r/man/ParquetReaderProperties.Rd |  27 ++++++++++
 r/src/arrowExports.cpp           | 106 +++++++++++++++++++++++++++++++++++----
 r/src/arrow_types.h              |   1 +
 r/src/dataset.cpp                |   7 ++-
 r/src/parquet.cpp                |  34 ++++++++++++-
 r/tests/testthat/test-dataset.R  |  14 ++++++
 r/tests/testthat/test-parquet.R  |  40 +++++++++++++++
 14 files changed, 311 insertions(+), 20 deletions(-)

diff --git a/r/NAMESPACE b/r/NAMESPACE
index 7eaa51bc57..f479917642 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -246,6 +246,7 @@ export(ParquetFileFormat)
 export(ParquetFileReader)
 export(ParquetFileWriter)
 export(ParquetFragmentScanOptions)
+export(ParquetReaderProperties)
 export(ParquetVersionType)
 export(ParquetWriterProperties)
 export(Partitioning)
diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index 10732100cd..f4ff3ef894 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -748,8 +748,8 @@ dataset___JsonFragmentScanOptions__Make <- 
function(parse_options, read_options)
   .Call(`_arrow_dataset___JsonFragmentScanOptions__Make`, parse_options, 
read_options)
 }
 
-dataset___ParquetFragmentScanOptions__Make <- function(use_buffered_stream, 
buffer_size, pre_buffer) {
-  .Call(`_arrow_dataset___ParquetFragmentScanOptions__Make`, 
use_buffered_stream, buffer_size, pre_buffer)
+dataset___ParquetFragmentScanOptions__Make <- function(use_buffered_stream, 
buffer_size, pre_buffer, thrift_string_size_limit, thrift_container_size_limit) 
{
+  .Call(`_arrow_dataset___ParquetFragmentScanOptions__Make`, 
use_buffered_stream, buffer_size, pre_buffer, thrift_string_size_limit, 
thrift_container_size_limit)
 }
 
 dataset___DirectoryPartitioning <- function(schm, segment_encoding) {
@@ -1592,6 +1592,26 @@ parquet___arrow___ArrowReaderProperties__Make <- 
function(use_threads) {
   .Call(`_arrow_parquet___arrow___ArrowReaderProperties__Make`, use_threads)
 }
 
+parquet___arrow___ReaderProperties__Make <- function() {
+  .Call(`_arrow_parquet___arrow___ReaderProperties__Make`)
+}
+
+parquet___arrow___ReaderProperties__get_thrift_string_size_limit <- 
function(properties) {
+  
.Call(`_arrow_parquet___arrow___ReaderProperties__get_thrift_string_size_limit`,
 properties)
+}
+
+parquet___arrow___ReaderProperties__set_thrift_string_size_limit <- 
function(properties, size) {
+  
invisible(.Call(`_arrow_parquet___arrow___ReaderProperties__set_thrift_string_size_limit`,
 properties, size))
+}
+
+parquet___arrow___ReaderProperties__get_thrift_container_size_limit <- 
function(properties) {
+  
.Call(`_arrow_parquet___arrow___ReaderProperties__get_thrift_container_size_limit`,
 properties)
+}
+
+parquet___arrow___ReaderProperties__set_thrift_container_size_limit <- 
function(properties, size) {
+  
invisible(.Call(`_arrow_parquet___arrow___ReaderProperties__set_thrift_container_size_limit`,
 properties, size))
+}
+
 parquet___arrow___ArrowReaderProperties__set_use_threads <- 
function(properties, use_threads) {
   
invisible(.Call(`_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads`,
 properties, use_threads))
 }
@@ -1616,8 +1636,8 @@ 
parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit <- func
   
.Call(`_arrow_parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit`,
 properties)
 }
 
-parquet___arrow___FileReader__OpenFile <- function(file, props) {
-  .Call(`_arrow_parquet___arrow___FileReader__OpenFile`, file, props)
+parquet___arrow___FileReader__OpenFile <- function(file, props, reader_props) {
+  .Call(`_arrow_parquet___arrow___FileReader__OpenFile`, file, props, 
reader_props)
 }
 
 parquet___arrow___FileReader__ReadTable1 <- function(reader) {
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index 8798e0248e..9c7e332f5e 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -518,6 +518,13 @@ csv_file_format_read_opts <- function(schema = NULL, ...) {
 #'   * `buffer_size`: Size of buffered stream, if enabled. Default is 8KB.
 #'   * `pre_buffer`: Pre-buffer the raw Parquet data. This can improve 
performance
 #'                   on high-latency filesystems. Disabled by default.
+#'   * `thrift_string_size_limit`: Maximum string size allocated for decoding 
thrift
+#'                                 strings. May need to be increased in order 
to read
+#'                                 files with especially large headers. 
Default value
+#'                                 100000000.
+#'   * `thrift_container_size_limit`: Maximum size of thrift containers.  May 
need to be
+#'                                    increased in order to read files with 
especially large
+#'                                    headers. Default value 1000000.
 #
 #'   `format = "text"`: see [CsvConvertOptions]. Note that options can only be
 #'   specified with the Arrow C++ library naming. Also, "block_size" from
@@ -571,8 +578,13 @@ CsvFragmentScanOptions$create <- function(...,
 ParquetFragmentScanOptions <- R6Class("ParquetFragmentScanOptions", inherit = 
FragmentScanOptions)
 ParquetFragmentScanOptions$create <- function(use_buffered_stream = FALSE,
                                               buffer_size = 8196,
-                                              pre_buffer = TRUE) {
-  dataset___ParquetFragmentScanOptions__Make(use_buffered_stream, buffer_size, 
pre_buffer)
+                                              pre_buffer = TRUE,
+                                              thrift_string_size_limit = 
100000000,
+                                              thrift_container_size_limit = 
1000000) {
+  dataset___ParquetFragmentScanOptions__Make(
+    use_buffered_stream, buffer_size, pre_buffer, thrift_string_size_limit,
+    thrift_container_size_limit
+  )
 }
 
 #' @usage NULL
diff --git a/r/R/parquet.R b/r/R/parquet.R
index db224a41e4..a58f7810b6 100644
--- a/r/R/parquet.R
+++ b/r/R/parquet.R
@@ -457,6 +457,7 @@ ParquetFileWriter$create <- function(schema,
 #'    (e.g. `RandomAccessFile`).
 #' - `props` Optional [ParquetArrowReaderProperties]
 #' - `mmap` Logical: whether to memory-map the file (default `TRUE`)
+#' - `reader_props` Optional [ParquetReaderProperties]
 #' - `...` Additional arguments, currently ignored
 #'
 #' @section Methods:
@@ -541,12 +542,13 @@ ParquetFileReader <- R6Class("ParquetFileReader",
 ParquetFileReader$create <- function(file,
                                      props = 
ParquetArrowReaderProperties$create(),
                                      mmap = TRUE,
+                                     reader_props = 
ParquetReaderProperties$create(),
                                      ...) {
   file <- make_readable_file(file, mmap)
   assert_is(props, "ParquetArrowReaderProperties")
   assert_is(file, "RandomAccessFile")
 
-  parquet___arrow___FileReader__OpenFile(file, props)
+  parquet___arrow___FileReader__OpenFile(file, props, reader_props)
 }
 
 #' @title ParquetArrowReaderProperties class
@@ -625,3 +627,47 @@ calculate_chunk_size <- function(rows, columns,
 
   chunk_size
 }
+
+#' @title ParquetReaderProperties class
+#' @rdname ParquetReaderProperties
+#' @name ParquetReaderProperties
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description This class holds settings to control how a Parquet file is read
+#' by [ParquetFileReader].
+#'
+#' @section Factory:
+#'
+#' The `ParquetReaderProperties$create()` factory method instantiates the 
object
+#' and takes no arguments.
+#'
+#' @section Methods:
+#'
+#' - `$thrift_string_size_limit()`
+#' - `$set_thrift_string_size_limit()`
+#' - `$thrift_container_size_limit()`
+#' - `$set_thrift_container_size_limit()`
+#'
+#' @export
+ParquetReaderProperties <- R6Class("ParquetReaderProperties",
+  inherit = ArrowObject,
+  public = list(
+    thrift_string_size_limit = function() {
+      parquet___arrow___ReaderProperties__get_thrift_string_size_limit(self)
+    },
+    set_thrift_string_size_limit = function(size) {
+      parquet___arrow___ReaderProperties__set_thrift_string_size_limit(self, 
size)
+    },
+    thrift_container_size_limit = function() {
+      parquet___arrow___ReaderProperties__get_thrift_container_size_limit(self)
+    },
+    set_thrift_container_size_limit = function(size) {
+      
parquet___arrow___ReaderProperties__set_thrift_container_size_limit(self, size)
+    }
+  )
+)
+
+ParquetReaderProperties$create <- function() {
+  parquet___arrow___ReaderProperties__Make()
+}
diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index 9facce9d1b..10323a4796 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -278,6 +278,7 @@ reference:
 - title: File read/writer interface
   contents:
   - ParquetFileReader
+  - ParquetReaderProperties
   - ParquetArrowReaderProperties
   - ParquetFileWriter
   - ParquetWriterProperties
diff --git a/r/man/FragmentScanOptions.Rd b/r/man/FragmentScanOptions.Rd
index 79bb3ea3c3..7b597c2f01 100644
--- a/r/man/FragmentScanOptions.Rd
+++ b/r/man/FragmentScanOptions.Rd
@@ -29,6 +29,13 @@ to reduce memory overhead. Disabled by default.
 \item \code{buffer_size}: Size of buffered stream, if enabled. Default is 8KB.
 \item \code{pre_buffer}: Pre-buffer the raw Parquet data. This can improve 
performance
 on high-latency filesystems. Disabled by default.
+\item \code{thrift_string_size_limit}: Maximum string size allocated for 
decoding thrift
+strings. May need to be increased in order to read
+files with especially large headers. Default value
+100000000.
+\item \code{thrift_container_size_limit}: Maximum size of thrift containers.  
May need to be
+increased in order to read files with especially large
+headers. Default value 1000000.
 \code{format = "text"}: see \link{CsvConvertOptions}. Note that options can 
only be
 specified with the Arrow C++ library naming. Also, "block_size" from
 \link{CsvReadOptions} may be given.
diff --git a/r/man/ParquetFileReader.Rd b/r/man/ParquetFileReader.Rd
index 30d0725a49..59ec0d9a3b 100644
--- a/r/man/ParquetFileReader.Rd
+++ b/r/man/ParquetFileReader.Rd
@@ -17,6 +17,7 @@ takes the following arguments:
 (e.g. \code{RandomAccessFile}).
 \item \code{props} Optional \link{ParquetArrowReaderProperties}
 \item \code{mmap} Logical: whether to memory-map the file (default \code{TRUE})
+\item \code{reader_props} Optional \link{ParquetReaderProperties}
 \item \code{...} Additional arguments, currently ignored
 }
 }
diff --git a/r/man/ParquetReaderProperties.Rd b/r/man/ParquetReaderProperties.Rd
new file mode 100644
index 0000000000..1779fffb14
--- /dev/null
+++ b/r/man/ParquetReaderProperties.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/parquet.R
+\docType{class}
+\name{ParquetReaderProperties}
+\alias{ParquetReaderProperties}
+\title{ParquetReaderProperties class}
+\description{
+This class holds settings to control how a Parquet file is read
+by \link{ParquetFileReader}.
+}
+\section{Factory}{
+
+
+The \code{ParquetReaderProperties$create()} factory method instantiates the 
object
+and takes no arguments.
+}
+
+\section{Methods}{
+
+\itemize{
+\item \verb{$thrift_string_size_limit()}
+\item \verb{$set_thrift_string_size_limit()}
+\item \verb{$thrift_container_size_limit()}
+\item \verb{$set_thrift_container_size_limit()}
+}
+}
+
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 1d617b252e..790207efce 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -2024,17 +2024,19 @@ extern "C" SEXP 
_arrow_dataset___JsonFragmentScanOptions__Make(SEXP parse_option
 
 // dataset.cpp
 #if defined(ARROW_R_WITH_DATASET)
-std::shared_ptr<ds::ParquetFragmentScanOptions> 
dataset___ParquetFragmentScanOptions__Make(bool use_buffered_stream, int64_t 
buffer_size, bool pre_buffer);
-extern "C" SEXP _arrow_dataset___ParquetFragmentScanOptions__Make(SEXP 
use_buffered_stream_sexp, SEXP buffer_size_sexp, SEXP pre_buffer_sexp){
+std::shared_ptr<ds::ParquetFragmentScanOptions> 
dataset___ParquetFragmentScanOptions__Make(bool use_buffered_stream, int64_t 
buffer_size, bool pre_buffer, int64_t thrift_string_size_limit, int64_t 
thrift_container_size_limit);
+extern "C" SEXP _arrow_dataset___ParquetFragmentScanOptions__Make(SEXP 
use_buffered_stream_sexp, SEXP buffer_size_sexp, SEXP pre_buffer_sexp, SEXP 
thrift_string_size_limit_sexp, SEXP thrift_container_size_limit_sexp){
 BEGIN_CPP11
        arrow::r::Input<bool>::type 
use_buffered_stream(use_buffered_stream_sexp);
        arrow::r::Input<int64_t>::type buffer_size(buffer_size_sexp);
        arrow::r::Input<bool>::type pre_buffer(pre_buffer_sexp);
-       return 
cpp11::as_sexp(dataset___ParquetFragmentScanOptions__Make(use_buffered_stream, 
buffer_size, pre_buffer));
+       arrow::r::Input<int64_t>::type 
thrift_string_size_limit(thrift_string_size_limit_sexp);
+       arrow::r::Input<int64_t>::type 
thrift_container_size_limit(thrift_container_size_limit_sexp);
+       return 
cpp11::as_sexp(dataset___ParquetFragmentScanOptions__Make(use_buffered_stream, 
buffer_size, pre_buffer, thrift_string_size_limit, 
thrift_container_size_limit));
 END_CPP11
 }
 #else
-extern "C" SEXP _arrow_dataset___ParquetFragmentScanOptions__Make(SEXP 
use_buffered_stream_sexp, SEXP buffer_size_sexp, SEXP pre_buffer_sexp){
+extern "C" SEXP _arrow_dataset___ParquetFragmentScanOptions__Make(SEXP 
use_buffered_stream_sexp, SEXP buffer_size_sexp, SEXP pre_buffer_sexp, SEXP 
thrift_string_size_limit_sexp, SEXP thrift_container_size_limit_sexp){
        Rf_error("Cannot call dataset___ParquetFragmentScanOptions__Make(). See 
https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow 
C++ libraries. ");
 }
 #endif
@@ -4062,6 +4064,84 @@ extern "C" SEXP 
_arrow_parquet___arrow___ArrowReaderProperties__Make(SEXP use_th
 }
 #endif
 
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+std::shared_ptr<parquet::ReaderProperties> 
parquet___arrow___ReaderProperties__Make();
+extern "C" SEXP _arrow_parquet___arrow___ReaderProperties__Make(){
+BEGIN_CPP11
+       return cpp11::as_sexp(parquet___arrow___ReaderProperties__Make());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___ReaderProperties__Make(){
+       Rf_error("Cannot call parquet___arrow___ReaderProperties__Make(). See 
https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow 
C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+int parquet___arrow___ReaderProperties__get_thrift_string_size_limit(const 
std::shared_ptr<parquet::ReaderProperties>& properties);
+extern "C" SEXP 
_arrow_parquet___arrow___ReaderProperties__get_thrift_string_size_limit(SEXP 
properties_sexp){
+BEGIN_CPP11
+       arrow::r::Input<const 
std::shared_ptr<parquet::ReaderProperties>&>::type properties(properties_sexp);
+       return 
cpp11::as_sexp(parquet___arrow___ReaderProperties__get_thrift_string_size_limit(properties));
+END_CPP11
+}
+#else
+extern "C" SEXP 
_arrow_parquet___arrow___ReaderProperties__get_thrift_string_size_limit(SEXP 
properties_sexp){
+       Rf_error("Cannot call 
parquet___arrow___ReaderProperties__get_thrift_string_size_limit(). See 
https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow 
C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+void parquet___arrow___ReaderProperties__set_thrift_string_size_limit(const 
std::shared_ptr<parquet::ReaderProperties>& properties, int size);
+extern "C" SEXP 
_arrow_parquet___arrow___ReaderProperties__set_thrift_string_size_limit(SEXP 
properties_sexp, SEXP size_sexp){
+BEGIN_CPP11
+       arrow::r::Input<const 
std::shared_ptr<parquet::ReaderProperties>&>::type properties(properties_sexp);
+       arrow::r::Input<int>::type size(size_sexp);
+       
parquet___arrow___ReaderProperties__set_thrift_string_size_limit(properties, 
size);
+       return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP 
_arrow_parquet___arrow___ReaderProperties__set_thrift_string_size_limit(SEXP 
properties_sexp, SEXP size_sexp){
+       Rf_error("Cannot call 
parquet___arrow___ReaderProperties__set_thrift_string_size_limit(). See 
https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow 
C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+int parquet___arrow___ReaderProperties__get_thrift_container_size_limit(const 
std::shared_ptr<parquet::ReaderProperties>& properties);
+extern "C" SEXP 
_arrow_parquet___arrow___ReaderProperties__get_thrift_container_size_limit(SEXP 
properties_sexp){
+BEGIN_CPP11
+       arrow::r::Input<const 
std::shared_ptr<parquet::ReaderProperties>&>::type properties(properties_sexp);
+       return 
cpp11::as_sexp(parquet___arrow___ReaderProperties__get_thrift_container_size_limit(properties));
+END_CPP11
+}
+#else
+extern "C" SEXP 
_arrow_parquet___arrow___ReaderProperties__get_thrift_container_size_limit(SEXP 
properties_sexp){
+       Rf_error("Cannot call 
parquet___arrow___ReaderProperties__get_thrift_container_size_limit(). See 
https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow 
C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+void parquet___arrow___ReaderProperties__set_thrift_container_size_limit(const 
std::shared_ptr<parquet::ReaderProperties>& properties, int size);
+extern "C" SEXP 
_arrow_parquet___arrow___ReaderProperties__set_thrift_container_size_limit(SEXP 
properties_sexp, SEXP size_sexp){
+BEGIN_CPP11
+       arrow::r::Input<const 
std::shared_ptr<parquet::ReaderProperties>&>::type properties(properties_sexp);
+       arrow::r::Input<int>::type size(size_sexp);
+       
parquet___arrow___ReaderProperties__set_thrift_container_size_limit(properties, 
size);
+       return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP 
_arrow_parquet___arrow___ReaderProperties__set_thrift_container_size_limit(SEXP 
properties_sexp, SEXP size_sexp){
+       Rf_error("Cannot call 
parquet___arrow___ReaderProperties__set_thrift_container_size_limit(). See 
https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow 
C++ libraries. ");
+}
+#endif
+
 // parquet.cpp
 #if defined(ARROW_R_WITH_PARQUET)
 void parquet___arrow___ArrowReaderProperties__set_use_threads(const 
std::shared_ptr<parquet::ArrowReaderProperties>& properties, bool use_threads);
@@ -4163,16 +4243,17 @@ extern "C" SEXP 
_arrow_parquet___arrow___ArrowReaderProperties__get_coerce_int96
 
 // parquet.cpp
 #if defined(ARROW_R_WITH_PARQUET)
-std::shared_ptr<parquet::arrow::FileReader> 
parquet___arrow___FileReader__OpenFile(const 
std::shared_ptr<arrow::io::RandomAccessFile>& file, const 
std::shared_ptr<parquet::ArrowReaderProperties>& props);
-extern "C" SEXP _arrow_parquet___arrow___FileReader__OpenFile(SEXP file_sexp, 
SEXP props_sexp){
+std::shared_ptr<parquet::arrow::FileReader> 
parquet___arrow___FileReader__OpenFile(const 
std::shared_ptr<arrow::io::RandomAccessFile>& file, const 
std::shared_ptr<parquet::ArrowReaderProperties>& props, const 
std::shared_ptr<parquet::ReaderProperties>& reader_props);
+extern "C" SEXP _arrow_parquet___arrow___FileReader__OpenFile(SEXP file_sexp, 
SEXP props_sexp, SEXP reader_props_sexp){
 BEGIN_CPP11
        arrow::r::Input<const 
std::shared_ptr<arrow::io::RandomAccessFile>&>::type file(file_sexp);
        arrow::r::Input<const 
std::shared_ptr<parquet::ArrowReaderProperties>&>::type props(props_sexp);
-       return cpp11::as_sexp(parquet___arrow___FileReader__OpenFile(file, 
props));
+       arrow::r::Input<const 
std::shared_ptr<parquet::ReaderProperties>&>::type 
reader_props(reader_props_sexp);
+       return cpp11::as_sexp(parquet___arrow___FileReader__OpenFile(file, 
props, reader_props));
 END_CPP11
 }
 #else
-extern "C" SEXP _arrow_parquet___arrow___FileReader__OpenFile(SEXP file_sexp, 
SEXP props_sexp){
+extern "C" SEXP _arrow_parquet___arrow___FileReader__OpenFile(SEXP file_sexp, 
SEXP props_sexp, SEXP reader_props_sexp){
        Rf_error("Cannot call parquet___arrow___FileReader__OpenFile(). See 
https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow 
C++ libraries. ");
 }
 #endif
@@ -5774,7 +5855,7 @@ static const R_CallMethodDef CallEntries[] = {
                { "_arrow_dataset___FragmentScanOptions__type_name", (DL_FUNC) 
&_arrow_dataset___FragmentScanOptions__type_name, 1}, 
                { "_arrow_dataset___CsvFragmentScanOptions__Make", (DL_FUNC) 
&_arrow_dataset___CsvFragmentScanOptions__Make, 2}, 
                { "_arrow_dataset___JsonFragmentScanOptions__Make", (DL_FUNC) 
&_arrow_dataset___JsonFragmentScanOptions__Make, 2}, 
-               { "_arrow_dataset___ParquetFragmentScanOptions__Make", 
(DL_FUNC) &_arrow_dataset___ParquetFragmentScanOptions__Make, 3}, 
+               { "_arrow_dataset___ParquetFragmentScanOptions__Make", 
(DL_FUNC) &_arrow_dataset___ParquetFragmentScanOptions__Make, 5}, 
                { "_arrow_dataset___DirectoryPartitioning", (DL_FUNC) 
&_arrow_dataset___DirectoryPartitioning, 2}, 
                { "_arrow_dataset___DirectoryPartitioning__MakeFactory", 
(DL_FUNC) &_arrow_dataset___DirectoryPartitioning__MakeFactory, 2}, 
                { "_arrow_dataset___HivePartitioning", (DL_FUNC) 
&_arrow_dataset___HivePartitioning, 3}, 
@@ -5985,13 +6066,18 @@ static const R_CallMethodDef CallEntries[] = {
                { "_arrow_ipc___MessageReader__ReadNextMessage", (DL_FUNC) 
&_arrow_ipc___MessageReader__ReadNextMessage, 1}, 
                { "_arrow_ipc___ReadMessage", (DL_FUNC) 
&_arrow_ipc___ReadMessage, 1}, 
                { "_arrow_parquet___arrow___ArrowReaderProperties__Make", 
(DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__Make, 1}, 
+               { "_arrow_parquet___arrow___ReaderProperties__Make", (DL_FUNC) 
&_arrow_parquet___arrow___ReaderProperties__Make, 0}, 
+               { 
"_arrow_parquet___arrow___ReaderProperties__get_thrift_string_size_limit", 
(DL_FUNC) 
&_arrow_parquet___arrow___ReaderProperties__get_thrift_string_size_limit, 1}, 
+               { 
"_arrow_parquet___arrow___ReaderProperties__set_thrift_string_size_limit", 
(DL_FUNC) 
&_arrow_parquet___arrow___ReaderProperties__set_thrift_string_size_limit, 2}, 
+               { 
"_arrow_parquet___arrow___ReaderProperties__get_thrift_container_size_limit", 
(DL_FUNC) 
&_arrow_parquet___arrow___ReaderProperties__get_thrift_container_size_limit, 
1}, 
+               { 
"_arrow_parquet___arrow___ReaderProperties__set_thrift_container_size_limit", 
(DL_FUNC) 
&_arrow_parquet___arrow___ReaderProperties__set_thrift_container_size_limit, 
2}, 
                { 
"_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads", (DL_FUNC) 
&_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads, 2}, 
                { 
"_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads", (DL_FUNC) 
&_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads, 2}, 
                { 
"_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary", 
(DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary, 
2}, 
                { 
"_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary", 
(DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary, 
3}, 
                { 
"_arrow_parquet___arrow___ArrowReaderProperties__set_coerce_int96_timestamp_unit",
 (DL_FUNC) 
&_arrow_parquet___arrow___ArrowReaderProperties__set_coerce_int96_timestamp_unit,
 2}, 
                { 
"_arrow_parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit",
 (DL_FUNC) 
&_arrow_parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit,
 1}, 
-               { "_arrow_parquet___arrow___FileReader__OpenFile", (DL_FUNC) 
&_arrow_parquet___arrow___FileReader__OpenFile, 2}, 
+               { "_arrow_parquet___arrow___FileReader__OpenFile", (DL_FUNC) 
&_arrow_parquet___arrow___FileReader__OpenFile, 3}, 
                { "_arrow_parquet___arrow___FileReader__ReadTable1", (DL_FUNC) 
&_arrow_parquet___arrow___FileReader__ReadTable1, 1}, 
                { "_arrow_parquet___arrow___FileReader__ReadTable2", (DL_FUNC) 
&_arrow_parquet___arrow___FileReader__ReadTable2, 2}, 
                { "_arrow_parquet___arrow___FileReader__ReadRowGroup1", 
(DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroup1, 2}, 
diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h
index 5f82275fe9..fadc39c75f 100644
--- a/r/src/arrow_types.h
+++ b/r/src/arrow_types.h
@@ -270,6 +270,7 @@ R6_CLASS_NAME(arrow::csv::WriteOptions, "CsvWriteOptions");
 
 #if defined(ARROW_R_WITH_PARQUET)
 R6_CLASS_NAME(parquet::ArrowReaderProperties, "ParquetArrowReaderProperties");
+R6_CLASS_NAME(parquet::ReaderProperties, "ParquetReaderProperties");
 R6_CLASS_NAME(parquet::ArrowWriterProperties, "ParquetArrowWriterProperties");
 R6_CLASS_NAME(parquet::WriterProperties, "ParquetWriterProperties");
 R6_CLASS_NAME(parquet::arrow::FileReader, "ParquetFileReader");
diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp
index 3a4958e4cb..83c430fb63 100644
--- a/r/src/dataset.cpp
+++ b/r/src/dataset.cpp
@@ -342,7 +342,9 @@ std::shared_ptr<ds::JsonFragmentScanOptions> 
dataset___JsonFragmentScanOptions__
 // [[dataset::export]]
 std::shared_ptr<ds::ParquetFragmentScanOptions>
 dataset___ParquetFragmentScanOptions__Make(bool use_buffered_stream, int64_t 
buffer_size,
-                                           bool pre_buffer) {
+                                           bool pre_buffer,
+                                           int64_t thrift_string_size_limit,
+                                           int64_t 
thrift_container_size_limit) {
   auto options = std::make_shared<ds::ParquetFragmentScanOptions>();
   if (use_buffered_stream) {
     options->reader_properties->enable_buffered_stream();
@@ -355,6 +357,9 @@ dataset___ParquetFragmentScanOptions__Make(bool 
use_buffered_stream, int64_t buf
     options->arrow_reader_properties->set_cache_options(
         arrow::io::CacheOptions::LazyDefaults());
   }
+  
options->reader_properties->set_thrift_string_size_limit(thrift_string_size_limit);
+  options->reader_properties->set_thrift_container_size_limit(
+      thrift_container_size_limit);
   return options;
 }
 
diff --git a/r/src/parquet.cpp b/r/src/parquet.cpp
index 3a263f6527..f9f7b0447c 100644
--- a/r/src/parquet.cpp
+++ b/r/src/parquet.cpp
@@ -44,6 +44,35 @@ parquet___arrow___ArrowReaderProperties__Make(bool 
use_threads) {
   return std::make_shared<parquet::ArrowReaderProperties>(use_threads);
 }
 
+// [[parquet::export]]
+std::shared_ptr<parquet::ReaderProperties> 
parquet___arrow___ReaderProperties__Make() {
+  return std::make_shared<parquet::ReaderProperties>();
+}
+
+// [[parquet::export]]
+int parquet___arrow___ReaderProperties__get_thrift_string_size_limit(
+    const std::shared_ptr<parquet::ReaderProperties>& properties) {
+  return properties->thrift_string_size_limit();
+}
+
+// [[parquet::export]]
+void parquet___arrow___ReaderProperties__set_thrift_string_size_limit(
+    const std::shared_ptr<parquet::ReaderProperties>& properties, int size) {
+  properties->set_thrift_string_size_limit(size);
+}
+
+// [[parquet::export]]
+int parquet___arrow___ReaderProperties__get_thrift_container_size_limit(
+    const std::shared_ptr<parquet::ReaderProperties>& properties) {
+  return properties->thrift_container_size_limit();
+}
+
+// [[parquet::export]]
+void parquet___arrow___ReaderProperties__set_thrift_container_size_limit(
+    const std::shared_ptr<parquet::ReaderProperties>& properties, int size) {
+  properties->set_thrift_container_size_limit(size);
+}
+
 // [[parquet::export]]
 void parquet___arrow___ArrowReaderProperties__set_use_threads(
     const std::shared_ptr<parquet::ArrowReaderProperties>& properties, bool 
use_threads) {
@@ -86,10 +115,11 @@ 
parquet___arrow___ArrowReaderProperties__get_coerce_int96_timestamp_unit(
 // [[parquet::export]]
 std::shared_ptr<parquet::arrow::FileReader> 
parquet___arrow___FileReader__OpenFile(
     const std::shared_ptr<arrow::io::RandomAccessFile>& file,
-    const std::shared_ptr<parquet::ArrowReaderProperties>& props) {
+    const std::shared_ptr<parquet::ArrowReaderProperties>& props,
+    const std::shared_ptr<parquet::ReaderProperties>& reader_props) {
   std::unique_ptr<parquet::arrow::FileReader> reader;
   parquet::arrow::FileReaderBuilder builder;
-  PARQUET_THROW_NOT_OK(builder.Open(file));
+  PARQUET_THROW_NOT_OK(builder.Open(file, *reader_props));
   PARQUET_THROW_NOT_OK(
       
builder.memory_pool(gc_memory_pool())->properties(*props)->Build(&reader));
   return std::move(reader);
diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
index b9972901a7..cbeb081d0b 100644
--- a/r/tests/testthat/test-dataset.R
+++ b/r/tests/testthat/test-dataset.R
@@ -1516,3 +1516,17 @@ test_that("can add in augmented fields", {
     list.files(hive_dir, full.names = TRUE, recursive = TRUE)
   )
 })
+
+test_that("can set thrift size string and container limits for datasets", {
+  expect_r6_class(open_dataset(dataset_dir, thrift_string_size_limit = 
1000000), "FileSystemDataset")
+  expect_error(
+    open_dataset(dataset_dir, thrift_string_size_limit = 1),
+    "TProtocolException: Exceeded size limit"
+  )
+
+  expect_r6_class(open_dataset(dataset_dir, thrift_container_size_limit = 
1000000), "FileSystemDataset")
+  expect_error(
+    open_dataset(dataset_dir, thrift_container_size_limit = 1),
+    "TProtocolException: Exceeded size limit"
+  )
+})
diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R
index 7c5d9ebef6..dbc4a04735 100644
--- a/r/tests/testthat/test-parquet.R
+++ b/r/tests/testthat/test-parquet.R
@@ -484,3 +484,43 @@ test_that("Can read Parquet files from a URL", {
   expect_true(tibble::is_tibble(pu))
   expect_identical(dim(pu), c(10L, 11L))
 })
+
+test_that("thrift string and container size can be specified when reading 
Parquet files", {
+
+  tf <- tempfile()
+  on.exit(unlink(tf))
+  table <- arrow_table(example_data)
+  write_parquet(table, tf)
+  file <- make_readable_file(tf)
+  on.exit(file$close())
+
+  # thrift string size
+  reader_props <- ParquetReaderProperties$create()
+  reader_props$set_thrift_string_size_limit(1)
+  expect_identical(reader_props$thrift_string_size_limit(), 1L)
+
+  # We get an error if we set the Thrift string size limit too small
+  expect_error(ParquetFileReader$create(file, reader_props = reader_props), 
"TProtocolException: Exceeded size limit")
+
+  # Increase the size and we can read successfully
+  reader_props$set_thrift_string_size_limit(10000)
+  reader <- ParquetFileReader$create(file, reader_props = reader_props)
+  data <- reader$ReadTable()
+  expect_identical(collect.ArrowTabular(data), example_data)
+
+  # thrift container size
+  reader_props_container <- ParquetReaderProperties$create()
+  reader_props_container$set_thrift_container_size_limit(1)
+  expect_identical(reader_props_container$thrift_container_size_limit(), 1L)
+
+  expect_error(
+    ParquetFileReader$create(file, reader_props = reader_props_container),
+    "TProtocolException: Exceeded size limit"
+  )
+
+  reader_props_container$set_thrift_container_size_limit(100)
+
+  reader_container <- ParquetFileReader$create(file, reader_props = 
reader_props_container)
+  data <- reader_container$ReadTable()
+  expect_identical(collect.ArrowTabular(data), example_data)
+})

Reply via email to