LukasBec opened a new issue #11043:
URL: https://github.com/apache/arrow/issues/11043
Currently I am working on a tool that needs to write an amount of data that is
not known in advance to a parquet file.
The parquet::StreamWriter seems to be the right tool for this.
I noticed that the memory usage of the process increases over time.
This would not be the case if I were writing to a csv file, for example.
The memory in use seems to roughly double each time it grows, which suggests
that it is held by a std::vector or another container with the same exponential
growth behavior.
Is there a reason that the stream writer requires an increasing amount of
memory as the output file grows?
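For reference, a buffer that grows geometrically, such as a std::vector, produces
exactly this kind of step pattern (the exact growth factor is implementation-defined).
A small standalone sketch, unrelated to parquet, just to illustrate the behavior I mean:
``` c++
#include <cstddef>
#include <iostream>
#include <vector>

// Illustration only: print every capacity jump of a std::vector to show the
// geometric (roughly doubling) growth pattern referred to above.
int main() {
  std::vector<char> buf;
  std::size_t last_capacity = 0;
  for (int i = 0; i < (1 << 20); ++i) {
    buf.push_back(0);
    if (buf.capacity() != last_capacity) {
      std::cout << "size " << buf.size() << " -> capacity " << buf.capacity()
                << '\n';
      last_capacity = buf.capacity();
    }
  }
}
```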
arrow version: release 5.0.0
minimal example:
``` c++
#include <cstdint>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>

#include "arrow/io/file.h"
#include "parquet/exception.h"
#include "parquet/stream_writer.h"

constexpr size_t COLUMNS = 100;
constexpr size_t ROWS_IN_ROW_GROUP = 4000;
constexpr size_t ROW_GROUPS = 5000;

std::shared_ptr<parquet::schema::GroupNode> get_schema() {
  parquet::schema::NodeVector fields;
  for (size_t i = 0; i < COLUMNS; ++i) {
    fields.push_back(parquet::schema::PrimitiveNode::Make(
        "column_field_uint32_" + std::to_string(i),
        parquet::Repetition::OPTIONAL, parquet::Type::INT32,
        parquet::ConvertedType::UINT_32));
    fields.push_back(parquet::schema::PrimitiveNode::Make(
        "column_field_float_" + std::to_string(i),
        parquet::Repetition::OPTIONAL, parquet::Type::FLOAT,
        parquet::ConvertedType::NONE));
    fields.push_back(parquet::schema::PrimitiveNode::Make(
        "column_field_uint16_" + std::to_string(i),
        parquet::Repetition::OPTIONAL, parquet::Type::INT32,
        parquet::ConvertedType::UINT_16));
    fields.push_back(parquet::schema::PrimitiveNode::Make(
        "column_field_uint8_" + std::to_string(i),
        parquet::Repetition::OPTIONAL, parquet::Type::INT32,
        parquet::ConvertedType::UINT_8));
  }
  return std::static_pointer_cast<parquet::schema::GroupNode>(
      parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED,
                                       fields));
}

// Print the memory usage by printing the line starting with "VmSize:" from
// /proc/self/status.
void print_memory_usage() {
  std::ifstream file;
  file.open("/proc/self/status");
  std::string line;
  const std::string vmsize = "VmSize:";
  while (file) {
    std::getline(file, line);
    auto idx = line.find(vmsize);
    if (idx != std::string::npos) {
      std::cout << line.substr(idx + vmsize.length());
      break;
    }
  }
  file.close();
}

int main() {
  std::shared_ptr<arrow::io::FileOutputStream> outfile;
  PARQUET_ASSIGN_OR_THROW(
      outfile, arrow::io::FileOutputStream::Open("very_big.parquet"));

  parquet::WriterProperties::Builder builder;
  builder.compression(parquet::Compression::SNAPPY);

  parquet::StreamWriter os{parquet::ParquetFileWriter::Open(
      outfile, get_schema(), builder.build())};
  // Row groups will be ended manually with os << parquet::EndRowGroup.
  os.SetMaxRowGroupSize(0);

  for (size_t i = 0; i < ROW_GROUPS; ++i) {
    if (i % 100 == 0) {  // print memory usage every 100 row groups
      std::cout << "group " << i << " from " << ROW_GROUPS
                << ", memory usage: ";
      print_memory_usage();
      std::cout << std::endl;
    }
    for (size_t j = 0; j < ROWS_IN_ROW_GROUP; ++j) {
      for (size_t k = 0; k < COLUMNS; ++k) {
        os << static_cast<uint32_t>(i * k * j);
        os << static_cast<float>(i * k * j * 0.3333);
        os << static_cast<uint16_t>(i * k * j * 2);
        os << static_cast<uint8_t>((i * k * j) % 255);
      }
      os << parquet::EndRow;
    }
    os << parquet::EndRowGroup;
  }
}
```
The produced output is:
```
group 0 from 5000, memory usage: 51492 kB
group 100 from 5000, memory usage: 249672 kB
group 200 from 5000, memory usage: 286528 kB
group 300 from 5000, memory usage: 360200 kB
group 400 from 5000, memory usage: 360200 kB
group 500 from 5000, memory usage: 360200 kB
group 600 from 5000, memory usage: 507556 kB
group 700 from 5000, memory usage: 507556 kB
group 800 from 5000, memory usage: 507556 kB
group 900 from 5000, memory usage: 507556 kB
group 1000 from 5000, memory usage: 507556 kB
group 1100 from 5000, memory usage: 802260 kB
group 1200 from 5000, memory usage: 802260 kB
group 1300 from 5000, memory usage: 802260 kB
group 1400 from 5000, memory usage: 802260 kB
group 1500 from 5000, memory usage: 802260 kB
group 1600 from 5000, memory usage: 802260 kB
group 1700 from 5000, memory usage: 802260 kB
group 1800 from 5000, memory usage: 802260 kB
group 1900 from 5000, memory usage: 802260 kB
group 2000 from 5000, memory usage: 802260 kB
group 2100 from 5000, memory usage: 1391668 kB
group 2200 from 5000, memory usage: 1391668 kB
group 2300 from 5000, memory usage: 1391668 kB
group 2400 from 5000, memory usage: 1391668 kB
group 2500 from 5000, memory usage: 1391668 kB
group 2600 from 5000, memory usage: 1391668 kB
group 2700 from 5000, memory usage: 1391668 kB
group 2800 from 5000, memory usage: 1391668 kB
group 2900 from 5000, memory usage: 1391668 kB
group 3000 from 5000, memory usage: 1391668 kB
group 3100 from 5000, memory usage: 1391668 kB
group 3200 from 5000, memory usage: 1391668 kB
group 3300 from 5000, memory usage: 1391668 kB
group 3400 from 5000, memory usage: 1391668 kB
group 3500 from 5000, memory usage: 1391668 kB
group 3600 from 5000, memory usage: 1391668 kB
group 3700 from 5000, memory usage: 1391668 kB
group 3800 from 5000, memory usage: 1391668 kB
group 3900 from 5000, memory usage: 1391668 kB
group 4000 from 5000, memory usage: 1391668 kB
group 4100 from 5000, memory usage: 2570104 kB
group 4200 from 5000, memory usage: 2570104 kB
group 4300 from 5000, memory usage: 2570104 kB
group 4400 from 5000, memory usage: 2570104 kB
group 4500 from 5000, memory usage: 2570104 kB
group 4600 from 5000, memory usage: 2570104 kB
group 4700 from 5000, memory usage: 2570104 kB
group 4800 from 5000, memory usage: 2570104 kB
group 4900 from 5000, memory usage: 2570104 kB
```
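To narrow down where that memory is held, one option (just a sketch, not part of
the example above) would be to print what Arrow's default memory pool reports
next to the VmSize value, since Arrow allocations are tracked by
arrow::MemoryPool:
``` c++
#include <iostream>
#include "arrow/memory_pool.h"

// Sketch: report what Arrow's default memory pool currently holds. Printed
// next to VmSize, this would show whether the growth is accounted for by
// Arrow allocations or comes from somewhere else.
void print_arrow_pool_usage() {
  arrow::MemoryPool* pool = arrow::default_memory_pool();
  std::cout << "arrow pool: " << pool->bytes_allocated()
            << " bytes allocated, peak " << pool->max_memory() << " bytes";
}
```
If bytes_allocated() stays roughly flat while VmSize keeps growing, the memory
would be held outside the pool (e.g. in ordinary STL containers); if it grows in
the same steps, it is Arrow-managed buffers.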