[
https://issues.apache.org/jira/browse/PARQUET-1033?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
yugu updated PARQUET-1033:
--------------------------
Description:
ReadBatchSpaced reads in more lines than the actual data in the file when it contains nulls.
So I've been trying to write something like [bla.csv] with mixed nulls.
The problem is that, when I use WriteBatchSpaced to write and ReadBatchSpaced
to read back,
instead of getting the correct values, I'm getting fewer values than I initially
wrote and additional nulls in the middle. A brief example follows:
written
{code:c++}
-2147483648
-2147483648
30
40
50
60
70
80
90
-2147483648
-2147483648
{code}
actual read
{code:c++}
-2147483648
-2147483648
-2147483648
-2147483648
30
40
50
60
70
-2147483648
9
80
90
-2147483648
-2147483648
{code}
My code for reader
{code:c++}
int64_t rows_read = _c_reader->ReadBatchSpaced(arraysize,
definition_level.data(), repetition_level.data(), ivalues.data(),
valid_bits.data(), 0, &levels_read, &values_read, &null_count);
for (int tmp = 0; tmp < rows_read; tmp ++)
{
if (definition_level[tmp] < col_rep_type[__c])
{
ivalues[tmp] = NA_INTEGER;
}
//simply set value
if (fsize != 1 && filter[tmp + offset + cur_offset])
{
//rvec[__c].set(fcnt[__c],0,values[tmp]);
dff.set_value(fcnt[__c],0,__c,ivalues[tmp]);
fcnt[__c] ++;
}
else if (fsize == 1)
{
//rvec[__c].set(tmp,offset+cur_offset,values[tmp]);
dff.set_value(tmp,offset+cur_offset,__c,ivalues[tmp]);
}
}
{code}
my code for writer
{code:c++}
parquet::Int64Writer* int64_writer =
static_cast<parquet::Int64Writer*>(rg_writer->NextColumn());
IntegerVector tmpvec = df[__c];
for (int tmp = 0; tmp < rows_to_write; tmp++)
{
ivec[tmp] = tmpvec[tmp+offset];
if (tmpvec[tmp+offset] == NA_INTEGER)
{
def_level[tmp]=0;
}
}
int64_writer->WriteBatchSpaced(rows_to_write, def_level.data(),
rep_level.data(), valid_bits.data(), 0, ivec.data());
{code}
was:
The readbatchspaced reads in more lines than the actual data in file with nulls.
So I've been trying to write something like [bla.csv] with mixed nulls.
The problem is that, when I use writebatchspaced to write and readbatchspaced
to read back,
Instead of getting the correct values, I'm getting less values than I initially
wrote and additional nulls in the middle, a brief example as follows
written
{code:c++}
-2147483648
-2147483648
30
40
50
60
70
80
90
-2147483648
-2147483648
{code}
actual read
{code:c++}
-2147483648
-2147483648
-2147483648
-2147483648
30
40
50
60
70
-2147483648
9
80
90
-2147483648
-2147483648
{code}
My code for reader
{code:c++}
int64_t rows_read = _c_reader->ReadBatchSpaced(arraysize,
definition_level.data(), repetition_level.data(), ivalues.data(),
valid_bits.data(), 0, &levels_read, &values_read, &null_count);
for (int tmp = 0; tmp < rows_read; tmp ++)
{
if (definition_level[tmp] < col_rep_type[__c])
{
ivalues[tmp] = NA_INTEGER;
}
//simply set value
if (fsize != 1 && filter[tmp + offset + cur_offset])
{
//rvec[__c].set(fcnt[__c],0,values[tmp]);
dff.set_value(fcnt[__c],0,__c,ivalues[tmp]);
fcnt[__c] ++;
}
else if (fsize == 1)
{
//rvec[__c].set(tmp,offset+cur_offset,values[tmp]);
dff.set_value(tmp,offset+cur_offset,__c,ivalues[tmp]);
}
}
{code}
my code for writer
{code:c++}
parquet::Int64Writer* int64_writer =
static_cast<parquet::Int64Writer*>(rg_writer->NextColumn());
IntegerVector tmpvec = df[__c];
for (int tmp = 0; tmp < rows_to_write; tmp++)
{
ivec[tmp] = tmpvec[tmp+offset];
if (tmpvec[tmp+offset] == NA_INTEGER)
{
def_level[tmp]=0;
}
}
int64_writer->WriteBatchSpaced(rows_to_write, def_level.data(),
rep_level.data(), valid_bits.data(), 0, ivec.data());
{code}
> Mismatched Read and Write
> -------------------------
>
> Key: PARQUET-1033
> URL: https://issues.apache.org/jira/browse/PARQUET-1033
> Project: Parquet
> Issue Type: Bug
> Components: parquet-cpp
> Affects Versions: cpp-1.1.0
> Environment: Rstudio
> Reporter: yugu
> Attachments: bla.csv, wrong.csv
>
>
> ReadBatchSpaced reads in more lines than the actual data in the file when it
> contains nulls.
> So I've been trying to write something like [bla.csv] with mixed nulls.
> The problem is that, when I use WriteBatchSpaced to write and ReadBatchSpaced
> to read back,
> instead of getting the correct values, I'm getting fewer values than I
> initially wrote and additional nulls in the middle. A brief example follows:
> written
> {code:c++}
> -2147483648
> -2147483648
> 30
> 40
> 50
> 60
> 70
> 80
> 90
> -2147483648
> -2147483648
> {code}
> actual read
> {code:c++}
> -2147483648
> -2147483648
> -2147483648
> -2147483648
> 30
> 40
> 50
> 60
> 70
> -2147483648
> 9
> 80
> 90
> -2147483648
> -2147483648
> {code}
> My code for reader
> {code:c++}
> int64_t rows_read = _c_reader->ReadBatchSpaced(arraysize,
> definition_level.data(), repetition_level.data(), ivalues.data(),
> valid_bits.data(), 0, &levels_read, &values_read, &null_count);
> for (int tmp = 0; tmp < rows_read; tmp ++)
> {
> if (definition_level[tmp] < col_rep_type[__c])
> {
> ivalues[tmp] = NA_INTEGER;
> }
> //simply set value
> if (fsize != 1 && filter[tmp + offset + cur_offset])
> {
> //rvec[__c].set(fcnt[__c],0,values[tmp]);
> dff.set_value(fcnt[__c],0,__c,ivalues[tmp]);
> fcnt[__c] ++;
> }
> else if (fsize == 1)
> {
> //rvec[__c].set(tmp,offset+cur_offset,values[tmp]);
> dff.set_value(tmp,offset+cur_offset,__c,ivalues[tmp]);
> }
> }
> {code}
> my code for writer
> {code:c++}
> parquet::Int64Writer* int64_writer =
> static_cast<parquet::Int64Writer*>(rg_writer->NextColumn());
> IntegerVector tmpvec = df[__c];
> for (int tmp = 0; tmp < rows_to_write; tmp++)
> {
> ivec[tmp] = tmpvec[tmp+offset];
> if (tmpvec[tmp+offset] == NA_INTEGER)
> {
> def_level[tmp]=0;
> }
> }
> int64_writer->WriteBatchSpaced(rows_to_write, def_level.data(),
> rep_level.data(), valid_bits.data(), 0, ivec.data());
> {code}
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)