[ https://issues.apache.org/jira/browse/PARQUET-2242?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
xjlem updated PARQUET-2242: --------------------------- Description: org.apache.parquet.hadoop.InternalParquetRecordWriter#checkBlockSizeReached {code:java} private void checkBlockSizeReached() throws IOException { if (recordCount >= recordCountForNextMemCheck) { // checking the memory size is relatively expensive, so let's not do it for every record. long memSize = columnStore.getBufferedSize(); long recordSize = memSize / recordCount; // flush the row group if it is within ~2 records of the limit // it is much better to be slightly under size than to be over at all if (memSize > (nextRowGroupSize - 2 * recordSize)) { LOG.info("mem size {} > {}: flushing {} records to disk.", memSize, nextRowGroupSize, recordCount); flushRowGroupToStore(); initStore(); recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK); this.lastRowGroupEndPos = parquetFileWriter.getPos(); } else { recordCountForNextMemCheck = min( max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long)(nextRowGroupSize / ((float)recordSize))) / 2), // will check halfway recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK // will not look more than max records ahead ); LOG.debug("Checked mem at {} will check again at: {}", recordCount, recordCountForNextMemCheck); } } } {code} In this code, if the block size is small (for example, 8M) and the record size of the first 100 lines is small while the record size after the first 100 lines is big, it will cause a big row group; in our real-world scenario, the row group grew to more than 64M. > record count for row group size check configurable > --------------------------------------------------- > > Key: PARQUET-2242 > URL: https://issues.apache.org/jira/browse/PARQUET-2242 > Project: Parquet > Issue Type: Improvement > Components: parquet-mr > Affects Versions: 1.10.1 > Reporter: xjlem > Priority: Major > > org.apache.parquet.hadoop.InternalParquetRecordWriter#checkBlockSizeReached > {code:java} > private void checkBlockSizeReached() throws IOException { > 
if (recordCount >= recordCountForNextMemCheck) { // checking the memory > size is relatively expensive, so let's not do it for every record. > long memSize = columnStore.getBufferedSize(); > long recordSize = memSize / recordCount; > // flush the row group if it is within ~2 records of the limit > // it is much better to be slightly under size than to be over at all > if (memSize > (nextRowGroupSize - 2 * recordSize)) { > LOG.info("mem size {} > {}: flushing {} records to disk.", memSize, > nextRowGroupSize, recordCount); > flushRowGroupToStore(); > initStore(); > recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, > recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK); > this.lastRowGroupEndPos = parquetFileWriter.getPos(); > } else { > recordCountForNextMemCheck = min( > max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + > (long)(nextRowGroupSize / ((float)recordSize))) / 2), // will check halfway > recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK // will not look > more than max records ahead > ); > LOG.debug("Checked mem at {} will check again at: {}", recordCount, > recordCountForNextMemCheck); > } > } > } {code} > In this code, if the block size is small (for example, 8M) and the record > size of the first 100 lines is small while the record size after the first > 100 lines is big, it will cause a big row group; in our real-world > scenario, the row group grew to more than 64M. -- This message was sent by Atlassian Jira (v8.20.10#820010)