From 29a0581f5bfcad86a6493854f8be8fcb6ffe2fbc Mon Sep 17 00:00:00 2001 From: Matthew McNew Date: Tue, 20 Feb 2024 19:59:57 -0600 Subject: [PATCH] GH-39870: [Go] Include buffered pages in TotalBytesWritten (#40105) ### Rationale for this change Currently, buffered data pages are not included in TotalBytesWritten; this means that there is not an accurate estimate of the current size. ### Are there any user-facing changes? `RowGroupTotalBytesWritten` will include the TotalBytes in buffered DataPages minus the buffered data page headers. * Closes: #39870 Authored-by: Matthew McNew Signed-off-by: Matt Topol --- go/parquet/file/column_writer.go | 7 ++++++- go/parquet/file/column_writer_test.go | 14 ++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/go/parquet/file/column_writer.go b/go/parquet/file/column_writer.go index 4d603c547ca6a..91f5d18942958 100755 --- a/go/parquet/file/column_writer.go +++ b/go/parquet/file/column_writer.go @@ -198,7 +198,12 @@ func (w *columnWriter) TotalCompressedBytes() int64 { } func (w *columnWriter) TotalBytesWritten() int64 { - return w.totalBytesWritten + bufferedPagesBytes := int64(0) + for _, p := range w.pages { + bufferedPagesBytes += int64(len(p.Data())) + } + + return w.totalBytesWritten + bufferedPagesBytes } func (w *columnWriter) RowsWritten() int { diff --git a/go/parquet/file/column_writer_test.go b/go/parquet/file/column_writer_test.go index dd597e280b850..d78e1c6761be0 100755 --- a/go/parquet/file/column_writer_test.go +++ b/go/parquet/file/column_writer_test.go @@ -430,6 +430,11 @@ func (p *PrimitiveWriterTestSuite) testDictionaryFallbackEncoding(version parque } func (p *PrimitiveWriterTestSuite) testDictionaryFallbackAndCompressedSize(version parquet.Version) { + // skip boolean as dictionary encoding is not used + if p.Typ.Kind() == reflect.Bool { + return + } + p.GenerateData(SmallSize) props := parquet.DefaultColumnProperties() props.DictionaryEnabled = true @@ -440,13 +445,14 @@ 
func (p *PrimitiveWriterTestSuite) testDictionaryFallbackAndCompressedSize(versi props.Encoding = parquet.Encodings.RLEDict } - writer := p.buildWriter(SmallSize, props, parquet.WithVersion(version)) + writer := p.buildWriter(SmallSize, props, parquet.WithVersion(version), parquet.WithDataPageSize(SmallSize-1)) p.WriteBatchValues(writer, nil, nil) + p.NotZero(writer.TotalBytesWritten()) writer.FallbackToPlain() - p.NotEqual(0, writer.TotalCompressedBytes()) + p.NotZero(writer.TotalCompressedBytes()) writer.Close() - p.NotEqual(0, writer.TotalCompressedBytes()) - p.NotEqual(0, writer.TotalBytesWritten()) + p.NotZero(writer.TotalCompressedBytes()) + p.NotZero(writer.TotalBytesWritten()) } func (p *PrimitiveWriterTestSuite) TestRequiredPlain() {