From f3ae4d13b60ddcab9208ab7999e2b2cd44dd9807 Mon Sep 17 00:00:00 2001 From: jangalasriramd7 Date: Thu, 30 Jan 2025 02:55:35 +0530 Subject: [PATCH 1/4] Fix SizeBasedDataRewriter constructors and related logic --- .../apache/iceberg/actions/SizeBasedDataRewriter.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java b/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java index 61b90d9fc6e3..4cd0217a020d 100644 --- a/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java +++ b/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java @@ -47,12 +47,19 @@ public abstract class SizeBasedDataRewriter extends SizeBasedFileRewriter Date: Thu, 30 Jan 2025 18:19:55 +0530 Subject: [PATCH 2/4] Modified the constructor logic by using options --- .../iceberg/actions/SizeBasedDataRewriter.java | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java b/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java index 4cd0217a020d..7e74886597de 100644 --- a/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java +++ b/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java @@ -47,19 +47,13 @@ public abstract class SizeBasedDataRewriter extends SizeBasedFileRewriter validOptions() { public void init(Map options) { super.init(options); this.deleteFileThreshold = deleteFileThreshold(options); + this.deleteRatioThreshold = PropertyUtil.propertyAsDouble( + options, "delete-ratio-threshold", DEFAULT_DELETE_THRESHOLD); } @Override From f0d3c212919cfbed824fd662c2aab58158606e91 Mon Sep 17 00:00:00 2001 From: jangalasriramd7 Date: Thu, 30 Jan 2025 18:34:54 +0530 Subject: [PATCH 3/4] Added the testcases --- .../actions/TestSizeBasedRewriter.java | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java b/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java index 77d16d3bc821..cabe9d47ab08 100644 --- a/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java +++ b/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java @@ -76,6 +76,27 @@ public void testSplitSizeLowerBound() { assertThat(splitSize).isLessThan(maxFileSize); } + @TestTemplate + public void testDeleteFileThresholdOption() { + SizeBasedDataFileRewriterImpl rewriter = new SizeBasedDataFileRewriterImpl(table); + + Map options = ImmutableMap.of( + SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "5" + ); + rewriter.init(options); + + assertThat(rewriter.getDeleteFileThreshold()).isEqualTo(5); + } + + @TestTemplate + public void testHighDeleteRatioTriggersRewrite() { + SizeBasedDataFileRewriterImpl rewriter = new SizeBasedDataFileRewriterImpl(table); + + FileScanTask task = new MockFileScanTask(100L * 1024 * 1024, 80); // 80% delete ratio + + assertThat(rewriter.tooHighDeleteRatio(task)).isTrue(); + } + private static class SizeBasedDataFileRewriterImpl extends SizeBasedDataRewriter { SizeBasedDataFileRewriterImpl(Table table) { From 7e036ba436457aa28b30886ef721aca0b4ed8e6f Mon Sep 17 00:00:00 2001 From: jangalasriramd7 Date: Sat, 1 Feb 2025 15:38:38 +0530 Subject: [PATCH 4/4] Modified as per the comments --- .../apache/iceberg/actions/SizeBasedDataRewriter.java | 7 ++++--- .../apache/iceberg/actions/TestSizeBasedRewriter.java | 9 ++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java b/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java index 7e74886597de..295f6fc46d01 100644 --- a/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java +++ b/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java @@ -50,9 +50,10 @@ public abstract class SizeBasedDataRewriter extends SizeBasedFileRewriter options) { super.init(options); this.deleteFileThreshold = deleteFileThreshold(options); this.deleteRatioThreshold = PropertyUtil.propertyAsDouble( - options, "delete-ratio-threshold", DEFAULT_DELETE_THRESHOLD); + options, DELETE_FILE_THRESHOLD, DEFAULT_DELETE_THRESHOLD); } @Override @@ -119,7 +120,7 @@ private boolean tooHighDeleteRatio(FileScanTask task) { double deletedRecords = (double) Math.min(knownDeletedRecordCount, task.file().recordCount()); double deleteRatio = deletedRecords / task.file().recordCount(); - return deleteRatio >= DELETE_RATIO_THRESHOLD; + return deleteRatio >= this.deleteRatioThreshold; } @Override diff --git a/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java b/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java index cabe9d47ab08..9a2ee69dee45 100644 --- a/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java +++ b/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java @@ -81,7 +81,7 @@ public void testDeleteFileThresholdOption() { SizeBasedDataFileRewriterImpl rewriter = new SizeBasedDataFileRewriterImpl(table); Map options = ImmutableMap.of( - SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "5" + SizeBasedDataRewriter.DEFAULT_DELETE_THRESHOLD, "5" ); rewriter.init(options); @@ -97,6 +97,13 @@ public void testHighDeleteRatioTriggersRewrite() { assertThat(rewriter.tooHighDeleteRatio(task)).isTrue(); } + @Test + private void validateThreshold(double threshold) { + if (threshold <= 0.0 || threshold > 1.0) { + throw new IllegalArgumentException("Threshold must be greater than 0.0 and less than or equal to 1.0"); + } + } + private static class SizeBasedDataFileRewriterImpl extends SizeBasedDataRewriter { SizeBasedDataFileRewriterImpl(Table table) {