Skip to content

Commit 509b95a

Browse files
committed
review comments #2
1 parent e4d2878 commit 509b95a

File tree

4 files changed

+50
-49
lines changed

4 files changed

+50
-49
lines changed

iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/udf/GenericUDFIcebergBucket.java

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator;
3131
import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider;
3232
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
33+
import org.apache.hadoop.hive.ql.util.JavaDataModel;
3334
import org.apache.hadoop.hive.serde2.io.DateWritableV2;
3435
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
3536
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -217,28 +218,30 @@ public String getDisplayString(String[] children) {
217218

218219
@Override
219220
public StatEstimator getStatEstimator() {
220-
return new BucketStatEstimator();
221+
return new BucketStatEstimator(numBuckets);
221222
}
222223

223-
private static class BucketStatEstimator implements StatEstimator {
224+
static class BucketStatEstimator implements StatEstimator {
225+
private final int numBuckets;
226+
227+
BucketStatEstimator(int numBuckets) {
228+
this.numBuckets = numBuckets;
229+
}
230+
224231
@Override
225232
public Optional<ColStatistics> estimate(List<ColStatistics> argStats) {
226-
if (argStats.size() != 2) {
227-
return Optional.empty();
228-
}
229-
ColStatistics inputStats = argStats.get(0);
230-
ColStatistics bucketCountStats = argStats.get(1);
231-
ColStatistics.Range bucketRange = bucketCountStats.getRange();
232-
if (bucketRange == null || bucketRange.minValue == null) {
233+
if (argStats.isEmpty() || numBuckets <= 0) {
233234
return Optional.empty();
234235
}
235-
long numBuckets = bucketRange.minValue.longValue();
236-
if (numBuckets <= 0) {
237-
return Optional.empty();
238-
}
239-
ColStatistics result = inputStats.clone();
236+
ColStatistics inputStats = argStats.getFirst();
237+
238+
ColStatistics result = new ColStatistics();
240239
result.setCountDistint(Math.min(inputStats.getCountDistint(), numBuckets));
240+
result.setNumNulls(inputStats.getNumNulls());
241+
result.setAvgColLen(JavaDataModel.get().primitive1());
241242
result.setRange(0, numBuckets - 1);
243+
result.setIsEstimated(true);
244+
242245
return Optional.of(result);
243246
}
244247
}

iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/udf/TestGenericUDFIcebergBucketStatEstimator.java

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,13 +61,11 @@ public void testZeroBucketsReturnsEmpty() {
6161
Assert.assertFalse(result.isPresent());
6262
}
6363

64-
private Optional<ColStatistics> estimateBucket(long sourceNdv, long numBuckets) {
64+
private static Optional<ColStatistics> estimateBucket(long sourceNdv, int numBuckets) {
6565
ColStatistics sourceStats = new ColStatistics("col", "int");
6666
sourceStats.setCountDistint(sourceNdv);
67-
ColStatistics numBucketsStats = new ColStatistics("numBuckets", "int");
68-
numBucketsStats.setRange(numBuckets, numBuckets);
6967

70-
StatEstimator estimator = new GenericUDFIcebergBucket().getStatEstimator();
71-
return estimator.estimate(Arrays.asList(sourceStats, numBucketsStats));
68+
StatEstimator estimator = new GenericUDFIcebergBucket.BucketStatEstimator(numBuckets);
69+
return estimator.estimate(Arrays.asList(sourceStats));
7270
}
7371
}

iceberg/iceberg-handler/src/test/results/positive/dynamic_partition_writes.q.out

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ Stage-3
240240
File Output Operator [FS_17]
241241
Select Operator [SEL_16] (rows=3 width=574)
242242
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12"]
243-
Group By Operator [GBY_15] (rows=3 width=334)
243+
Group By Operator [GBY_15] (rows=3 width=336)
244244
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["min(VALUE._col0)","max(VALUE._col1)","count(VALUE._col2)","count(VALUE._col3)","compute_bit_vector_hll(VALUE._col4)","max(VALUE._col5)","avg(VALUE._col6)","count(VALUE._col7)","compute_bit_vector_hll(VALUE._col8)"],keys:KEY._col0
245245
<-Map 1 [SIMPLE_EDGE] vectorized
246246
File Output Operator [FS_11]
@@ -251,7 +251,7 @@ Stage-3
251251
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b"]
252252
SHUFFLE [RS_14]
253253
PartitionCols:_col0
254-
Group By Operator [GBY_13] (rows=4 width=402)
254+
Group By Operator [GBY_13] (rows=4 width=404)
255255
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)"],keys:iceberg_bucket(ccy, 3)
256256
Select Operator [SEL_12] (rows=22 width=87)
257257
Output:["a","ccy"]
@@ -341,12 +341,12 @@ Stage-3
341341
File Output Operator [FS_21]
342342
Select Operator [SEL_20] (rows=11 width=1030)
343343
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12","_col13","_col14","_col15","_col16","_col17","_col18"]
344-
Group By Operator [GBY_19] (rows=11 width=595)
344+
Group By Operator [GBY_19] (rows=11 width=591)
345345
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12","_col13","_col14"],aggregations:["min(VALUE._col0)","max(VALUE._col1)","count(VALUE._col2)","count(VALUE._col3)","compute_bit_vector_hll(VALUE._col4)","max(VALUE._col5)","avg(VALUE._col6)","count(VALUE._col7)","compute_bit_vector_hll(VALUE._col8)","min(VALUE._col9)","max(VALUE._col10)","count(VALUE._col11)","compute_bit_vector_hll(VALUE._col12)"],keys:KEY._col0, KEY._col1
346346
<-Map 1 [SIMPLE_EDGE] vectorized
347347
SHUFFLE [RS_16]
348348
PartitionCols:_col0, _col1
349-
Group By Operator [GBY_15] (rows=11 width=663)
349+
Group By Operator [GBY_15] (rows=11 width=659)
350350
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12","_col13","_col14"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)","min(c)","max(c)","count(c)","compute_bit_vector_hll(c)"],keys:ccy, iceberg_bucket(c, 3)
351351
Select Operator [SEL_14] (rows=22 width=94)
352352
Output:["a","ccy","c"]
@@ -464,7 +464,7 @@ Stage-3
464464
File Output Operator [FS_20]
465465
Select Operator [SEL_19] (rows=3 width=1030)
466466
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12","_col13","_col14","_col15","_col16","_col17","_col18"]
467-
Group By Operator [GBY_18] (rows=3 width=595)
467+
Group By Operator [GBY_18] (rows=3 width=591)
468468
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12","_col13","_col14"],aggregations:["min(VALUE._col0)","max(VALUE._col1)","count(VALUE._col2)","count(VALUE._col3)","compute_bit_vector_hll(VALUE._col4)","max(VALUE._col5)","avg(VALUE._col6)","count(VALUE._col7)","compute_bit_vector_hll(VALUE._col8)","min(VALUE._col9)","max(VALUE._col10)","count(VALUE._col11)","compute_bit_vector_hll(VALUE._col12)"],keys:KEY._col0, KEY._col1
469469
<-Map 1 [SIMPLE_EDGE] vectorized
470470
File Output Operator [FS_14]
@@ -477,7 +477,7 @@ Stage-3
477477
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
478478
SHUFFLE [RS_17]
479479
PartitionCols:_col0, _col1
480-
Group By Operator [GBY_16] (rows=3 width=663)
480+
Group By Operator [GBY_16] (rows=3 width=659)
481481
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12","_col13","_col14"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)","min(c)","max(c)","count(c)","compute_bit_vector_hll(c)"],keys:ccy, iceberg_bucket(c, 3)
482482
Select Operator [SEL_15] (rows=4 width=99)
483483
Output:["a","ccy","c"]
@@ -516,7 +516,7 @@ Stage-3
516516
File Output Operator [FS_20]
517517
Select Operator [SEL_19] (rows=1 width=1030)
518518
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12","_col13","_col14","_col15","_col16","_col17","_col18"]
519-
Group By Operator [GBY_18] (rows=1 width=595)
519+
Group By Operator [GBY_18] (rows=1 width=591)
520520
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12","_col13","_col14"],aggregations:["min(VALUE._col0)","max(VALUE._col1)","count(VALUE._col2)","count(VALUE._col3)","compute_bit_vector_hll(VALUE._col4)","max(VALUE._col5)","avg(VALUE._col6)","count(VALUE._col7)","compute_bit_vector_hll(VALUE._col8)","min(VALUE._col9)","max(VALUE._col10)","count(VALUE._col11)","compute_bit_vector_hll(VALUE._col12)"],keys:KEY._col0, KEY._col1
521521
<-Map 1 [SIMPLE_EDGE] vectorized
522522
File Output Operator [FS_14]
@@ -529,7 +529,7 @@ Stage-3
529529
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
530530
SHUFFLE [RS_17]
531531
PartitionCols:_col0, _col1
532-
Group By Operator [GBY_16] (rows=1 width=663)
532+
Group By Operator [GBY_16] (rows=1 width=659)
533533
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12","_col13","_col14"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)","min(c)","max(c)","count(c)","compute_bit_vector_hll(c)"],keys:ccy, iceberg_bucket(c, 3)
534534
Select Operator [SEL_15] (rows=1 width=99)
535535
Output:["a","ccy","c"]
@@ -1706,7 +1706,7 @@ Stage-3
17061706
File Output Operator [FS_17]
17071707
Select Operator [SEL_16] (rows=3 width=574)
17081708
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12"]
1709-
Group By Operator [GBY_15] (rows=3 width=334)
1709+
Group By Operator [GBY_15] (rows=3 width=336)
17101710
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["min(VALUE._col0)","max(VALUE._col1)","count(VALUE._col2)","count(VALUE._col3)","compute_bit_vector_hll(VALUE._col4)","max(VALUE._col5)","avg(VALUE._col6)","count(VALUE._col7)","compute_bit_vector_hll(VALUE._col8)"],keys:KEY._col0
17111711
<-Map 1 [SIMPLE_EDGE] vectorized
17121712
File Output Operator [FS_11]
@@ -1717,7 +1717,7 @@ Stage-3
17171717
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b"]
17181718
SHUFFLE [RS_14]
17191719
PartitionCols:_col0
1720-
Group By Operator [GBY_13] (rows=4 width=402)
1720+
Group By Operator [GBY_13] (rows=4 width=404)
17211721
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)"],keys:iceberg_bucket(ccy, 3)
17221722
Select Operator [SEL_12] (rows=22 width=87)
17231723
Output:["a","ccy"]
@@ -1810,12 +1810,12 @@ Stage-3
18101810
File Output Operator [FS_21]
18111811
Select Operator [SEL_20] (rows=3 width=574)
18121812
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12"]
1813-
Group By Operator [GBY_19] (rows=3 width=334)
1813+
Group By Operator [GBY_19] (rows=3 width=336)
18141814
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["min(VALUE._col0)","max(VALUE._col1)","count(VALUE._col2)","count(VALUE._col3)","compute_bit_vector_hll(VALUE._col4)","max(VALUE._col5)","avg(VALUE._col6)","count(VALUE._col7)","compute_bit_vector_hll(VALUE._col8)"],keys:KEY._col0
18151815
<-Map 1 [SIMPLE_EDGE] vectorized
18161816
SHUFFLE [RS_16]
18171817
PartitionCols:_col0
1818-
Group By Operator [GBY_15] (rows=4 width=402)
1818+
Group By Operator [GBY_15] (rows=4 width=404)
18191819
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)"],keys:iceberg_bucket(ccy, 3)
18201820
Select Operator [SEL_14] (rows=22 width=87)
18211821
Output:["a","ccy"]
@@ -1908,12 +1908,12 @@ Stage-3
19081908
File Output Operator [FS_21]
19091909
Select Operator [SEL_20] (rows=3 width=574)
19101910
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12"]
1911-
Group By Operator [GBY_19] (rows=3 width=334)
1911+
Group By Operator [GBY_19] (rows=3 width=336)
19121912
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["min(VALUE._col0)","max(VALUE._col1)","count(VALUE._col2)","count(VALUE._col3)","compute_bit_vector_hll(VALUE._col4)","max(VALUE._col5)","avg(VALUE._col6)","count(VALUE._col7)","compute_bit_vector_hll(VALUE._col8)"],keys:KEY._col0
19131913
<-Map 1 [SIMPLE_EDGE] vectorized
19141914
SHUFFLE [RS_16]
19151915
PartitionCols:_col0
1916-
Group By Operator [GBY_15] (rows=4 width=402)
1916+
Group By Operator [GBY_15] (rows=4 width=404)
19171917
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)"],keys:iceberg_bucket(ccy, 3)
19181918
Select Operator [SEL_14] (rows=22 width=87)
19191919
Output:["a","ccy"]
@@ -1986,7 +1986,7 @@ Stage-3
19861986
File Output Operator [FS_17]
19871987
Select Operator [SEL_16] (rows=3 width=574)
19881988
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12"]
1989-
Group By Operator [GBY_15] (rows=3 width=334)
1989+
Group By Operator [GBY_15] (rows=3 width=336)
19901990
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["min(VALUE._col0)","max(VALUE._col1)","count(VALUE._col2)","count(VALUE._col3)","compute_bit_vector_hll(VALUE._col4)","max(VALUE._col5)","avg(VALUE._col6)","count(VALUE._col7)","compute_bit_vector_hll(VALUE._col8)"],keys:KEY._col0
19911991
<-Map 1 [SIMPLE_EDGE] vectorized
19921992
File Output Operator [FS_11]
@@ -1997,7 +1997,7 @@ Stage-3
19971997
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b"]
19981998
SHUFFLE [RS_14]
19991999
PartitionCols:_col0
2000-
Group By Operator [GBY_13] (rows=4 width=402)
2000+
Group By Operator [GBY_13] (rows=4 width=404)
20012001
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)"],keys:iceberg_bucket(ccy, 3)
20022002
Select Operator [SEL_12] (rows=22 width=87)
20032003
Output:["a","ccy"]
@@ -2057,12 +2057,12 @@ Stage-3
20572057
File Output Operator [FS_21]
20582058
Select Operator [SEL_20] (rows=2 width=574)
20592059
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12"]
2060-
Group By Operator [GBY_19] (rows=2 width=334)
2060+
Group By Operator [GBY_19] (rows=2 width=336)
20612061
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["min(VALUE._col0)","max(VALUE._col1)","count(VALUE._col2)","count(VALUE._col3)","compute_bit_vector_hll(VALUE._col4)","max(VALUE._col5)","avg(VALUE._col6)","count(VALUE._col7)","compute_bit_vector_hll(VALUE._col8)"],keys:KEY._col0
20622062
<-Map 1 [SIMPLE_EDGE] vectorized
20632063
SHUFFLE [RS_16]
20642064
PartitionCols:_col0
2065-
Group By Operator [GBY_15] (rows=3 width=402)
2065+
Group By Operator [GBY_15] (rows=3 width=404)
20662066
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)"],keys:iceberg_bucket(ccy, 2)
20672067
Select Operator [SEL_14] (rows=22 width=87)
20682068
Output:["a","ccy"]

0 commit comments

Comments
 (0)