Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion velox/connectors/hive/iceberg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ set(
IcebergColumnHandle.cpp
IcebergConfig.cpp
IcebergConnector.cpp
IcebergDataFileStatistics.cpp
IcebergDataSink.cpp
IcebergDataSource.cpp
IcebergPartitionName.cpp
Expand All @@ -31,6 +32,10 @@ set(
WriterOptionsAdapter.cpp
)

if(VELOX_ENABLE_PARQUET)
list(APPEND ICEBERG_SOURCES IcebergParquetStatsCollector.cpp)
endif()

velox_add_library(
velox_hive_iceberg_splitreader
${ICEBERG_SOURCES}
Expand All @@ -41,10 +46,12 @@ velox_add_library(
IcebergColumnHandle.h
IcebergConfig.h
IcebergConnector.h
IcebergDataFileStatistics.h
IcebergDataSink.h
IcebergDataSource.h
IcebergDeleteFile.h
IcebergMetadataColumns.h
IcebergParquetStatsCollector.h
IcebergPartitionName.h
IcebergSplit.h
IcebergSplitReader.h
Expand All @@ -58,13 +65,14 @@ velox_add_library(
velox_link_libraries(
velox_hive_iceberg_splitreader
velox_connector
velox_dwio_parquet_field_id
velox_functions_iceberg
velox_dwio_dwrf_writer
Folly::folly
)

if(VELOX_ENABLE_PARQUET)
velox_link_libraries(velox_hive_iceberg_splitreader velox_dwio_parquet_field_id)
velox_link_libraries(velox_hive_iceberg_splitreader velox_dwio_arrow_parquet_writer)
endif()

if(${VELOX_BUILD_TESTING})
Expand Down
58 changes: 58 additions & 0 deletions velox/connectors/hive/iceberg/IcebergDataFileStatistics.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "velox/connectors/hive/iceberg/IcebergDataFileStatistics.h"

namespace facebook::velox::connector::hive::iceberg {

folly::dynamic IcebergDataFileStatistics::toJson() const {
folly::dynamic json = folly::dynamic::object;
json["recordCount"] = numRecords;

folly::dynamic columnSizes = folly::dynamic::object;
folly::dynamic valueCounts = folly::dynamic::object;
folly::dynamic nullValueCounts = folly::dynamic::object;
folly::dynamic nanValueCounts = folly::dynamic::object;
folly::dynamic lowerBounds = folly::dynamic::object;
folly::dynamic upperBounds = folly::dynamic::object;

for (const auto& [fieldId, stats] : columnStats) {
auto fieldIdStr = folly::to<std::string>(fieldId);
columnSizes[fieldIdStr] = stats.columnSize;
valueCounts[fieldIdStr] = stats.valueCount;
nullValueCounts[fieldIdStr] = stats.nullValueCount;
if (stats.nanValueCount.has_value()) {
nanValueCounts[fieldIdStr] = stats.nanValueCount.value();
}
if (stats.lowerBound.has_value()) {
lowerBounds[fieldIdStr] = stats.lowerBound.value();
}
if (stats.upperBound.has_value()) {
upperBounds[fieldIdStr] = stats.upperBound.value();
}
}

json["columnSizes"] = std::move(columnSizes);
json["valueCounts"] = std::move(valueCounts);
json["nullValueCounts"] = std::move(nullValueCounts);
json["nanValueCounts"] = std::move(nanValueCounts);
json["lowerBounds"] = std::move(lowerBounds);
json["upperBounds"] = std::move(upperBounds);

return json;
}

} // namespace facebook::velox::connector::hive::iceberg
71 changes: 71 additions & 0 deletions velox/connectors/hive/iceberg/IcebergDataFileStatistics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cstdint>
#include <optional>
#include <string>

#include <folly/container/F14Map.h>
#include <folly/json/dynamic.h>

namespace facebook::velox::connector::hive::iceberg {

/// Statistics for an Iceberg data file, corresponding to the `data_file`
/// structure defined in the Iceberg specification:
/// https://iceberg.apache.org/spec/#data-file-fields.
///
/// All column-level statistics maps are keyed by Iceberg field IDs (`int32_t`),
/// which uniquely identify columns in the Iceberg schema independent of column
/// names or physical column positions.
struct IcebergDataFileStatistics {
struct ColumnStats {
int64_t columnSize{0};

/// Total number of values for this field ID in the file, including null and
/// NaN values.
///
/// For primitive (flat) columns, this is equal to the number of rows in the
/// file: numRows = valueCount = (nonNullValues + numNulls + numNaNs).
///
/// For nested columns (e.g. elements inside an array), this represents the
/// total occurrences of the field across all rows, which is not necessarily
/// related to the top-level record count.
int64_t valueCount{0};
int64_t nullValueCount{0};
std::optional<int64_t> nanValueCount;
/// Base64 encoded lower bound.
std::optional<std::string> lowerBound;
/// Base64 encoded upper bound.
std::optional<std::string> upperBound;
};

int64_t numRecords{0};
folly::F14FastMap<int32_t, ColumnStats> columnStats;

/// Returns a IcebergDataFileStatistics with all values set to zero/empty.
/// Useful for empty data files that have no actual data.
static IcebergDataFileStatistics empty() {
return IcebergDataFileStatistics{.numRecords = 0, .columnStats = {}};
}

folly::dynamic toJson() const;
};

using IcebergDataFileStatisticsPtr = std::shared_ptr<IcebergDataFileStatistics>;

} // namespace facebook::velox::connector::hive::iceberg
Loading
Loading