diff --git a/.baseline/checkstyle/checkstyle.xml b/.baseline/checkstyle/checkstyle.xml index 943d299b338f..f94848450a9b 100644 --- a/.baseline/checkstyle/checkstyle.xml +++ b/.baseline/checkstyle/checkstyle.xml @@ -450,6 +450,11 @@ + + + + + diff --git a/.github/workflows/api-binary-compatibility.yml b/.github/workflows/api-binary-compatibility.yml index da04904fb769..58a04c9427e0 100644 --- a/.github/workflows/api-binary-compatibility.yml +++ b/.github/workflows/api-binary-compatibility.yml @@ -46,7 +46,7 @@ jobs: revapi: runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: # fetch-depth of zero ensures that the tags are pulled in and we're not in a detached HEAD state # revapi depends on the tags, specifically the tag from git describe, to find the relevant override @@ -55,15 +55,15 @@ jobs: # See https://github.com/actions/checkout/issues/124 fetch-depth: 0 persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: 17 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: | echo "Using the old version tag, as per git describe, of $(git describe)"; - run: ./gradlew revapi --rerun-tasks - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() with: name: test logs diff --git a/.github/workflows/asf-allowlist-check.yml b/.github/workflows/asf-allowlist-check.yml index d4e84c5922c8..8d7952a9d29b 100644 --- 
a/.github/workflows/asf-allowlist-check.yml +++ b/.github/workflows/asf-allowlist-check.yml @@ -40,8 +40,7 @@ jobs: asf-allowlist-check: runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - # Intentionally unpinned to always use the latest allowlist from the ASF. - - uses: apache/infrastructure-actions/allowlist-check@main # zizmor: ignore[unpinned-uses] + - uses: apache/infrastructure-actions/allowlist-check@4e9c961f587f72b170874b6f5cd4ac15f7f26eb8 # main diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 81bc6b16f82e..98685f3ced21 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -41,16 +41,16 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - name: Initialize CodeQL - uses: github/codeql-action/init@c10b8064de6f491fea524254123dbe5e09572f13 # v4 + uses: github/codeql-action/init@e46ed2cbd01164d986452f91f178727624ae40d7 # v4.35.3 with: languages: actions - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@c10b8064de6f491fea524254123dbe5e09572f13 # v4 + uses: github/codeql-action/analyze@e46ed2cbd01164d986452f91f178727624ae40d7 # v4.35.3 with: category: "/language:actions" diff --git a/.github/workflows/delta-conversion-ci.yml b/.github/workflows/delta-conversion-ci.yml index 4c576e88a896..a1fb7fea9fdf 100644 --- a/.github/workflows/delta-conversion-ci.yml +++ b/.github/workflows/delta-conversion-ci.yml @@ -80,17 +80,17 @@ jobs: env: SPARK_LOCAL_IP: localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: 
actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew -DsparkVersions=3.5 -DscalaVersion=2.12 -DkafkaVersions= -DflinkVersions= :iceberg-delta-lake:check -Pquick=true -x javadoc - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() with: name: test logs @@ -106,17 +106,17 @@ jobs: env: SPARK_LOCAL_IP: localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew -DsparkVersions=3.5 -DscalaVersion=2.13 -DkafkaVersions= -DflinkVersions= :iceberg-delta-lake:check -Pquick=true -x javadoc - - uses: 
actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() with: name: test logs diff --git a/.github/workflows/docs-ci.yml b/.github/workflows/docs-ci.yml index ff6c6bdbd8cf..2bcda0bbc090 100644 --- a/.github/workflows/docs-ci.yml +++ b/.github/workflows/docs-ci.yml @@ -36,10 +36,10 @@ jobs: matrix: os: [ubuntu-latest, macos-latest] steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: 3.x - name: Build Iceberg documentation diff --git a/.github/workflows/flink-ci.yml b/.github/workflows/flink-ci.yml index 8f49b1c6242f..a515a71fa3be 100644 --- a/.github/workflows/flink-ci.yml +++ b/.github/workflows/flink-ci.yml @@ -84,17 +84,17 @@ jobs: env: SPARK_LOCAL_IP: localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew -DsparkVersions= -DkafkaVersions= -DflinkVersions=${{ 
matrix.flink }} :iceberg-flink:iceberg-flink-${{ matrix.flink }}:check :iceberg-flink:iceberg-flink-runtime-${{ matrix.flink }}:check -Pquick=true -x javadoc -DtestParallelism=auto - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() with: name: test logs diff --git a/.github/workflows/hive-ci.yml b/.github/workflows/hive-ci.yml index 8effb1d9e9a0..4853508b854b 100644 --- a/.github/workflows/hive-ci.yml +++ b/.github/workflows/hive-ci.yml @@ -81,17 +81,17 @@ jobs: env: SPARK_LOCAL_IP: localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew -DsparkVersions= -DflinkVersions= -DkafkaVersions= -Pquick=true :iceberg-mr:check -x javadoc - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() with: name: test logs diff --git a/.github/workflows/java-ci.yml b/.github/workflows/java-ci.yml index 4ef0a30b8225..670fd78fb64e 100644 --- a/.github/workflows/java-ci.yml +++ b/.github/workflows/java-ci.yml @@ -76,17 +76,17 @@ jobs: env: SPARK_LOCAL_IP: localhost 
steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew check -DsparkVersions= -DflinkVersions= -DkafkaVersions= -Pquick=true -x javadoc - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() with: name: test logs @@ -100,14 +100,14 @@ jobs: matrix: jvm: [17, 21] steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: ./gradlew -DallModules build -x test -x javadoc -x integrationTest build-javadoc: @@ -117,12 +117,25 @@ jobs: matrix: jvm: [17, 21] steps: - - uses: 
actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: ./gradlew -Pquick=true javadoc + + check-runtime-deps: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 + with: + distribution: zulu + java-version: 17 + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 + - run: ./gradlew checkAllRuntimeDeps -q -DallModules=true diff --git a/.github/workflows/jmh-benchmarks.yml b/.github/workflows/jmh-benchmarks.yml index 354bb1e106f4..e2c9522a757c 100644 --- a/.github/workflows/jmh-benchmarks.yml +++ b/.github/workflows/jmh-benchmarks.yml @@ -49,7 +49,7 @@ jobs: matrix: ${{ steps.set-matrix.outputs.matrix }} foundlabel: ${{ steps.set-matrix.outputs.foundlabel }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: repository: ${{ github.event.inputs.repo }} ref: ${{ github.event.inputs.ref }} @@ -94,16 +94,16 @@ jobs: env: SPARK_LOCAL_IP: localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: repository: ${{ github.event.inputs.repo }} ref: ${{ 
github.event.inputs.ref }} persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: 17 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - name: Run Benchmark @@ -113,7 +113,7 @@ jobs: BENCHMARK: ${{ matrix.benchmark }} run: ./gradlew -DsparkVersions=${SPARK_VERSION} -DscalaVersion=${SCALA_VERSION} :iceberg-spark:iceberg-spark-${SPARK_VERSION}_${SCALA_VERSION}:jmh -PjmhIncludeRegex=${BENCHMARK} -PjmhOutputPath=benchmark/${BENCHMARK}.txt - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: ${{ always() }} with: name: benchmark-${{ matrix.benchmark }} diff --git a/.github/workflows/kafka-connect-ci.yml b/.github/workflows/kafka-connect-ci.yml index fc86b77bcefc..3b962aefbb02 100644 --- a/.github/workflows/kafka-connect-ci.yml +++ b/.github/workflows/kafka-connect-ci.yml @@ -81,14 +81,14 @@ jobs: env: SPARK_LOCAL_IP: localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: 
ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: | ./gradlew -DsparkVersions= -DflinkVersions= -DkafkaVersions=3 \ @@ -97,7 +97,7 @@ jobs: :iceberg-kafka-connect:iceberg-kafka-connect:check \ :iceberg-kafka-connect:iceberg-kafka-connect-runtime:check \ -Pquick=true -x javadoc - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() with: name: test logs diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 16aac23a5683..3735367053ce 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -28,6 +28,6 @@ jobs: triage: runs-on: ubuntu-slim steps: - - uses: actions/labeler@634933edcd8ababfe52f92936142cc22ac488b1b # v6 + - uses: actions/labeler@634933edcd8ababfe52f92936142cc22ac488b1b # v6.0.1 with: sync-labels: true diff --git a/.github/workflows/license-check.yml b/.github/workflows/license-check.yml index ccd2a9a429f3..cc285eabe101 100644 --- a/.github/workflows/license-check.yml +++ b/.github/workflows/license-check.yml @@ -27,7 +27,7 @@ jobs: rat: runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - run: | diff --git a/.github/workflows/open-api.yml b/.github/workflows/open-api.yml index b57bbcdad177..28cd2ad89dfa 100644 --- a/.github/workflows/open-api.yml +++ b/.github/workflows/open-api.yml @@ -44,11 +44,11 @@ jobs: runs-on: ubuntu-slim steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: 
actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - name: Install uv - uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0 + uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 with: enable-cache: false - name: Install dependencies diff --git a/.github/workflows/pr-title-check.yml b/.github/workflows/pr-title-check.yml new file mode 100644 index 000000000000..48c4b652ce2b --- /dev/null +++ b/.github/workflows/pr-title-check.yml @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +name: PR Title Check + +on: + pull_request: + types: [opened, edited, reopened] + +concurrency: + group: pr-title-${{ github.event.pull_request.number }} + cancel-in-progress: true + +permissions: {} + +jobs: + check-pr-title: + runs-on: ubuntu-slim + steps: + - name: Check PR Title + env: + PR_TITLE: ${{ github.event.pull_request.title }} + run: | + PATTERN='^[A-Za-z][A-Za-z0-9._+/&-]*: .+' + if ! echo "$PR_TITLE" | grep -Eq "$PATTERN"; then + echo "::error::PR title must follow 'Module: Description' format. 
Got: '$PR_TITLE'" + echo "Examples: 'Core: Fix ...', 'Spark: Add ...', 'API: Remove ...', 'Docs: Update ...'" + exit 1 + fi + + echo "PR title is valid: '$PR_TITLE'" diff --git a/.github/workflows/publish-iceberg-rest-fixture-docker.yml b/.github/workflows/publish-iceberg-rest-fixture-docker.yml index 9504ae51bcd7..264e402deaac 100644 --- a/.github/workflows/publish-iceberg-rest-fixture-docker.yml +++ b/.github/workflows/publish-iceberg-rest-fixture-docker.yml @@ -41,14 +41,14 @@ jobs: runs-on: ubuntu-latest environment: docker-publish steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: 21 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - name: Build Iceberg Open API project run: ./gradlew :iceberg-open-api:shadowJar - name: Login to Docker Hub @@ -56,7 +56,7 @@ jobs: DOCKERHUB_USER: ${{ secrets.DOCKERHUB_USER }} DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} run: | - docker login -u "$DOCKERHUB_USER" -p "$DOCKERHUB_TOKEN" + echo "$DOCKERHUB_TOKEN" | docker login --username "$DOCKERHUB_USER" --password-stdin - name: Set the tagged version # for tag 'apache-iceberg-1.7.1', publish image 'apache/iceberg-rest-fixture:1.7.1' if: github.event_name == 'push' && contains(github.ref, 'refs/tags/') @@ -69,7 +69,7 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0 - name: Build and Push - uses: 
docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7.0.0 + uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0 with: context: ./ file: ./docker/iceberg-rest-fixture/Dockerfile diff --git a/.github/workflows/publish-snapshot.yml b/.github/workflows/publish-snapshot.yml index dac63bb9fa2e..a8557c44f32b 100644 --- a/.github/workflows/publish-snapshot.yml +++ b/.github/workflows/publish-snapshot.yml @@ -34,16 +34,16 @@ jobs: runs-on: ubuntu-24.04 environment: maven-publish steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: # we need to fetch all tags so that getProjectVersion() in build.gradle correctly determines the next SNAPSHOT version from the newest tag fetch-depth: 0 persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: 17 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - env: NEXUS_USER: ${{ secrets.NEXUS_USER }} NEXUS_PW: ${{ secrets.NEXUS_PW }} diff --git a/.github/workflows/recurring-jmh-benchmarks.yml b/.github/workflows/recurring-jmh-benchmarks.yml index da918a6972b4..88bb10566e43 100644 --- a/.github/workflows/recurring-jmh-benchmarks.yml +++ b/.github/workflows/recurring-jmh-benchmarks.yml @@ -51,20 +51,20 @@ jobs: env: SPARK_LOCAL_IP: localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - 
uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: 17 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - name: Run Benchmark run: ./gradlew -DsparkVersions=${{ matrix.spark }} -DscalaVersion=${{ matrix.scala }} :iceberg-spark:iceberg-spark-${{ matrix.spark }}_${{ matrix.scala }}:jmh -PjmhIncludeRegex=${{ matrix.benchmark }} -PjmhOutputPath=benchmark/${{ matrix.benchmark }}.txt -PjmhJsonOutputPath=benchmark/${{ matrix.benchmark }}.json - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: ${{ always() }} with: name: benchmark-${{ matrix.benchmark }} diff --git a/.github/workflows/site-ci.yml b/.github/workflows/site-ci.yml index 6152d4970305..fbd18caeb6da 100644 --- a/.github/workflows/site-ci.yml +++ b/.github/workflows/site-ci.yml @@ -36,10 +36,10 @@ jobs: permissions: contents: write steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: 3.x - name: Deploy Iceberg documentation diff --git a/.github/workflows/spark-ci.yml b/.github/workflows/spark-ci.yml index d346e238a6c4..a6e7b1504231 100644 --- a/.github/workflows/spark-ci.yml +++ b/.github/workflows/spark-ci.yml @@ -91,14 +91,14 @@ jobs: env: SPARK_LOCAL_IP: 
localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 with: tool-cache: false @@ -109,7 +109,7 @@ jobs: :iceberg-spark:iceberg-spark-extensions-${{ matrix.spark }}_${{ matrix.scala }}:check \ :iceberg-spark:iceberg-spark-runtime-${{ matrix.spark }}_${{ matrix.scala }}:check \ -Pquick=true -x javadoc - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() with: name: test logs diff --git a/.github/workflows/zizmor.yml b/.github/workflows/zizmor.yml index 313835fcbe16..0df5e1362ac5 100644 --- a/.github/workflows/zizmor.yml +++ b/.github/workflows/zizmor.yml @@ -39,6 +39,8 @@ jobs: persist-credentials: false - name: Run zizmor 🌈 - uses: zizmorcore/zizmor-action@71321a20a9ded102f6e9ce5718a2fcec2c4f70d8 # v0.5.2 + uses: zizmorcore/zizmor-action@b1d7e1fb5de872772f31590499237e7cce841e8e # v0.5.3 with: advanced-security: false + min-severity: medium + min-confidence: medium diff --git a/.gitignore b/.gitignore index 7812e21f89e0..98ccfc356d8b 100644 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,7 @@ coverage.xml # vscode/eclipse files .classpath +.factorypath .project .settings bin/ @@ -77,3 +78,6 @@ derby.log # sdkman .sdkmanrc + +# git hooks like 
pre-commit +.githooks/ diff --git a/LICENSE b/LICENSE index 0f907148aa13..573a126294a6 100644 --- a/LICENSE +++ b/LICENSE @@ -336,6 +336,10 @@ This product includes code from Apache Flink. * Parameter provider annotation for parameterized tests in Parameters.java * Parameter field annotation for parameterized tests in Parameter.java * Primary key validation logic in FlinkSchemaUtil.java +* Avro to RowData conversion logic in AvroToRowDataConverters.java +* RowData to Avro conversion logic in RowDataToAvroConverters.java +* Avro schema conversion logic in AvroSchemaConverter.java +* Joda optional dependency encapsulation in JodaConverter.java Copyright: 1999-2022 The Apache Software Foundation. Home page: https://flink.apache.org/ diff --git a/api/src/main/java/org/apache/iceberg/FileContent.java b/api/src/main/java/org/apache/iceberg/FileContent.java index 2c9a2fa51bd2..9d5ab8ceeec9 100644 --- a/api/src/main/java/org/apache/iceberg/FileContent.java +++ b/api/src/main/java/org/apache/iceberg/FileContent.java @@ -18,19 +18,35 @@ */ package org.apache.iceberg; -/** Content type stored in a file, one of DATA, POSITION_DELETES, or EQUALITY_DELETES. */ +import java.util.Locale; + +/** Content type stored in a file. 
*/ public enum FileContent { DATA(0), POSITION_DELETES(1), - EQUALITY_DELETES(2); + EQUALITY_DELETES(2), + DATA_MANIFEST(3), + DELETE_MANIFEST(4); + + private static final FileContent[] VALUES = FileContent.values(); private final int id; + private final String lowerCaseName; FileContent(int id) { this.id = id; + this.lowerCaseName = name().toLowerCase(Locale.ROOT); } public int id() { return id; } + + public String lowerCaseName() { + return lowerCaseName; + } + + public static FileContent fromId(int id) { + return VALUES[id]; + } } diff --git a/api/src/main/java/org/apache/iceberg/PartitionSpec.java b/api/src/main/java/org/apache/iceberg/PartitionSpec.java index c9350077e9a6..90d2dc259dd1 100644 --- a/api/src/main/java/org/apache/iceberg/PartitionSpec.java +++ b/api/src/main/java/org/apache/iceberg/PartitionSpec.java @@ -402,21 +402,22 @@ private void checkAndAddPartitionName(String name, Integer sourceColumnId) { Types.NestedField schemaField = this.caseSensitive ? schema.findField(name) : schema.caseInsensitiveFindField(name); if (checkConflicts) { - if (sourceColumnId != null) { - // for identity transform case we allow conflicts between partition and schema field name - // as - // long as they are sourced from the same schema field - Preconditions.checkArgument( - schemaField == null || schemaField.fieldId() == sourceColumnId, - "Cannot create identity partition sourced from different field in schema: %s", - name); - } else { - // for all other transforms we don't allow conflicts between partition name and schema - // field name + if (sourceColumnId == null) { Preconditions.checkArgument( schemaField == null, "Cannot create partition from name that exists in schema: %s", name); + } else { + boolean sourceFieldExists = schema.findField(sourceColumnId) != null; + // For identity transforms, require the partition name to match the source column when it + // still exists in the schema. 
When the source was dropped, the spec may be historical; + // skip the identity name check in that case. + if (sourceFieldExists) { + Preconditions.checkArgument( + schemaField == null || schemaField.fieldId() == sourceColumnId, + "Cannot create identity partition sourced from different field in schema: %s", + name); + } } } Preconditions.checkArgument(!name.isEmpty(), "Cannot use empty partition name: %s", name); diff --git a/api/src/main/java/org/apache/iceberg/catalog/TableIdentifier.java b/api/src/main/java/org/apache/iceberg/catalog/TableIdentifier.java index cbb5dc8d8fd2..9b9fbdcbb0b7 100644 --- a/api/src/main/java/org/apache/iceberg/catalog/TableIdentifier.java +++ b/api/src/main/java/org/apache/iceberg/catalog/TableIdentifier.java @@ -80,7 +80,9 @@ public String name() { public TableIdentifier toLowerCase() { String[] newLevels = - Arrays.stream(namespace().levels()).map(String::toLowerCase).toArray(String[]::new); + Arrays.stream(namespace().levels()) + .map(s -> s.toLowerCase(Locale.ROOT)) + .toArray(String[]::new); String newName = name().toLowerCase(Locale.ROOT); return TableIdentifier.of(Namespace.of(newLevels), newName); } diff --git a/api/src/main/java/org/apache/iceberg/exceptions/NoSuchWarehouseException.java b/api/src/main/java/org/apache/iceberg/exceptions/NoSuchWarehouseException.java new file mode 100644 index 000000000000..94ae50cd1c25 --- /dev/null +++ b/api/src/main/java/org/apache/iceberg/exceptions/NoSuchWarehouseException.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.exceptions; + +import com.google.errorprone.annotations.FormatMethod; + +/** Exception raised when attempting to load a warehouse that does not exist. */ +public class NoSuchWarehouseException extends RuntimeException { + @FormatMethod + public NoSuchWarehouseException(String message, Object... args) { + super(String.format(message, args)); + } + + @FormatMethod + public NoSuchWarehouseException(Throwable cause, String message, Object... args) { + super(String.format(message, args), cause); + } +} diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java index c225f21da8a8..f57ba8bc2793 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java @@ -22,6 +22,7 @@ import java.nio.ByteBuffer; import java.util.Collection; +import java.util.Comparator; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; @@ -29,6 +30,7 @@ import org.apache.iceberg.DataFile; import org.apache.iceberg.Schema; import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor; +import org.apache.iceberg.types.Comparators; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.util.NaNUtil; @@ -462,13 +464,77 @@ public Boolean notIn(BoundReference ref, Set literalSet) { @Override public Boolean 
startsWith(BoundReference ref, Literal lit) { + int id = ref.fieldId(); + if (isNestedColumn(id)) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (canContainNulls(id)) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (lowerBounds != null + && lowerBounds.containsKey(id) + && upperBounds != null + && upperBounds.containsKey(id)) { + String prefix = (String) lit.value(); + Comparator comparator = Comparators.charSequences(); + CharSequence lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); + CharSequence upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id)); + + // if lower is shorter than the prefix then lower doesn't start with the prefix + if (lower.length() < prefix.length()) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (comparator.compare(lower.subSequence(0, prefix.length()), prefix) == 0) { + // if upper is shorter than the prefix then upper can't start with the prefix + if (upper.length() < prefix.length()) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (comparator.compare(upper.subSequence(0, prefix.length()), prefix) == 0) { + // both bounds start with the prefix, so all rows must start with the prefix + return ROWS_MUST_MATCH; + } + } + } + return ROWS_MIGHT_NOT_MATCH; } @Override public Boolean notStartsWith(BoundReference ref, Literal lit) { - // TODO: Handle cases that definitely cannot match, such as notStartsWith("x") when the bounds - // are ["a", "b"]. 
+ int id = ref.fieldId(); + if (isNestedColumn(id)) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (containsNullsOnly(id)) { + return ROWS_MUST_MATCH; + } + + String prefix = (String) lit.value(); + Comparator comparator = Comparators.charSequences(); + + if (lowerBounds != null && lowerBounds.containsKey(id)) { + CharSequence lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); + // truncate lower bound so that its length is not greater than the length of prefix + int length = Math.min(prefix.length(), lower.length()); + if (comparator.compare(lower.subSequence(0, length), prefix) > 0) { + return ROWS_MUST_MATCH; + } + } + + if (upperBounds != null && upperBounds.containsKey(id)) { + CharSequence upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id)); + // truncate upper bound so that its length is not greater than the length of prefix + int length = Math.min(prefix.length(), upper.length()); + if (comparator.compare(upper.subSequence(0, length), prefix) < 0) { + return ROWS_MUST_MATCH; + } + } + return ROWS_MIGHT_NOT_MATCH; } diff --git a/api/src/main/java/org/apache/iceberg/io/FileRange.java b/api/src/main/java/org/apache/iceberg/io/FileRange.java index f6d5d9b41cca..695d516725a6 100644 --- a/api/src/main/java/org/apache/iceberg/io/FileRange.java +++ b/api/src/main/java/org/apache/iceberg/io/FileRange.java @@ -31,10 +31,8 @@ public class FileRange { public FileRange(CompletableFuture byteBuffer, long offset, int length) throws EOFException { Preconditions.checkNotNull(byteBuffer, "byteBuffer can't be null"); - Preconditions.checkArgument( - length() >= 0, "Invalid length: %s in range (must be >= 0)", length); - Preconditions.checkArgument( - offset() >= 0, "Invalid offset: %s in range (must be >= 0)", offset); + Preconditions.checkArgument(length >= 0, "Invalid length: %s in range (must be >= 0)", length); + Preconditions.checkArgument(offset >= 0, "Invalid offset: %s in range (must be >= 0)", offset); this.byteBuffer = byteBuffer; 
this.offset = offset; diff --git a/api/src/test/java/org/apache/iceberg/TestFileContent.java b/api/src/test/java/org/apache/iceberg/TestFileContent.java new file mode 100644 index 000000000000..bd5e44ed3cf3 --- /dev/null +++ b/api/src/test/java/org/apache/iceberg/TestFileContent.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.stream.IntStream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.MethodSource; + +class TestFileContent { + + @ParameterizedTest + @EnumSource(FileContent.class) + void fromId(FileContent content) { + assertThat(FileContent.fromId(content.id())).isEqualTo(content); + } + + static IntStream invalidContentTypeIds() { + return IntStream.of(-1, FileContent.values().length); + } + + @ParameterizedTest + @MethodSource("invalidContentTypeIds") + void fromIdInvalid(int id) { + assertThatThrownBy(() -> FileContent.fromId(id)) + .isInstanceOf(ArrayIndexOutOfBoundsException.class) + .hasMessageContaining(String.valueOf(id)); + } +} diff --git a/api/src/test/java/org/apache/iceberg/TestPartitionSpecValidation.java b/api/src/test/java/org/apache/iceberg/TestPartitionSpecValidation.java index b8e16a9ee45e..a1709d2a2e06 100644 --- a/api/src/test/java/org/apache/iceberg/TestPartitionSpecValidation.java +++ b/api/src/test/java/org/apache/iceberg/TestPartitionSpecValidation.java @@ -242,6 +242,22 @@ public void testSettingPartitionTransformsWithCustomTargetNamesThatAlreadyExist( "Cannot create identity partition sourced from different field in schema: another_ts"); } + @Test + public void testStalePartitionSourceIdWithReusedColumnName() { + int newFieldId = 2; + int droppedFieldId = 1; + Schema schema = + new Schema(NestedField.required(newFieldId, "category", Types.StringType.get())); + PartitionSpec spec = + PartitionSpec.builderFor(schema) + .withSpecId(0) + .add(droppedFieldId, 1000, "category", Transforms.alwaysNull()) + .build(); + assertThat(spec.fields()).hasSize(1); + assertThat(spec.fields().get(0).sourceId()).isEqualTo(droppedFieldId); + 
assertThat(spec.fields().get(0).name()).isEqualTo("category"); + } + @Test public void testMissingSourceColumn() { assertThatThrownBy(() -> PartitionSpec.builderFor(SCHEMA).year("missing").build()) diff --git a/api/src/test/java/org/apache/iceberg/catalog/TestTableIdentifier.java b/api/src/test/java/org/apache/iceberg/catalog/TestTableIdentifier.java index ca9569436bab..13781ccaa7f4 100644 --- a/api/src/test/java/org/apache/iceberg/catalog/TestTableIdentifier.java +++ b/api/src/test/java/org/apache/iceberg/catalog/TestTableIdentifier.java @@ -22,6 +22,7 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; import org.junit.jupiter.api.Test; +import org.junitpioneer.jupiter.DefaultLocale; public class TestTableIdentifier { @@ -52,6 +53,13 @@ public void testToLowerCase() { .isEqualTo(TableIdentifier.of("Catalog", "dB", "TBL").toLowerCase()); } + @Test + @DefaultLocale(language = "tr") + public void testToLowerCaseIsLocaleIndependent() { + assertThat(TableIdentifier.of("information", "db", "tbl")) + .isEqualTo(TableIdentifier.of("INFORMATION", "DB", "TBL").toLowerCase()); + } + @Test public void testInvalidTableName() { assertThatThrownBy(() -> TableIdentifier.of(Namespace.empty(), "")) diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java index f34cd730df77..b55f4efb1726 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java @@ -32,7 +32,9 @@ import static org.apache.iceberg.expressions.Expressions.notIn; import static org.apache.iceberg.expressions.Expressions.notNaN; import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; import static org.apache.iceberg.expressions.Expressions.or; +import static 
org.apache.iceberg.expressions.Expressions.startsWith; import static org.apache.iceberg.types.Conversions.toByteBuffer; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; @@ -72,8 +74,8 @@ public class TestStrictMetricsEvaluator { "struct", Types.StructType.of( Types.NestedField.optional(16, "nested_col_no_stats", Types.IntegerType.get()), - Types.NestedField.optional( - 17, "nested_col_with_stats", Types.IntegerType.get())))); + Types.NestedField.optional(17, "nested_col_with_stats", Types.IntegerType.get()), + Types.NestedField.optional(18, "nested_string_col", Types.StringType.get())))); private static final int INT_MIN_VALUE = 30; private static final int INT_MAX_VALUE = 79; @@ -172,6 +174,40 @@ public class TestStrictMetricsEvaluator { // upper bounds ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb"))); + // String-focused file: required column 3 has no nulls and string bounds ["abc", "abd"] + private static final DataFile STRING_FILE = + new TestDataFile( + "string_file.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of(3, 50L), + // null value counts + ImmutableMap.of(), + // nan value counts + null, + // lower bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc")), + // upper bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abd"))); + + // String file with wider range: required column 3 has no nulls and bounds ["aa", "dC"] + private static final DataFile STRING_FILE_2 = + new TestDataFile( + "string_file_2.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of(3, 50L), + // null value counts + ImmutableMap.of(), + // nan value counts + null, + // lower bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "aa")), + // upper bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "dC"))); + @Test public void testAllNulls() { boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, 
notNull("all_nulls")).eval(FILE); @@ -684,4 +720,205 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats", INT_MAX_VALUE)) new StrictMetricsEvaluator(SCHEMA, notNull("struct.nested_col_with_stats")).eval(FILE); assertThat(shouldRead).as("notNull nested column should not match").isFalse(); } + + @Test + public void testNotStartsWithAllNulls() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("all_nulls", "a")).eval(FILE); + assertThat(shouldRead).as("Should match: all null values satisfy notStartsWith").isTrue(); + } + + @Test + public void testNotStartsWithBoundsAbovePrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "aaa")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all values are above the prefix range").isTrue(); + } + + @Test + public void testNotStartsWithBoundsBelowPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "zzz")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all values are below the prefix range").isTrue(); + } + + @Test + public void testNotStartsWithBoundsOverlapPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "ab")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: bounds overlap the prefix range").isFalse(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "abc")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: lower bound starts with the prefix").isFalse(); + } + + @Test + public void testNotStartsWithWiderRange() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "e")).eval(STRING_FILE_2); + assertThat(shouldRead).as("Should match: all values are below the prefix").isTrue(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "a")).eval(STRING_FILE_2); + assertThat(shouldRead).as("Should not match: lower bound 
starts with the prefix").isFalse(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "c")).eval(STRING_FILE_2); + assertThat(shouldRead).as("Should not match: prefix is within the bounds range").isFalse(); + } + + @Test + public void testNotStartsWithNoStats() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "a")).eval(FILE); + assertThat(shouldRead).as("Should not match: no bounds available for column").isFalse(); + } + + @Test + void testStartsWithBothBoundsMatchPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "ab")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: both bounds start with the prefix").isTrue(); + } + + @Test + void testStartsWithSingleCharPrefixBothBoundsMatch() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "a")).eval(STRING_FILE); + assertThat(shouldRead) + .as("Should match: both bounds start with the single char prefix") + .isTrue(); + } + + @Test + void testStartsWithOnlyLowerBoundMatchesPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "abc")).eval(STRING_FILE); + assertThat(shouldRead) + .as("Should not match: upper bound does not start with the prefix") + .isFalse(); + } + + @Test + void testStartsWithBoundsDoNotMatchPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "zzz")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: no bounds start with the prefix").isFalse(); + } + + @Test + void testStartsWithWiderRange() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "a")).eval(STRING_FILE_2); + assertThat(shouldRead) + .as("Should not match: upper bound does not start with the prefix") + .isFalse(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "e")).eval(STRING_FILE_2); + assertThat(shouldRead).as("Should not 
match: no bounds start with the prefix").isFalse(); + } + + @Test + void testStartsWithNoStats() { + boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, startsWith("required", "a")).eval(FILE); + assertThat(shouldRead).as("Should not match: no bounds available for column").isFalse(); + } + + @Test + public void testNotStartsWithSomeNullsBoundsOutsidePrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "zzz")).eval(FILE_2); + assertThat(shouldRead).as("Should match: all values are below the prefix").isTrue(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "aaa")).eval(FILE_2); + assertThat(shouldRead).as("Should match: all values are above the prefix").isTrue(); + } + + @Test + public void testNotStartsWithPrefixLongerThanBounds() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "aaaaaaa")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all values are above the long prefix").isTrue(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "zzzzzzz")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all values are below the long prefix").isTrue(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "abcdef")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: prefix overlaps with bound range").isFalse(); + } + + @Test + void testNotStartsWithEmptyPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: all strings start with empty prefix").isFalse(); + } + + @Test + void testNotStartsWithExactBoundMatch() { + // FILE_3 has column 5 (some_nulls) with exact bounds ["bbb", "bbb"] + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "bbb")).eval(FILE_3); + assertThat(shouldRead).as("Should not match: bounds 
exactly equal the prefix").isFalse(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "zzz")).eval(FILE_3); + assertThat(shouldRead).as("Should match: all values are below the prefix").isTrue(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "aaa")).eval(FILE_3); + assertThat(shouldRead).as("Should match: all values are above the prefix").isTrue(); + } + + @Test + public void testNotStartsWithNestedColumn() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("struct.nested_string_col", "a")) + .eval(FILE); + assertThat(shouldRead).as("notStartsWith nested column should not match").isFalse(); + } + + @Test + void testStartsWithAllNulls() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("all_nulls", "a")).eval(FILE); + assertThat(shouldRead) + .as("Should not match: all null values do not satisfy startsWith") + .isFalse(); + } + + @Test + void testStartsWithSomeNulls() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("some_nulls", "b")).eval(FILE_2); + assertThat(shouldRead) + .as("Should not match: some nulls means not all rows can satisfy startsWith") + .isFalse(); + } + + @Test + void testStartsWithPrefixLongerThanBounds() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "abcdef")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: prefix is longer than the bounds").isFalse(); + } + + @Test + void testStartsWithEmptyPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all strings start with empty prefix").isTrue(); + } + + @Test + void testStartsWithNestedColumn() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("struct.nested_string_col", "a")).eval(FILE); + assertThat(shouldRead).as("Should not match: nested column is not supported").isFalse(); 
+ } } diff --git a/api/src/test/java/org/apache/iceberg/io/TestFileRange.java b/api/src/test/java/org/apache/iceberg/io/TestFileRange.java new file mode 100644 index 000000000000..dc4ede9ec3b4 --- /dev/null +++ b/api/src/test/java/org/apache/iceberg/io/TestFileRange.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.io; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.EOFException; +import java.nio.ByteBuffer; +import java.util.concurrent.CompletableFuture; +import org.junit.jupiter.api.Test; + +public class TestFileRange { + + @Test + public void validRange() throws EOFException { + CompletableFuture future = new CompletableFuture<>(); + FileRange range = new FileRange(future, 10L, 100); + assertThat(range.offset()).isEqualTo(10L); + assertThat(range.length()).isEqualTo(100); + assertThat(range.byteBuffer()).isSameAs(future); + } + + @Test + public void negativeLength() { + CompletableFuture future = new CompletableFuture<>(); + assertThatThrownBy(() -> new FileRange(future, 0L, -1)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid length: -1 in range (must be >= 0)"); + } + + @Test + public void negativeOffset() { + CompletableFuture future = new CompletableFuture<>(); + assertThatThrownBy(() -> new FileRange(future, -1L, 0)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid offset: -1 in range (must be >= 0)"); + } + + @Test + public void nullByteBuffer() { + assertThatThrownBy(() -> new FileRange(null, 0L, 0)) + .isInstanceOf(NullPointerException.class) + .hasMessage("byteBuffer can't be null"); + } +} diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java index 2cc7cde4541a..e9ebed2826f4 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java @@ -588,10 +588,18 @@ public Optional visit( int bitWidth = intLogicalType.getBitWidth(); if (bitWidth == 8 || bitWidth == 16 || bitWidth == 32) { + // Iceberg has no unsigned integer type. 
Reading UINT32 into a 32-bit signed value would + // silently produce negative results for inputs above Integer.MAX_VALUE. UINT8 and UINT16 + // both fit losslessly in a signed int32 and are allowed, matching the policy in + // BaseParquetReaders for the non-vectorized path. + Preconditions.checkArgument( + intLogicalType.isSigned() || bitWidth < 32, "Cannot read UINT32 as an int value"); ((IntVector) vector).allocateNew(batchSize); return Optional.of( new LogicalTypeVisitorResult(vector, ReadType.INT, (int) IntVector.TYPE_WIDTH)); } else if (bitWidth == 64) { + Preconditions.checkArgument( + intLogicalType.isSigned(), "Cannot read UINT64 as a long value"); ((BigIntVector) vector).allocateNew(batchSize); return Optional.of( new LogicalTypeVisitorResult(vector, ReadType.LONG, (int) BigIntVector.TYPE_WIDTH)); diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java index 15b55fb48d4a..3fbd797c26fb 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java @@ -154,6 +154,16 @@ public VectorizedReader struct( return null; } + @Override + public VectorizedReader variant( + Types.VariantType iVariant, GroupType variant, VectorizedReader result) { + if (iVariant != null) { + throw new UnsupportedOperationException( + "Vectorized reads are not supported yet for variant fields"); + } + return null; + } + @Override public VectorizedReader primitive( org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/TestArrowReader.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/TestArrowReader.java index 34e83de15207..cf3eb2700265 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/TestArrowReader.java +++ 
b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/TestArrowReader.java @@ -21,6 +21,7 @@ import static org.apache.iceberg.Files.localInput; import static org.apache.parquet.schema.Types.primitive; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; import java.io.IOException; @@ -41,6 +42,7 @@ import java.util.concurrent.TimeUnit; import java.util.function.BiFunction; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.DateDayVector; @@ -101,6 +103,9 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; /** * Test cases for {@link ArrowReader}. 
@@ -383,6 +388,111 @@ public void testTimestampMillisAreReadCorrectly() throws Exception { assertThat(totalRowsRead).as("Should read all rows").isEqualTo(millisValues.size()); } + @ParameterizedTest + @MethodSource("rejectedUnsignedIntegerCases") + public void testUnsignedIntegerColumnThrowsException( + int unsignedBitWidth, + PrimitiveType.PrimitiveTypeName physicalType, + Schema schema, + String expectedMessage) + throws Exception { + Table table = createSingleRowUnsignedIntTable(schema, physicalType, unsignedBitWidth, 100L); + + assertThatThrownBy( + () -> { + try (VectorizedTableScanIterable vectorizedReader = + new VectorizedTableScanIterable(table.newScan(), 1024, false)) { + for (ColumnarBatch batch : vectorizedReader) { + batch.createVectorSchemaRootFromVectors().close(); + } + } + }) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining(expectedMessage); + } + + @ParameterizedTest + @MethodSource("acceptedUnsignedSmallIntegerCases") + public void testUnsignedSmallIntegerColumnRoundtrips(int unsignedBitWidth, int value) + throws Exception { + Schema schema = new Schema(Types.NestedField.optional(1, "col", Types.IntegerType.get())); + Table table = + createSingleRowUnsignedIntTable( + schema, PrimitiveType.PrimitiveTypeName.INT32, unsignedBitWidth, value); + + int totalRows = 0; + try (VectorizedTableScanIterable vectorizedReader = + new VectorizedTableScanIterable(table.newScan(), 1024, false)) { + for (ColumnarBatch batch : vectorizedReader) { + VectorSchemaRoot root = batch.createVectorSchemaRootFromVectors(); + assertThat(((IntVector) root.getVector("col")).get(0)) + .as("UINT%d value should round-trip through int", unsignedBitWidth) + .isEqualTo(value); + totalRows += root.getRowCount(); + root.close(); + } + } + + assertThat(totalRows).isEqualTo(1); + } + + private static Stream rejectedUnsignedIntegerCases() { + return Stream.of( + Arguments.of( + 32, + PrimitiveType.PrimitiveTypeName.INT32, + new Schema(Types.NestedField.optional(1, 
"col", Types.IntegerType.get())), + "Cannot read UINT32 as an int value"), + Arguments.of( + 64, + PrimitiveType.PrimitiveTypeName.INT64, + new Schema(Types.NestedField.optional(1, "col", Types.LongType.get())), + "Cannot read UINT64 as a long value")); + } + + private static Stream acceptedUnsignedSmallIntegerCases() { + return Stream.of(Arguments.of(8, 250), Arguments.of(16, 50000)); + } + + private Table createSingleRowUnsignedIntTable( + Schema schema, PrimitiveType.PrimitiveTypeName physicalType, int unsignedBitWidth, long value) + throws IOException { + tables = new HadoopTables(); + Table table = tables.create(schema, tempDir.toURI() + "/uint" + unsignedBitWidth); + + MessageType parquetSchema = + new MessageType( + "test", + primitive(physicalType, Type.Repetition.OPTIONAL) + .as(LogicalTypeAnnotation.intType(unsignedBitWidth, false)) + .id(1) + .named("col")); + + File testFile = + new File(tempDir, "unsigned-int" + unsignedBitWidth + "-" + System.nanoTime() + ".parquet"); + try (ParquetWriter writer = + ExampleParquetWriter.builder(new Path(testFile.toURI())).withType(parquetSchema).build()) { + SimpleGroupFactory factory = new SimpleGroupFactory(parquetSchema); + Group group = factory.newGroup(); + if (physicalType == PrimitiveType.PrimitiveTypeName.INT64) { + group.add("col", value); + } else { + group.add("col", (int) value); + } + writer.write(group); + } + + DataFile dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath(testFile.getAbsolutePath()) + .withFileSizeInBytes(testFile.length()) + .withFormat(FileFormat.PARQUET) + .withRecordCount(1) + .build(); + table.newAppend().appendFile(dataFile).commit(); + return table; + } + /** * Run the following verifications: * diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/TestVectorizedReaderBuilder.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/TestVectorizedReaderBuilder.java new file mode 100644 index 000000000000..e3d76515bcc7 --- /dev/null +++ 
b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/TestVectorizedReaderBuilder.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.arrow.vectorized; + +import static org.assertj.core.api.Assertions.assertThatNoException; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import org.apache.iceberg.Schema; +import org.apache.iceberg.parquet.TypeWithSchemaVisitor; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Types.IntegerType; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.types.Types.VariantType; +import org.apache.iceberg.variants.Variant; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; +import org.junit.jupiter.api.Test; + +public class TestVectorizedReaderBuilder { + + @Test + public void testVariantNotSupportedInVectorizedReads() { + Schema icebergSchema = + new Schema( + NestedField.required(1, "id", IntegerType.get()), + 
NestedField.optional(2, "data", VariantType.get())); + + MessageType parquetSchema = parquetSchemaWithVariant(); + + VectorizedReaderBuilder builder = + new VectorizedReaderBuilder( + icebergSchema, parquetSchema, false, ImmutableMap.of(), readers -> null); + + assertThatThrownBy( + () -> TypeWithSchemaVisitor.visit(icebergSchema.asStruct(), parquetSchema, builder)) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageContaining("Vectorized reads are not supported yet for variant fields"); + } + + @Test + public void testVariantSkippedWhenNotInProjection() { + Schema icebergSchema = new Schema(NestedField.required(1, "id", IntegerType.get())); + + MessageType parquetSchema = parquetSchemaWithVariant(); + + VectorizedReaderBuilder builder = + new VectorizedReaderBuilder( + icebergSchema, parquetSchema, false, ImmutableMap.of(), readers -> null); + + assertThatNoException() + .describedAs("Variant not in projection should not throw") + .isThrownBy( + () -> TypeWithSchemaVisitor.visit(icebergSchema.asStruct(), parquetSchema, builder)); + } + + private static MessageType parquetSchemaWithVariant() { + return Types.buildMessage() + .addField( + Types.primitive(PrimitiveTypeName.INT32, Type.Repetition.REQUIRED).id(1).named("id")) + .addField( + Types.buildGroup(Type.Repetition.OPTIONAL) + .as(LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION)) + .addField( + Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .named("metadata")) + .addField( + Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .named("value")) + .id(2) + .named("data")) + .named("table"); + } +} diff --git a/aws-bundle/LICENSE b/aws-bundle/LICENSE index 997d9652a873..d8484c933f9e 100644 --- a/aws-bundle/LICENSE +++ b/aws-bundle/LICENSE @@ -217,9 +217,18 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles 
Apache Parquet. +This product bundles Apache Parquet (bundled by AWS Analytics Accelerator S3). -Project URL: https://parquet.apache.org +Copyright: 2014-2024 The Apache Software Foundation +Project URL: https://parquet.apache.org/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles Apache Thrift (bundled by Parquet). + +Copyright: 2006-2017 The Apache Software Foundation. +Project URL: https://thrift.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -241,9 +250,8 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles Reactive Streams. Project URL: http://reactive-streams.org -License: MIT -| MIT No Attribution -| +License: MIT-0 + | Copyright 2014 Reactive Streams | | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so. @@ -295,7 +303,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles failsafe. +This product bundles failsafe (bundled by AWS Analytics Accelerator S3). Copyright: Jonathan Halterman and friends Project URL: https://failsafe.dev/ @@ -316,6 +324,7 @@ This product bundles checkerframework checker-qual. 
Copyright: 2004-2020 the Checker Framework developers Project URL: https://github.com/typetools/checker-framework License: MIT + | The annotations are licensed under the MIT License. (The text of this | license appears below.) More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -358,3 +367,472 @@ This product bundles JCTools (via Netty). Project URL: https://github.com/JCTools/JCTools License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles the Mozilla Public Suffix List (via Apache HttpComponents). + +Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ + +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. "Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. "Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. 
"Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. "Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. "Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. "Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. 
For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. 
+| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. 
Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. 
Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. 
However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. 
The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. 
Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. 
Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. + +-------------------------------------------------------------------------------- + +This product bundles FastDoubleParser (via Jackson JSON Processor, via AWS SDK third-party-jackson-core). 
+ +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). 
+ +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. 
Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/aws-bundle/NOTICE b/aws-bundle/NOTICE index 45a2fba1b43c..39738b74a297 100644 --- a/aws-bundle/NOTICE +++ b/aws-bundle/NOTICE @@ -332,3 +332,8 @@ This product bundles Netty with the following in its NOTICE file: | * license/LICENSE.brotli4j.txt (Apache License 2.0) | * HOMEPAGE: | * https://github.com/hyperxpro/Brotli4j + +-------------------------------------------------------------------------------- + +This product bundles AWS Analytics Accelerator S3 with the following in its NOTICE file: +| Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
diff --git a/aws-bundle/build.gradle b/aws-bundle/build.gradle index 5b9054812a50..541d5ae7a541 100644 --- a/aws-bundle/build.gradle +++ b/aws-bundle/build.gradle @@ -23,6 +23,14 @@ project(":iceberg-aws-bundle") { tasks.jar.dependsOn tasks.shadowJar + configurations { + implementation { + exclude group: 'org.slf4j' + exclude group: 'org.apache.logging.slf4j' + exclude group: 'org.apache.logging.log4j' + } + } + dependencies { implementation platform(libs.awssdk.bom) implementation libs.awssdk.s3accessgrants @@ -52,12 +60,6 @@ project(":iceberg-aws-bundle") { include 'NOTICE' } - dependencies { - exclude(dependency('org.slf4j:.*')) - exclude(dependency('org.apache.logging.log4j:.*')) - exclude(dependency('org.apache.logging.slf4j:.*')) - } - // relocate AWS-specific versions relocate 'org.apache.http', 'org.apache.iceberg.aws.shaded.org.apache.http' relocate 'io.netty', 'org.apache.iceberg.aws.shaded.io.netty' @@ -66,4 +68,6 @@ project(":iceberg-aws-bundle") { jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/aws-bundle/runtime-deps.txt b/aws-bundle/runtime-deps.txt new file mode 100644 index 000000000000..73c7e0ef16b9 --- /dev/null +++ b/aws-bundle/runtime-deps.txt @@ -0,0 +1,66 @@ +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.google.errorprone:error_prone_annotations:2.10.0 +commons-codec:commons-codec:1.17.1 +commons-logging:commons-logging:1.2 +io.netty:netty-buffer:4.1.132.Final +io.netty:netty-codec-http2:4.1.132.Final +io.netty:netty-codec-http:4.1.132.Final +io.netty:netty-codec:4.1.132.Final +io.netty:netty-common:4.1.132.Final +io.netty:netty-handler:4.1.132.Final +io.netty:netty-resolver:4.1.132.Final +io.netty:netty-transport-classes-epoll:4.1.132.Final +io.netty:netty-transport-native-unix-common:4.1.132.Final +io.netty:netty-transport:4.1.132.Final +org.apache.httpcomponents:httpclient:4.5.13 +org.apache.httpcomponents:httpcore:4.4.16 +org.checkerframework:checker-qual:3.19.0 
+org.reactivestreams:reactive-streams:1.0.4 +software.amazon.awssdk.crt:aws-crt:0.45.1 +software.amazon.awssdk:annotations:2.42.41 +software.amazon.awssdk:apache-client:2.42.41 +software.amazon.awssdk:arns:2.42.41 +software.amazon.awssdk:auth:2.42.41 +software.amazon.awssdk:aws-core:2.42.41 +software.amazon.awssdk:aws-json-protocol:2.42.41 +software.amazon.awssdk:aws-query-protocol:2.42.41 +software.amazon.awssdk:aws-xml-protocol:2.42.41 +software.amazon.awssdk:checksums-spi:2.42.41 +software.amazon.awssdk:checksums:2.42.41 +software.amazon.awssdk:cloudwatch-metric-publisher:2.42.41 +software.amazon.awssdk:cloudwatch:2.42.41 +software.amazon.awssdk:crt-core:2.42.41 +software.amazon.awssdk:dynamodb:2.42.41 +software.amazon.awssdk:endpoints-spi:2.42.41 +software.amazon.awssdk:glue:2.42.41 +software.amazon.awssdk:http-auth-aws-crt:2.42.41 +software.amazon.awssdk:http-auth-aws-eventstream:2.42.41 +software.amazon.awssdk:http-auth-aws:2.42.41 +software.amazon.awssdk:http-auth-spi:2.42.41 +software.amazon.awssdk:http-auth:2.42.41 +software.amazon.awssdk:http-client-spi:2.42.41 +software.amazon.awssdk:iam:2.42.41 +software.amazon.awssdk:identity-spi:2.42.41 +software.amazon.awssdk:json-utils:2.42.41 +software.amazon.awssdk:kms:2.42.41 +software.amazon.awssdk:lakeformation:2.42.41 +software.amazon.awssdk:metrics-spi:2.42.41 +software.amazon.awssdk:netty-nio-client:2.42.41 +software.amazon.awssdk:profiles:2.42.41 +software.amazon.awssdk:protocol-core:2.42.41 +software.amazon.awssdk:regions:2.42.41 +software.amazon.awssdk:retries-spi:2.42.41 +software.amazon.awssdk:retries:2.42.41 +software.amazon.awssdk:s3:2.42.41 +software.amazon.awssdk:s3control:2.42.41 +software.amazon.awssdk:sdk-core:2.42.41 +software.amazon.awssdk:smithy-rpcv2-protocol:2.42.41 +software.amazon.awssdk:sso:2.42.41 +software.amazon.awssdk:sts:2.42.41 +software.amazon.awssdk:third-party-jackson-core:2.42.41 +software.amazon.awssdk:third-party-jackson-dataformat-cbor:2.42.41 
+software.amazon.awssdk:utils-lite:2.42.41 +software.amazon.awssdk:utils:2.42.41 +software.amazon.eventstream:eventstream:1.0.1 +software.amazon.s3.accessgrants:aws-s3-accessgrants-java-plugin:2.4.1 +software.amazon.s3.analyticsaccelerator:analyticsaccelerator-s3:1.3.1 diff --git a/aws/src/integration/java/org/apache/iceberg/aws/glue/GlueTestBase.java b/aws/src/integration/java/org/apache/iceberg/aws/glue/GlueTestBase.java index 65e37eba4cd3..b02537bf40b2 100644 --- a/aws/src/integration/java/org/apache/iceberg/aws/glue/GlueTestBase.java +++ b/aws/src/integration/java/org/apache/iceberg/aws/glue/GlueTestBase.java @@ -65,6 +65,7 @@ public class GlueTestBase { // iceberg static GlueCatalog glueCatalog; static GlueCatalog glueCatalogWithSkipNameValidation; + static GlueCatalog glueCatalogWithUniqueLocation; static Schema schema = new Schema(Types.NestedField.required(1, "c1", Types.StringType.get(), "c1")); @@ -105,6 +106,16 @@ public static void beforeClass() { GLUE, null, ImmutableMap.of()); + + glueCatalogWithUniqueLocation = new GlueCatalog(); + glueCatalogWithUniqueLocation.initialize( + CATALOG_NAME, + TEST_BUCKET_PATH, + awsProperties, + s3FileIOProperties, + GLUE, + null, + true /* uniqTableLocation */); } @AfterAll diff --git a/aws/src/integration/java/org/apache/iceberg/aws/glue/TestGlueCatalogTable.java b/aws/src/integration/java/org/apache/iceberg/aws/glue/TestGlueCatalogTable.java index 2c9459c5e36c..cb015b79fb9b 100644 --- a/aws/src/integration/java/org/apache/iceberg/aws/glue/TestGlueCatalogTable.java +++ b/aws/src/integration/java/org/apache/iceberg/aws/glue/TestGlueCatalogTable.java @@ -310,6 +310,22 @@ public void testRenameTable() { assertThat(renamedTable.currentSnapshot()).isEqualTo(table.currentSnapshot()); } + @Test + public void testCreateTableInUniqueLocation() { + String namespace = createNamespace(); + String tableName = createTable(namespace); + String newTableName = tableName + "_renamed"; + + glueCatalogWithUniqueLocation.renameTable( + 
TableIdentifier.of(namespace, tableName), TableIdentifier.of(namespace, newTableName)); + Table renamedTable = + glueCatalogWithUniqueLocation.loadTable(TableIdentifier.of(namespace, newTableName)); + createTable(namespace, tableName); + Table table = glueCatalogWithUniqueLocation.loadTable(TableIdentifier.of(namespace, tableName)); + + assertThat(renamedTable.location()).isNotEqualTo(table.location()); + } + @Test public void testRenameTableFailsToCreateNewTable() { String namespace = createNamespace(); @@ -743,7 +759,8 @@ public void testTableLevelS3Tags() { new AwsProperties(properties), new S3FileIOProperties(properties), GLUE, - null); + null, + false /* uniqTableLocation */); String namespace = createNamespace(); String tableName = getRandomName(); createTable(namespace, tableName); diff --git a/aws/src/integration/java/org/apache/iceberg/aws/s3/TestS3MultipartUpload.java b/aws/src/integration/java/org/apache/iceberg/aws/s3/TestS3MultipartUpload.java index cbe3051a6711..746015098a40 100644 --- a/aws/src/integration/java/org/apache/iceberg/aws/s3/TestS3MultipartUpload.java +++ b/aws/src/integration/java/org/apache/iceberg/aws/s3/TestS3MultipartUpload.java @@ -21,12 +21,14 @@ import static org.assertj.core.api.Assertions.assertThat; import java.io.IOException; +import java.util.Arrays; import java.util.Random; import java.util.UUID; import java.util.function.Supplier; import java.util.stream.IntStream; import org.apache.iceberg.aws.AwsClientFactories; import org.apache.iceberg.aws.AwsIntegTestUtil; +import org.apache.iceberg.io.IOUtil; import org.apache.iceberg.io.PositionOutputStream; import org.apache.iceberg.io.SeekableInputStream; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -36,6 +38,8 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable; import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariables; +import org.junit.jupiter.params.ParameterizedTest; +import 
org.junit.jupiter.params.provider.ValueSource; import software.amazon.awssdk.services.s3.S3Client; /** Long-running tests to ensure multipart upload logic is resilient */ @@ -141,6 +145,35 @@ public void testParallelUpload() throws IOException { } } + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testMultipartUploadWithChunkedEncoding(boolean chunkedEncodingEnabled) + throws IOException { + // Create a new S3FileIO with specified chunked encoding setting + try (S3FileIO testIo = new S3FileIO(() -> s3)) { + testIo.initialize( + ImmutableMap.of( + S3FileIOProperties.MULTIPART_SIZE, + Integer.toString(S3FileIOProperties.MULTIPART_SIZE_MIN), + S3FileIOProperties.CHECKSUM_ENABLED, + "true", + S3FileIOProperties.CHUNKED_ENCODING_ENABLED, + Boolean.toString(chunkedEncodingEnabled))); + + int parts = 10; + long partSize = S3FileIOProperties.MULTIPART_SIZE_MIN; + String suffix = chunkedEncodingEnabled ? "-chunked-enabled" : "-chunked-disabled"; + + String intObjectUri = objectUri + suffix + "-int"; + writeDistinctPartsWithInts(testIo, intObjectUri, parts, partSize); + verifyDistinctPartsWithInts(testIo, intObjectUri, parts, partSize); + + String bytesObjectUri = objectUri + suffix + "-bytes"; + writeDistinctPartsWithBytes(testIo, bytesObjectUri, parts, partSize); + verifyDistinctPartsWithBytes(testIo, bytesObjectUri, parts, partSize); + } + } + private void writeInts(String fileUri, int parts, Supplier writer) { writeInts(fileUri, parts, S3FileIOProperties.MULTIPART_SIZE_MIN, writer); } @@ -177,4 +210,61 @@ private void writeBytes(String fileUri, int parts, Supplier writer) { throw new RuntimeException(e); } } + + private void writeDistinctPartsWithInts(S3FileIO fileIO, String fileUri, int parts, long partSize) + throws IOException { + try (PositionOutputStream outputStream = fileIO.newOutputFile(fileUri).create()) { + for (int part = 0; part < parts; part++) { + int partByte = part + 1; + for (long j = 0; j < partSize; j++) { + 
outputStream.write(partByte); + } + } + } + + assertThat(fileIO.newInputFile(fileUri).getLength()).isEqualTo(parts * partSize); + } + + private void verifyDistinctPartsWithInts( + S3FileIO fileIO, String fileUri, int parts, long partSize) throws IOException { + try (SeekableInputStream inputStream = fileIO.newInputFile(fileUri).newStream()) { + byte[] readBuffer = new byte[(int) partSize]; + for (int part = 0; part < parts; part++) { + byte expectedByte = (byte) (part + 1); + IOUtil.readFully(inputStream, readBuffer, 0, (int) partSize); + for (int i = 0; i < (int) partSize; i++) { + assertThat(readBuffer[i]).as("part %d, offset %d", part, i).isEqualTo(expectedByte); + } + } + assertThat(inputStream.read()).as("expected end of stream").isEqualTo(-1); + } + } + + private void writeDistinctPartsWithBytes( + S3FileIO fileIO, String fileUri, int parts, long partSize) throws IOException { + try (PositionOutputStream outputStream = fileIO.newOutputFile(fileUri).create()) { + for (int part = 0; part < parts; part++) { + byte[] partBytes = new byte[(int) partSize]; + Arrays.fill(partBytes, (byte) (part + 1)); + outputStream.write(partBytes); + } + } + + assertThat(fileIO.newInputFile(fileUri).getLength()).isEqualTo(parts * partSize); + } + + private void verifyDistinctPartsWithBytes( + S3FileIO fileIO, String fileUri, int parts, long partSize) throws IOException { + try (SeekableInputStream inputStream = fileIO.newInputFile(fileUri).newStream()) { + byte[] readBuffer = new byte[(int) partSize]; + for (int part = 0; part < parts; part++) { + byte expectedByte = (byte) (part + 1); + IOUtil.readFully(inputStream, readBuffer, 0, (int) partSize); + for (int i = 0; i < (int) partSize; i++) { + assertThat(readBuffer[i]).as("part %d, offset %d", part, i).isEqualTo(expectedByte); + } + } + assertThat(inputStream.read()).as("expected end of stream").isEqualTo(-1); + } + } } diff --git a/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/S3SignerServlet.java 
b/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/S3SignerServlet.java index 038d76b03e4b..5d334eafa582 100644 --- a/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/S3SignerServlet.java +++ b/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/S3SignerServlet.java @@ -18,17 +18,6 @@ */ package org.apache.iceberg.aws.s3.signer; -import static java.lang.String.format; -import static org.apache.iceberg.rest.RESTCatalogAdapter.castRequest; -import static org.apache.iceberg.rest.RESTCatalogAdapter.castResponse; -import static org.assertj.core.api.AssertionsForClassTypes.assertThat; - -import com.fasterxml.jackson.databind.ObjectMapper; -import jakarta.servlet.http.HttpServlet; -import jakarta.servlet.http.HttpServletRequest; -import jakarta.servlet.http.HttpServletResponse; -import java.io.InputStreamReader; -import java.io.Reader; import java.time.Clock; import java.time.Instant; import java.time.ZoneId; @@ -37,23 +26,15 @@ import java.util.Locale; import java.util.Map; import java.util.Set; -import java.util.function.Predicate; import java.util.stream.Collectors; -import java.util.stream.Stream; -import org.apache.hc.core5.http.ContentType; -import org.apache.hc.core5.http.HttpHeaders; -import org.apache.iceberg.exceptions.RESTException; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.relocated.com.google.common.io.CharStreams; -import org.apache.iceberg.rest.RESTUtil; -import org.apache.iceberg.rest.ResourcePaths; -import org.apache.iceberg.rest.responses.ErrorResponse; -import org.apache.iceberg.rest.responses.OAuthTokenResponse; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import 
org.apache.iceberg.rest.HttpMethod; +import org.apache.iceberg.rest.RemoteSignerServlet; +import org.apache.iceberg.rest.requests.RemoteSignRequest; +import org.apache.iceberg.rest.responses.ImmutableRemoteSignResponse; +import org.apache.iceberg.rest.responses.RemoteSignResponse; import software.amazon.awssdk.auth.signer.AwsS3V4Signer; import software.amazon.awssdk.auth.signer.params.AwsS3V4SignerParams; import software.amazon.awssdk.http.SdkHttpFullRequest; @@ -65,113 +46,37 @@ * {@link S3SignerServlet} provides a simple servlet implementation to emulate the server-side * behavior of signing S3 requests and handling OAuth. */ -public class S3SignerServlet extends HttpServlet { - - private static final Logger LOG = LoggerFactory.getLogger(S3SignerServlet.class); +public class S3SignerServlet extends RemoteSignerServlet { static final Clock SIGNING_CLOCK = Clock.fixed(Instant.now(), ZoneId.of("UTC")); static final Set UNSIGNED_HEADERS = Sets.newHashSet( Arrays.asList("range", "x-amz-date", "amz-sdk-invocation-id", "amz-sdk-retry")); - private static final String POST = "POST"; - - private static final Set CACHEABLE_METHODS = - Stream.of(SdkHttpMethod.GET, SdkHttpMethod.HEAD).collect(Collectors.toSet()); - - private final Map responseHeaders = - ImmutableMap.of(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); - private final ObjectMapper mapper; - - private List s3SignRequestValidators = Lists.newArrayList(); - - /** - * SignRequestValidator is a wrapper class used for validating the contents of the S3SignRequest - * and thus verifying the behavior of the client during testing. 
- */ - public static class SignRequestValidator { - private final Predicate requestMatcher; - private final Predicate requestExpectation; - private final String assertMessage; - - public SignRequestValidator( - Predicate requestExpectation, - Predicate requestMatcher, - String assertMessage) { - this.requestExpectation = requestExpectation; - this.requestMatcher = requestMatcher; - this.assertMessage = assertMessage; - } - - void validateRequest(S3SignRequest request) { - if (requestMatcher.test(request)) { - assertThat(requestExpectation.test(request)).as(assertMessage).isTrue(); - } - } - } - - public S3SignerServlet(ObjectMapper mapper) { - this.mapper = mapper; - } - - public S3SignerServlet(ObjectMapper mapper, List s3SignRequestValidators) { - this.mapper = mapper; - this.s3SignRequestValidators = s3SignRequestValidators; - } - - @Override - protected void doGet(HttpServletRequest request, HttpServletResponse response) { - execute(request, response); - } - @Override - protected void doHead(HttpServletRequest request, HttpServletResponse response) { - execute(request, response); - } + /** A fake remote signing endpoint for testing purposes. 
*/ + static final String S3_SIGNER_ENDPOINT = "v1/namespaces/ns1/tables/t1/sign"; - @Override - protected void doPost(HttpServletRequest request, HttpServletResponse response) { - execute(request, response); + public S3SignerServlet() { + super(S3_SIGNER_ENDPOINT); } @Override - protected void doDelete(HttpServletRequest request, HttpServletResponse response) { - execute(request, response); - } - - private OAuthTokenResponse handleOAuth(Map requestMap) { - String grantType = requestMap.get("grant_type"); - switch (grantType) { - case "client_credentials": - return castResponse( - OAuthTokenResponse.class, - OAuthTokenResponse.builder() - .withToken("client-credentials-token:sub=" + requestMap.get("client_id")) - .withIssuedTokenType("urn:ietf:params:oauth:token-type:access_token") - .withTokenType("Bearer") - .setExpirationInSeconds(10000) - .build()); - - case "urn:ietf:params:oauth:grant-type:token-exchange": - String actor = requestMap.get("actor_token"); - String token = - String.format( - "token-exchange-token:sub=%s%s", - requestMap.get("subject_token"), actor != null ? 
",act=" + actor : ""); - return castResponse( - OAuthTokenResponse.class, - OAuthTokenResponse.builder() - .withToken(token) - .withIssuedTokenType("urn:ietf:params:oauth:token-type:access_token") - .withTokenType("Bearer") - .setExpirationInSeconds(10000) - .build()); - - default: - throw new UnsupportedOperationException("Unsupported grant_type: " + grantType); + protected void validateSignRequest(RemoteSignRequest request) { + Preconditions.checkArgument( + request.provider() == null || "s3".equalsIgnoreCase(request.provider()), + "Unsupported provider: %s", + request.provider()); + if (HttpMethod.POST.name().equalsIgnoreCase(request.method()) + && request.uri().getQuery().contains("delete")) { + String body = request.body(); + Preconditions.checkArgument( + body != null && !body.isEmpty(), + "Sign request for delete objects should have a request body"); } } - private S3SignResponse signRequest(S3SignRequest request) { + @Override + protected RemoteSignResponse signRequest(RemoteSignRequest request) { AwsS3V4SignerParams signingParams = AwsS3V4SignerParams.builder() .awsCredentials(TestS3RestSigner.CREDENTIALS_PROVIDER.resolveCredentials()) @@ -207,59 +112,6 @@ private S3SignResponse signRequest(S3SignRequest request) { Map> headers = Maps.newHashMap(sign.headers()); headers.putAll(unsignedHeaders); - return ImmutableS3SignResponse.builder().uri(request.uri()).headers(headers).build(); - } - - protected void execute(HttpServletRequest request, HttpServletResponse response) { - response.setStatus(HttpServletResponse.SC_OK); - responseHeaders.forEach(response::setHeader); - - String path = request.getRequestURI().substring(1); - Object requestBody; - try { - // we only need to handle oauth tokens & s3 sign request routes here as those are the only - // requests that are being done by the S3V4RestSignerClient - if (POST.equals(request.getMethod()) - && S3V4RestSignerClient.S3_SIGNER_DEFAULT_ENDPOINT.equals(path)) { - S3SignRequest s3SignRequest = - castRequest( - 
S3SignRequest.class, mapper.readValue(request.getReader(), S3SignRequest.class)); - s3SignRequestValidators.forEach(validator -> validator.validateRequest(s3SignRequest)); - S3SignResponse s3SignResponse = signRequest(s3SignRequest); - if (CACHEABLE_METHODS.contains(SdkHttpMethod.fromValue(s3SignRequest.method()))) { - // tell the client this can be cached - response.setHeader( - S3V4RestSignerClient.CACHE_CONTROL, S3V4RestSignerClient.CACHE_CONTROL_PRIVATE); - } else { - response.setHeader( - S3V4RestSignerClient.CACHE_CONTROL, S3V4RestSignerClient.CACHE_CONTROL_NO_CACHE); - } - - mapper.writeValue(response.getWriter(), s3SignResponse); - } else if (POST.equals(request.getMethod()) && ResourcePaths.tokens().equals(path)) { - try (Reader reader = new InputStreamReader(request.getInputStream())) { - requestBody = RESTUtil.decodeFormData(CharStreams.toString(reader)); - } - - OAuthTokenResponse oAuthTokenResponse = - handleOAuth((Map) castRequest(Map.class, requestBody)); - mapper.writeValue(response.getWriter(), oAuthTokenResponse); - } else { - response.setStatus(HttpServletResponse.SC_BAD_REQUEST); - mapper.writeValue( - response.getWriter(), - ErrorResponse.builder() - .responseCode(400) - .withType("BadRequestException") - .withMessage(format("No route for request: %s %s", request.getMethod(), path)) - .build()); - } - } catch (RESTException e) { - LOG.error("Error processing REST request", e); - response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR); - } catch (Exception e) { - LOG.error("Unexpected exception when processing REST request", e); - response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR); - } + return ImmutableRemoteSignResponse.builder().uri(request.uri()).headers(headers).build(); } } diff --git a/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java b/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java index b51d97cc611a..4e5ed3d91870 100644 --- 
a/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java +++ b/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java @@ -33,14 +33,15 @@ import java.util.stream.Collectors; import javax.annotation.Nonnull; import org.apache.iceberg.aws.s3.MinioUtil; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.rest.RESTCatalogProperties; import org.apache.iceberg.rest.auth.OAuth2Properties; import org.apache.iceberg.util.ThreadPools; +import org.eclipse.jetty.compression.gzip.GzipCompression; +import org.eclipse.jetty.compression.server.CompressionHandler; +import org.eclipse.jetty.ee10.servlet.ServletContextHandler; +import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Server; -import org.eclipse.jetty.server.handler.gzip.GzipHandler; -import org.eclipse.jetty.servlet.ServletContextHandler; -import org.eclipse.jetty.servlet.ServletHolder; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; @@ -107,8 +108,10 @@ public static void beforeClass() throws Exception { ImmutableS3V4RestSignerClient.builder() .properties( ImmutableMap.of( - S3V4RestSignerClient.S3_SIGNER_URI, + RESTCatalogProperties.SIGNER_URI, httpServer.getURI().toString(), + RESTCatalogProperties.SIGNER_ENDPOINT, + S3SignerServlet.S3_SIGNER_ENDPOINT, OAuth2Properties.CREDENTIAL, "catalog:12345")) .build(), @@ -182,19 +185,13 @@ public void before() throws Exception { } private static Server initHttpServer() throws Exception { - S3SignerServlet.SignRequestValidator deleteObjectsWithBody = - new S3SignerServlet.SignRequestValidator( - (s3SignRequest) -> - "post".equalsIgnoreCase(s3SignRequest.method()) - && s3SignRequest.uri().getQuery().contains("delete"), - (s3SignRequest) -> s3SignRequest.body() != null && !s3SignRequest.body().isEmpty(), 
- "Sign request for delete objects should have a request body"); - S3SignerServlet servlet = - new S3SignerServlet(S3ObjectMapper.mapper(), ImmutableList.of(deleteObjectsWithBody)); + S3SignerServlet servlet = new S3SignerServlet(); ServletContextHandler servletContext = new ServletContextHandler(ServletContextHandler.NO_SESSIONS); servletContext.addServlet(new ServletHolder(servlet), "/*"); - servletContext.setHandler(new GzipHandler()); + CompressionHandler compressionHandler = new CompressionHandler(); + compressionHandler.putCompression(new GzipCompression()); + servletContext.insertHandler(compressionHandler); Server server = new Server(new InetSocketAddress(InetAddress.getLoopbackAddress(), 0)); server.setHandler(servletContext); diff --git a/aws/src/main/java/org/apache/iceberg/aws/ApacheHttpClientConfigurations.java b/aws/src/main/java/org/apache/iceberg/aws/ApacheHttpClientConfigurations.java index 3445928d1551..30065c8db510 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/ApacheHttpClientConfigurations.java +++ b/aws/src/main/java/org/apache/iceberg/aws/ApacheHttpClientConfigurations.java @@ -41,6 +41,8 @@ class ApacheHttpClientConfigurations extends BaseHttpClientConfigurations { private Boolean tcpKeepAliveEnabled; private Boolean useIdleConnectionReaperEnabled; private String proxyEndpoint; + private Boolean proxyUseSystemPropertyValues; + private Boolean proxyUseEnvironmentVariableValues; private ApacheHttpClientConfigurations() {} @@ -82,6 +84,12 @@ private void initialize(Map httpClientProperties) { this.proxyEndpoint = PropertyUtil.propertyAsString( httpClientProperties, HttpClientProperties.PROXY_ENDPOINT, null); + this.proxyUseSystemPropertyValues = + PropertyUtil.propertyAsNullableBoolean( + httpClientProperties, HttpClientProperties.PROXY_USE_SYSTEM_PROPERTY_VALUES); + this.proxyUseEnvironmentVariableValues = + PropertyUtil.propertyAsNullableBoolean( + httpClientProperties, HttpClientProperties.PROXY_USE_ENVIRONMENT_VARIABLE_VALUES); } 
@VisibleForTesting @@ -113,9 +121,26 @@ void configureApacheHttpClientBuilder(ApacheHttpClient.Builder apacheHttpClientB if (useIdleConnectionReaperEnabled != null) { apacheHttpClientBuilder.useIdleConnectionReaper(useIdleConnectionReaperEnabled); } - if (proxyEndpoint != null) { - apacheHttpClientBuilder.proxyConfiguration( - ProxyConfiguration.builder().endpoint(URI.create(proxyEndpoint)).build()); + configureProxy(apacheHttpClientBuilder); + } + + private void configureProxy(ApacheHttpClient.Builder apacheHttpClientBuilder) { + if (proxyEndpoint != null + || proxyUseSystemPropertyValues != null + || proxyUseEnvironmentVariableValues != null) { + ProxyConfiguration.Builder proxyBuilder = ProxyConfiguration.builder(); + + if (proxyEndpoint != null) { + proxyBuilder.endpoint(URI.create(proxyEndpoint)); + } + if (proxyUseSystemPropertyValues != null) { + proxyBuilder.useSystemPropertyValues(proxyUseSystemPropertyValues); + } + if (proxyUseEnvironmentVariableValues != null) { + proxyBuilder.useEnvironmentVariableValues(proxyUseEnvironmentVariableValues); + } + + apacheHttpClientBuilder.proxyConfiguration(proxyBuilder.build()); } } @@ -138,6 +163,8 @@ protected String generateHttpClientCacheKey() { keyComponents.put("tcpKeepAliveEnabled", tcpKeepAliveEnabled); keyComponents.put("useIdleConnectionReaperEnabled", useIdleConnectionReaperEnabled); keyComponents.put("proxyEndpoint", proxyEndpoint); + keyComponents.put("proxyUseSystemPropertyValues", proxyUseSystemPropertyValues); + keyComponents.put("proxyUseEnvironmentVariableValues", proxyUseEnvironmentVariableValues); return keyComponents.entrySet().stream() .map(entry -> entry.getKey() + "=" + Objects.toString(entry.getValue(), "null")) diff --git a/aws/src/main/java/org/apache/iceberg/aws/HttpClientProperties.java b/aws/src/main/java/org/apache/iceberg/aws/HttpClientProperties.java index 438ae5bb0431..870d8e23651c 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/HttpClientProperties.java +++ 
b/aws/src/main/java/org/apache/iceberg/aws/HttpClientProperties.java @@ -61,6 +61,30 @@ public class HttpClientProperties implements Serializable { */ public static final String PROXY_ENDPOINT = "http-client.proxy-endpoint"; + /** + * Used to enable reading proxy configuration from Java system properties (http.proxyHost, + * http.proxyPort, http.nonProxyHosts, etc.). Default is true. + * + *

For more details, see + * https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/urlconnection/ProxyConfiguration.html + * and + * https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/apache/ProxyConfiguration.html + */ + public static final String PROXY_USE_SYSTEM_PROPERTY_VALUES = + "http-client.proxy-use-system-property-values"; + + /** + * Used to enable reading proxy configuration from environment variables (HTTP_PROXY, HTTPS_PROXY, + * NO_PROXY, etc.). Default is true. + * + *

For more details, see + * https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/urlconnection/ProxyConfiguration.html + * and + * https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/apache/ProxyConfiguration.html + */ + public static final String PROXY_USE_ENVIRONMENT_VARIABLE_VALUES = + "http-client.proxy-use-environment-variable-values"; + /** * Used to configure the connection timeout in milliseconds for {@link * software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient.Builder}. This flag only diff --git a/aws/src/main/java/org/apache/iceberg/aws/RESTSigV4AuthSession.java b/aws/src/main/java/org/apache/iceberg/aws/RESTSigV4AuthSession.java index 98808ead4f0b..48281841be37 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/RESTSigV4AuthSession.java +++ b/aws/src/main/java/org/apache/iceberg/aws/RESTSigV4AuthSession.java @@ -18,12 +18,15 @@ */ package org.apache.iceberg.aws; +import java.io.IOException; +import java.io.UncheckedIOException; import java.net.URI; import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Map; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; +import org.apache.iceberg.io.CloseableGroup; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.rest.HTTPHeaders; import org.apache.iceberg.rest.HTTPHeaders.HTTPHeader; @@ -64,16 +67,23 @@ public class RESTSigV4AuthSession implements AuthSession { private final Region signingRegion; private final String signingName; private final AwsCredentialsProvider credentialsProvider; + private final CloseableGroup closeableGroup; @SuppressWarnings("deprecation") public RESTSigV4AuthSession( Aws4Signer aws4Signer, AuthSession delegateAuthSession, AwsProperties awsProperties) { + this.closeableGroup = new CloseableGroup(); + this.closeableGroup.setSuppressCloseFailure(true); this.signer = Preconditions.checkNotNull(aws4Signer, "Invalid signer: null"); this.delegate = 
Preconditions.checkNotNull(delegateAuthSession, "Invalid delegate: null"); + this.closeableGroup.addCloseable(this.delegate); Preconditions.checkNotNull(awsProperties, "Invalid AWS properties: null"); this.signingRegion = awsProperties.restSigningRegion(); this.signingName = awsProperties.restSigningName(); this.credentialsProvider = awsProperties.restCredentialsProvider(); + if (credentialsProvider instanceof AutoCloseable closeableCredentialsProvider) { + this.closeableGroup.addCloseable(closeableCredentialsProvider); + } } public AuthSession delegate() { @@ -87,7 +97,11 @@ public HTTPRequest authenticate(HTTPRequest request) { @Override public void close() { - delegate.close(); + try { + closeableGroup.close(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } } @SuppressWarnings("deprecation") diff --git a/aws/src/main/java/org/apache/iceberg/aws/S3FileIOAwsClientFactories.java b/aws/src/main/java/org/apache/iceberg/aws/S3FileIOAwsClientFactories.java index 4aec0bda2a13..3306163baffd 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/S3FileIOAwsClientFactories.java +++ b/aws/src/main/java/org/apache/iceberg/aws/S3FileIOAwsClientFactories.java @@ -32,7 +32,7 @@ private S3FileIOAwsClientFactories() {} /** * Attempts to load an AWS client factory class for S3 file IO defined in the catalog property * {@link S3FileIOProperties#CLIENT_FACTORY}. 
If the property wasn't set, fallback to {@link - * AwsClientFactories#from(Map) to intialize an AWS client factory class} + * AwsClientFactories#from(Map) to initialize an AWS client factory class} * * @param properties catalog properties * @return an instance of a factory class diff --git a/aws/src/main/java/org/apache/iceberg/aws/UrlConnectionHttpClientConfigurations.java b/aws/src/main/java/org/apache/iceberg/aws/UrlConnectionHttpClientConfigurations.java index 273baa674804..fbd845852ca9 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/UrlConnectionHttpClientConfigurations.java +++ b/aws/src/main/java/org/apache/iceberg/aws/UrlConnectionHttpClientConfigurations.java @@ -35,6 +35,8 @@ class UrlConnectionHttpClientConfigurations extends BaseHttpClientConfigurations private Long httpClientUrlConnectionConnectionTimeoutMs; private Long httpClientUrlConnectionSocketTimeoutMs; private String proxyEndpoint; + private Boolean proxyUseSystemPropertyValues; + private Boolean proxyUseEnvironmentVariableValues; private UrlConnectionHttpClientConfigurations() {} @@ -56,6 +58,12 @@ private void initialize(Map httpClientProperties) { this.proxyEndpoint = PropertyUtil.propertyAsString( httpClientProperties, HttpClientProperties.PROXY_ENDPOINT, null); + this.proxyUseSystemPropertyValues = + PropertyUtil.propertyAsNullableBoolean( + httpClientProperties, HttpClientProperties.PROXY_USE_SYSTEM_PROPERTY_VALUES); + this.proxyUseEnvironmentVariableValues = + PropertyUtil.propertyAsNullableBoolean( + httpClientProperties, HttpClientProperties.PROXY_USE_ENVIRONMENT_VARIABLE_VALUES); } @VisibleForTesting @@ -69,9 +77,26 @@ void configureUrlConnectionHttpClientBuilder( urlConnectionHttpClientBuilder.socketTimeout( Duration.ofMillis(httpClientUrlConnectionSocketTimeoutMs)); } - if (proxyEndpoint != null) { - urlConnectionHttpClientBuilder.proxyConfiguration( - ProxyConfiguration.builder().endpoint(URI.create(proxyEndpoint)).build()); + configureProxy(urlConnectionHttpClientBuilder); + 
} + + private void configureProxy(UrlConnectionHttpClient.Builder urlConnectionHttpClientBuilder) { + if (proxyEndpoint != null + || proxyUseSystemPropertyValues != null + || proxyUseEnvironmentVariableValues != null) { + ProxyConfiguration.Builder proxyBuilder = ProxyConfiguration.builder(); + + if (proxyEndpoint != null) { + proxyBuilder.endpoint(URI.create(proxyEndpoint)); + } + if (proxyUseSystemPropertyValues != null) { + proxyBuilder.useSystemPropertyValues(proxyUseSystemPropertyValues); + } + if (proxyUseEnvironmentVariableValues != null) { + proxyBuilder.useEnvironmentVariablesValues(proxyUseEnvironmentVariableValues); + } + + urlConnectionHttpClientBuilder.proxyConfiguration(proxyBuilder.build()); } } @@ -87,6 +112,8 @@ protected String generateHttpClientCacheKey() { keyComponents.put("connectionTimeoutMs", httpClientUrlConnectionConnectionTimeoutMs); keyComponents.put("socketTimeoutMs", httpClientUrlConnectionSocketTimeoutMs); keyComponents.put("proxyEndpoint", proxyEndpoint); + keyComponents.put("proxyUseSystemPropertyValues", proxyUseSystemPropertyValues); + keyComponents.put("proxyUseEnvironmentVariableValues", proxyUseEnvironmentVariableValues); return keyComponents.entrySet().stream() .map(entry -> entry.getKey() + "=" + Objects.toString(entry.getValue(), "null")) diff --git a/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbCatalog.java b/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbCatalog.java index 0c991af75076..7c75f99d6d69 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbCatalog.java +++ b/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbCatalog.java @@ -53,6 +53,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.util.LocationUtil; +import org.apache.iceberg.util.PropertyUtil; import org.apache.iceberg.util.Tasks; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ 
-112,6 +113,7 @@ public class DynamoDbCatalog extends BaseMetastoreCatalog private FileIO fileIO; private CloseableGroup closeableGroup; private Map catalogProperties; + private boolean uniqueTableLocation; public DynamoDbCatalog() {} @@ -123,12 +125,21 @@ public void initialize(String name, Map properties) { properties.get(CatalogProperties.WAREHOUSE_LOCATION), new AwsProperties(properties), AwsClientFactories.from(properties).dynamo(), - initializeFileIO(properties)); + initializeFileIO(properties), + PropertyUtil.propertyAsBoolean( + properties, + CatalogProperties.UNIQUE_TABLE_LOCATION, + CatalogProperties.UNIQUE_TABLE_LOCATION_DEFAULT)); } @VisibleForTesting void initialize( - String name, String path, AwsProperties properties, DynamoDbClient client, FileIO io) { + String name, + String path, + AwsProperties properties, + DynamoDbClient client, + FileIO io, + boolean uniqTableLocation) { Preconditions.checkArgument( !Strings.isNullOrEmpty(path), "Cannot initialize DynamoDbCatalog because warehousePath must not be null or empty"); @@ -138,6 +149,7 @@ void initialize( this.warehousePath = LocationUtil.stripTrailingSlash(path); this.dynamo = client; this.fileIO = io; + this.uniqueTableLocation = uniqTableLocation; this.closeableGroup = new CloseableGroup(); closeableGroup.addCloseable(dynamo); @@ -177,12 +189,12 @@ protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { } String defaultLocationCol = toPropertyCol(PROPERTY_DEFAULT_LOCATION); + String tableLocation = LocationUtil.tableLocation(tableIdentifier, uniqueTableLocation); if (response.item().containsKey(defaultLocationCol)) { - return String.format( - "%s/%s", response.item().get(defaultLocationCol).s(), tableIdentifier.name()); + return String.format("%s/%s", response.item().get(defaultLocationCol).s(), tableLocation); } else { return String.format( - "%s/%s.db/%s", warehousePath, tableIdentifier.namespace(), tableIdentifier.name()); + "%s/%s.db/%s", warehousePath, 
tableIdentifier.namespace(), tableLocation); } } diff --git a/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java b/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java index 47807a2b9f37..94e53cc1ab69 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java +++ b/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java @@ -89,6 +89,7 @@ public class GlueCatalog extends BaseMetastoreCatalog private Object hadoopConf; private String catalogName; private String warehousePath; + private boolean uniqueTableLocation; private AwsProperties awsProperties; private S3FileIOProperties s3FileIOProperties; private LockManager lockManager; @@ -144,7 +145,11 @@ public void initialize(String name, Map properties) { new AwsProperties(properties), new S3FileIOProperties(properties), awsClientFactory.glue(), - initializeLockManager(properties)); + initializeLockManager(properties), + PropertyUtil.propertyAsBoolean( + properties, + CatalogProperties.UNIQUE_TABLE_LOCATION, + CatalogProperties.UNIQUE_TABLE_LOCATION_DEFAULT)); } private LockManager initializeLockManager(Map properties) { @@ -172,7 +177,17 @@ void initialize( LockManager lock, Map catalogProps) { this.catalogProperties = catalogProps; - initialize(name, path, properties, s3Properties, client, lock); + initialize( + name, + path, + properties, + s3Properties, + client, + lock, + PropertyUtil.propertyAsBoolean( + catalogProps, + CatalogProperties.UNIQUE_TABLE_LOCATION, + CatalogProperties.UNIQUE_TABLE_LOCATION_DEFAULT)); } @VisibleForTesting @@ -182,13 +197,15 @@ void initialize( AwsProperties properties, S3FileIOProperties s3Properties, GlueClient client, - LockManager lock) { + LockManager lock, + boolean uniqTableLocation) { this.catalogName = name; this.awsProperties = properties; this.s3FileIOProperties = s3Properties; this.warehousePath = Strings.isNullOrEmpty(path) ? 
null : LocationUtil.stripTrailingSlash(path); this.glue = client; this.lockManager = lock; + this.uniqueTableLocation = uniqTableLocation; this.closeableGroup = new CloseableGroup(); this.fileIOTracker = new FileIOTracker(); @@ -278,9 +295,10 @@ protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { tableIdentifier, awsProperties.glueCatalogSkipNameValidation())) .build()); String dbLocationUri = response.database().locationUri(); + String tableLocation = LocationUtil.tableLocation(tableIdentifier, uniqueTableLocation); if (dbLocationUri != null) { dbLocationUri = LocationUtil.stripTrailingSlash(dbLocationUri); - return String.format("%s/%s", dbLocationUri, tableIdentifier.name()); + return String.format("%s/%s", dbLocationUri, tableLocation); } ValidationException.check( @@ -292,7 +310,7 @@ protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { warehousePath, IcebergToGlueConverter.getDatabaseName( tableIdentifier, awsProperties.glueCatalogSkipNameValidation()), - tableIdentifier.name()); + tableLocation); } @Override diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIOProperties.java b/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIOProperties.java index ad5181fd2798..922010d61d27 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIOProperties.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIOProperties.java @@ -295,6 +295,18 @@ public class S3FileIOProperties implements Serializable { public static final boolean REMOTE_SIGNING_ENABLED_DEFAULT = false; + /** + * Enables or disables chunked encoding for S3 requests. + * + *

This feature is enabled by default to match the AWS SDK default behavior. + * + *

For more details see: + * https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/services/s3/S3Configuration.html#chunkedEncodingEnabled() + */ + public static final String CHUNKED_ENCODING_ENABLED = "s3.chunked-encoding-enabled"; + + public static final boolean CHUNKED_ENCODING_ENABLED_DEFAULT = true; + /** Configure the batch size used when deleting multiple files from a given S3 bucket */ public static final String DELETE_BATCH_SIZE = "s3.delete.batch-size"; @@ -509,6 +521,7 @@ public class S3FileIOProperties implements Serializable { private String stagingDirectory; private ObjectCannedACL acl; private boolean isChecksumEnabled; + private boolean isChunkedEncodingEnabled; private final Set writeTags; private boolean isWriteTableTagEnabled; private boolean isWriteNamespaceTagEnabled; @@ -551,6 +564,7 @@ public S3FileIOProperties() { this.deleteBatchSize = DELETE_BATCH_SIZE_DEFAULT; this.stagingDirectory = System.getProperty("java.io.tmpdir"); this.isChecksumEnabled = CHECKSUM_ENABLED_DEFAULT; + this.isChunkedEncodingEnabled = CHUNKED_ENCODING_ENABLED_DEFAULT; this.writeTags = Sets.newHashSet(); this.isWriteTableTagEnabled = WRITE_TABLE_TAG_ENABLED_DEFAULT; this.isWriteNamespaceTagEnabled = WRITE_NAMESPACE_TAG_ENABLED_DEFAULT; @@ -641,6 +655,9 @@ public S3FileIOProperties(Map properties) { "Cannot support S3 CannedACL " + aclType); this.isChecksumEnabled = PropertyUtil.propertyAsBoolean(properties, CHECKSUM_ENABLED, CHECKSUM_ENABLED_DEFAULT); + this.isChunkedEncodingEnabled = + PropertyUtil.propertyAsBoolean( + properties, CHUNKED_ENCODING_ENABLED, CHUNKED_ENCODING_ENABLED_DEFAULT); this.deleteBatchSize = PropertyUtil.propertyAsInt(properties, DELETE_BATCH_SIZE, DELETE_BATCH_SIZE_DEFAULT); Preconditions.checkArgument( @@ -808,6 +825,10 @@ public boolean isChecksumEnabled() { return this.isChecksumEnabled; } + public boolean isChunkedEncodingEnabled() { + return this.isChunkedEncodingEnabled; + } + public boolean isRemoteSigningEnabled() { return 
this.isRemoteSigningEnabled; } @@ -994,6 +1015,7 @@ public void applyServiceConfigurations(T builder) { .pathStyleAccessEnabled(isPathStyleAccess) .useArnRegionEnabled(isUseArnRegionEnabled) .accelerateModeEnabled(isAccelerationEnabled) + .chunkedEncodingEnabled(isChunkedEncodingEnabled) .build()); } diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3ObjectMapper.java b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3ObjectMapper.java index 89145b2465e5..7f1d6c3cc848 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3ObjectMapper.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3ObjectMapper.java @@ -40,6 +40,10 @@ import org.apache.iceberg.rest.responses.ErrorResponse; import org.apache.iceberg.rest.responses.OAuthTokenResponse; +/** + * @deprecated since 1.11.0, will be removed in 1.12.0; use {@code RESTObjectMapper} instead. + */ +@Deprecated public class S3ObjectMapper { private static final JsonFactory FACTORY = new JsonFactory(); diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignRequest.java b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignRequest.java index 879ce8599352..995f6e7e4860 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignRequest.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignRequest.java @@ -18,31 +18,13 @@ */ package org.apache.iceberg.aws.s3.signer; -import java.net.URI; -import java.util.List; -import java.util.Map; -import javax.annotation.Nullable; -import org.apache.iceberg.rest.RESTRequest; +import org.apache.iceberg.rest.requests.RemoteSignRequest; import org.immutables.value.Value; +/** + * @deprecated since 1.11.0, will be removed in 1.12.0; use {@link RemoteSignRequest} instead. 
+ */ +@Deprecated @Value.Immutable -public interface S3SignRequest extends RESTRequest { - String region(); - - String method(); - - URI uri(); - - Map> headers(); - - Map properties(); - - @Value.Default - @Nullable - default String body() { - return null; - } - - @Override - default void validate() {} -} +@SuppressWarnings("immutables:subtype") +public interface S3SignRequest extends RemoteSignRequest {} diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignRequestParser.java b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignRequestParser.java index 3b5eb83612e2..5d2a7d684460 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignRequestParser.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignRequestParser.java @@ -21,108 +21,47 @@ import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.databind.JsonNode; import java.io.IOException; -import java.util.Arrays; import java.util.List; import java.util.Map; -import java.util.Map.Entry; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.JsonUtil; +import org.apache.iceberg.rest.requests.RemoteSignRequest; +import org.apache.iceberg.rest.requests.RemoteSignRequestParser; +/** + * @deprecated since 1.11.0, will be removed in 1.12.0; use {@link RemoteSignRequestParser} instead. 
+ */ +@Deprecated public class S3SignRequestParser { - private static final String REGION = "region"; - private static final String METHOD = "method"; - private static final String URI = "uri"; - private static final String HEADERS = "headers"; - private static final String PROPERTIES = "properties"; - private static final String BODY = "body"; - private S3SignRequestParser() {} public static String toJson(S3SignRequest request) { - return toJson(request, false); + return RemoteSignRequestParser.toJson(request, false); } public static String toJson(S3SignRequest request, boolean pretty) { - return JsonUtil.generate(gen -> toJson(request, gen), pretty); + return RemoteSignRequestParser.toJson(request, pretty); } public static void toJson(S3SignRequest request, JsonGenerator gen) throws IOException { - Preconditions.checkArgument(null != request, "Invalid s3 sign request: null"); - - gen.writeStartObject(); - - gen.writeStringField(REGION, request.region()); - gen.writeStringField(METHOD, request.method()); - gen.writeStringField(URI, request.uri().toString()); - headersToJson(HEADERS, request.headers(), gen); - - if (!request.properties().isEmpty()) { - JsonUtil.writeStringMap(PROPERTIES, request.properties(), gen); - } - - if (request.body() != null && !request.body().isEmpty()) { - gen.writeStringField(BODY, request.body()); - } - - gen.writeEndObject(); + RemoteSignRequestParser.toJson(request, gen); } public static S3SignRequest fromJson(String json) { - return JsonUtil.parse(json, S3SignRequestParser::fromJson); + RemoteSignRequest request = RemoteSignRequestParser.fromJson(json); + return ImmutableS3SignRequest.builder().from(request).build(); } public static S3SignRequest fromJson(JsonNode json) { - Preconditions.checkArgument(null != json, "Cannot parse s3 sign request from null object"); - Preconditions.checkArgument( - json.isObject(), "Cannot parse s3 sign request from non-object: %s", json); - - String region = JsonUtil.getString(REGION, json); - String 
method = JsonUtil.getString(METHOD, json); - java.net.URI uri = java.net.URI.create(JsonUtil.getString(URI, json)); - Map> headers = headersFromJson(HEADERS, json); - - ImmutableS3SignRequest.Builder builder = - ImmutableS3SignRequest.builder().region(region).method(method).uri(uri).headers(headers); - - if (json.has(PROPERTIES)) { - builder.properties(JsonUtil.getStringMap(PROPERTIES, json)); - } - - if (json.has(BODY)) { - builder.body(JsonUtil.getString(BODY, json)); - } - - return builder.build(); + RemoteSignRequest request = RemoteSignRequestParser.fromJson(json); + return ImmutableS3SignRequest.builder().from(request).build(); } static void headersToJson(String property, Map> headers, JsonGenerator gen) throws IOException { - gen.writeObjectFieldStart(property); - for (Entry> entry : headers.entrySet()) { - gen.writeFieldName(entry.getKey()); - - gen.writeStartArray(); - for (String val : entry.getValue()) { - gen.writeString(val); - } - gen.writeEndArray(); - } - gen.writeEndObject(); + RemoteSignRequestParser.headersToJson(property, headers, gen); } static Map> headersFromJson(String property, JsonNode json) { - Map> headers = Maps.newHashMap(); - JsonNode headersNode = JsonUtil.get(property, json); - headersNode - .properties() - .forEach( - entry -> { - String key = entry.getKey(); - List values = Arrays.asList(JsonUtil.getStringArray(entry.getValue())); - headers.put(key, values); - }); - return headers; + return RemoteSignRequestParser.headersFromJson(property, json); } } diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignResponse.java b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignResponse.java index 40c2059488f8..6fbaa90fe7af 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignResponse.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignResponse.java @@ -18,18 +18,13 @@ */ package org.apache.iceberg.aws.s3.signer; -import java.net.URI; -import java.util.List; -import java.util.Map; 
-import org.apache.iceberg.rest.RESTResponse; +import org.apache.iceberg.rest.responses.RemoteSignResponse; import org.immutables.value.Value; +/** + * @deprecated since 1.11.0, will be removed in 1.12.0; use {@link RemoteSignResponse} instead. + */ +@Deprecated @Value.Immutable -public interface S3SignResponse extends RESTResponse { - URI uri(); - - Map> headers(); - - @Override - default void validate() {} -} +@SuppressWarnings("immutables:subtype") +public interface S3SignResponse extends RemoteSignResponse {} diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignResponseParser.java b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignResponseParser.java index 69d6de8f04ac..be63a51b38fb 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignResponseParser.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignResponseParser.java @@ -21,49 +21,37 @@ import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.databind.JsonNode; import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.JsonUtil; +import org.apache.iceberg.rest.responses.RemoteSignResponse; +import org.apache.iceberg.rest.responses.RemoteSignResponseParser; +/** + * @deprecated since 1.11.0, will be removed in 1.12.0; use {@link RemoteSignResponseParser} + * instead. 
+ */ +@Deprecated public class S3SignResponseParser { - private static final String URI = "uri"; - private static final String HEADERS = "headers"; - private S3SignResponseParser() {} - public static String toJson(S3SignResponse request) { - return toJson(request, false); + public static String toJson(S3SignResponse response) { + return RemoteSignResponseParser.toJson(response, false); } - public static String toJson(S3SignResponse request, boolean pretty) { - return JsonUtil.generate(gen -> toJson(request, gen), pretty); + public static String toJson(S3SignResponse response, boolean pretty) { + return RemoteSignResponseParser.toJson(response, pretty); } public static void toJson(S3SignResponse response, JsonGenerator gen) throws IOException { - Preconditions.checkArgument(null != response, "Invalid s3 sign response: null"); - - gen.writeStartObject(); - - gen.writeStringField(URI, response.uri().toString()); - S3SignRequestParser.headersToJson(HEADERS, response.headers(), gen); - - gen.writeEndObject(); + RemoteSignResponseParser.toJson(response, gen); } public static S3SignResponse fromJson(String json) { - return JsonUtil.parse(json, S3SignResponseParser::fromJson); + RemoteSignResponse result = RemoteSignResponseParser.fromJson(json); + return ImmutableS3SignResponse.builder().from(result).build(); } public static S3SignResponse fromJson(JsonNode json) { - Preconditions.checkArgument(null != json, "Cannot parse s3 sign response from null object"); - Preconditions.checkArgument( - json.isObject(), "Cannot parse s3 sign response from non-object: %s", json); - - java.net.URI uri = java.net.URI.create(JsonUtil.getString(URI, json)); - Map> headers = S3SignRequestParser.headersFromJson(HEADERS, json); - - return ImmutableS3SignResponse.builder().uri(uri).headers(headers).build(); + RemoteSignResponse result = RemoteSignResponseParser.fromJson(json); + return ImmutableS3SignResponse.builder().from(result).build(); } } diff --git 
a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3V4RestSignerClient.java b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3V4RestSignerClient.java index 84b67bbdafc2..7a463abd3d2d 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3V4RestSignerClient.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3V4RestSignerClient.java @@ -37,6 +37,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.rest.ErrorHandlers; import org.apache.iceberg.rest.HTTPClient; +import org.apache.iceberg.rest.RESTCatalogProperties; import org.apache.iceberg.rest.RESTClient; import org.apache.iceberg.rest.RESTUtil; import org.apache.iceberg.rest.ResourcePaths; @@ -45,6 +46,9 @@ import org.apache.iceberg.rest.auth.AuthSession; import org.apache.iceberg.rest.auth.OAuth2Properties; import org.apache.iceberg.rest.auth.OAuth2Util; +import org.apache.iceberg.rest.requests.ImmutableRemoteSignRequest; +import org.apache.iceberg.rest.requests.RemoteSignRequest; +import org.apache.iceberg.rest.responses.RemoteSignResponse; import org.apache.iceberg.util.PropertyUtil; import org.immutables.value.Value; import org.slf4j.Logger; @@ -64,13 +68,30 @@ public abstract class S3V4RestSignerClient extends AbstractAws4Signer implements AutoCloseable { private static final Logger LOG = LoggerFactory.getLogger(S3V4RestSignerClient.class); - public static final String S3_SIGNER_URI = "s3.signer.uri"; - public static final String S3_SIGNER_ENDPOINT = "s3.signer.endpoint"; - static final String S3_SIGNER_DEFAULT_ENDPOINT = "v1/aws/s3/sign"; - static final String UNSIGNED_PAYLOAD = "UNSIGNED-PAYLOAD"; - static final String CACHE_CONTROL = "Cache-Control"; - static final String CACHE_CONTROL_PRIVATE = "private"; - static final String CACHE_CONTROL_NO_CACHE = "no-cache"; + + public static final String S3_PROVIDER = "s3"; + + /** + * @deprecated since 1.11.0, will be removed in 1.12.0; use {@link + * RESTCatalogProperties#SIGNER_URI} 
instead. + */ + @Deprecated public static final String S3_SIGNER_URI = "s3.signer.uri"; + + /** + * @deprecated since 1.11.0, will be removed in 1.12.0; use {@link + * RESTCatalogProperties#SIGNER_ENDPOINT} instead. + */ + @Deprecated public static final String S3_SIGNER_ENDPOINT = "s3.signer.endpoint"; + + /** + * @deprecated since 1.11.0, will be removed in 1.12.0; there is no replacement. + */ + @Deprecated static final String S3_SIGNER_DEFAULT_ENDPOINT = "v1/aws/s3/sign"; + + @VisibleForTesting static final String UNSIGNED_PAYLOAD = "UNSIGNED-PAYLOAD"; + + private static final String CACHE_CONTROL = "Cache-Control"; + private static final String CACHE_CONTROL_PRIVATE = "private"; private static final Cache SIGNED_COMPONENT_CACHE = Caffeine.newBuilder().expireAfterWrite(30, TimeUnit.SECONDS).maximumSize(100).build(); @@ -94,13 +115,28 @@ public Supplier> requestPropertiesSupplier() { @Value.Lazy public String baseSignerUri() { - return properties().getOrDefault(S3_SIGNER_URI, properties().get(CatalogProperties.URI)); + // TODO remove in 1.12.0 + if (properties().containsKey(S3_SIGNER_URI)) { + return properties().get(S3_SIGNER_URI); + } + + return properties() + .getOrDefault(RESTCatalogProperties.SIGNER_URI, properties().get(CatalogProperties.URI)); } @Value.Lazy public String endpoint() { - return RESTUtil.resolveEndpoint( - baseSignerUri(), properties().getOrDefault(S3_SIGNER_ENDPOINT, S3_SIGNER_DEFAULT_ENDPOINT)); + // TODO remove in 1.12.0 + String endpointPath; + if (properties().containsKey(S3_SIGNER_ENDPOINT)) { + endpointPath = properties().get(S3_SIGNER_ENDPOINT); + } else { + endpointPath = + properties() + .getOrDefault(RESTCatalogProperties.SIGNER_ENDPOINT, S3_SIGNER_DEFAULT_ENDPOINT); + } + + return RESTUtil.resolveEndpoint(baseSignerUri(), endpointPath); } /** A credential to exchange for a token in the OAuth2 client credentials flow.
*/ @@ -160,7 +196,6 @@ private RESTClient httpClient() { httpClient = HTTPClient.builder(properties()) .withHeaders(RESTUtil.configHeaders(properties())) - .withObjectMapper(S3ObjectMapper.mapper()) .build(); } } @@ -197,8 +232,36 @@ private boolean credentialProvided() { @Value.Check protected void check() { Preconditions.checkArgument( - properties().containsKey(S3_SIGNER_URI) || properties().containsKey(CatalogProperties.URI), + properties().containsKey(S3_SIGNER_URI) + || properties().containsKey(RESTCatalogProperties.SIGNER_URI) + || properties().containsKey(CatalogProperties.URI), "S3 signer service URI is required"); + + if (properties().containsKey(S3_SIGNER_URI) + && !properties().containsKey(RESTCatalogProperties.SIGNER_URI)) { + LOG.warn( + "S3 signer URI is configured via deprecated property {}, this won't be supported in future releases. " + + "Please use {} instead.", + S3_SIGNER_URI, + RESTCatalogProperties.SIGNER_URI); + } + + if (properties().containsKey(S3_SIGNER_ENDPOINT) + && !properties().containsKey(RESTCatalogProperties.SIGNER_ENDPOINT)) { + LOG.warn( + "Signer endpoint is configured via deprecated property {}, this won't be supported in future releases. " + + "Please use {} instead.", + S3_SIGNER_ENDPOINT, + RESTCatalogProperties.SIGNER_ENDPOINT); + } + + // TODO change to required in 1.12.0 + if (!properties().containsKey(S3_SIGNER_ENDPOINT) + && !properties().containsKey(RESTCatalogProperties.SIGNER_ENDPOINT)) { + LOG.warn( + "Signer endpoint is not set, this won't be supported in future releases. 
Using deprecated default: {}", + S3_SIGNER_DEFAULT_ENDPOINT); + } } @Override @@ -241,14 +304,15 @@ public SdkHttpFullRequest sign( AwsS3V4SignerParams signerParams = extractSignerParams(AwsS3V4SignerParams.builder(), executionAttributes).build(); - S3SignRequest remoteSigningRequest = - ImmutableS3SignRequest.builder() + RemoteSignRequest remoteSigningRequest = + ImmutableRemoteSignRequest.builder() .method(request.method().name()) .region(signerParams.signingRegion().id()) .uri(request.getUri()) .headers(request.headers()) .properties(requestPropertiesSupplier().get()) .body(bodyAsString(request)) + .provider(S3_PROVIDER) .build(); Key cacheKey = Key.from(remoteSigningRequest); @@ -260,21 +324,21 @@ public SdkHttpFullRequest sign( } else { Map responseHeaders = Maps.newHashMap(); Consumer> responseHeadersConsumer = responseHeaders::putAll; - S3SignResponse s3SignResponse = + RemoteSignResponse remoteSignResponse = httpClient() .withAuthSession(authSession()) .post( endpoint(), remoteSigningRequest, - S3SignResponse.class, + RemoteSignResponse.class, Map.of(), ErrorHandlers.defaultErrorHandler(), responseHeadersConsumer); signedComponent = ImmutableSignedComponent.builder() - .headers(s3SignResponse.headers()) - .signedURI(s3SignResponse.uri()) + .headers(remoteSignResponse.headers()) + .signedURI(remoteSignResponse.uri()) .build(); if (canBeCached(responseHeaders)) { @@ -351,7 +415,7 @@ interface Key { String uri(); - static Key from(S3SignRequest request) { + static Key from(RemoteSignRequest request) { return ImmutableKey.builder() .method(request.method()) .region(request.region()) diff --git a/aws/src/main/resources/s3-signer-open-api.yaml b/aws/src/main/resources/s3-signer-open-api.yaml index 3d719c515b2a..0b98fcc59eff 100644 --- a/aws/src/main/resources/s3-signer-open-api.yaml +++ b/aws/src/main/resources/s3-signer-open-api.yaml @@ -17,19 +17,22 @@ # under the License. # +# ⚠️ WARNING: this API is deprecated. 
Use the new remote signing endpoint instead, +# see open-api/rest-catalog-open-api.yaml. + --- openapi: 3.0.3 info: - title: Apache Iceberg S3 Signer API + title: "[DEPRECATED] Apache Iceberg S3 Signer API" license: name: Apache 2.0 url: https://www.apache.org/licenses/LICENSE-2.0.html version: 0.0.1 description: - Defines the specification for the S3 Signer API. + "[DEPRECATED] Defines the specification for the S3 Signer API." servers: - url: "{scheme}://{host}/{basePath}" - description: Server URL when the port can be inferred from the scheme + description: "[DEPRECATED] Server URL when the port can be inferred from the scheme" variables: scheme: description: The scheme of the URI, either http or https. @@ -41,7 +44,7 @@ servers: description: Optional prefix to be prepended to all routes default: "" - url: "{scheme}://{host}:{port}/{basePath}" - description: Generic base server URL, with all parts configurable + description: "[DEPRECATED] Generic base server URL, with all parts configurable" variables: scheme: description: The scheme of the URI, either http or https. @@ -61,9 +64,10 @@ paths: /v1/aws/s3/sign: post: + deprecated: true tags: - S3 Signer API - summary: Remotely signs S3 requests + summary: "[DEPRECATED] Remotely signs S3 requests" operationId: signS3Request requestBody: description: The request containing the headers to be signed @@ -95,6 +99,7 @@ components: schemas: S3Headers: + deprecated: true type: object additionalProperties: type: array @@ -102,6 +107,7 @@ components: type: string S3SignRequest: + deprecated: true required: - region - uri @@ -133,7 +139,8 @@ components: responses: S3SignResponse: - description: The response containing signed & unsigned headers. The server will also send + description: > + [DEPRECATED] The response containing signed & unsigned headers. The server will also send a Cache-Control header, indicating whether the response can be cached (Cache-Control = ["private"]) or not (Cache-Control = ["no-cache"]). 
content: diff --git a/aws/src/test/java/org/apache/iceberg/aws/TestHttpClientConfigurations.java b/aws/src/test/java/org/apache/iceberg/aws/TestHttpClientConfigurations.java index 0f96ac0f6c82..da73a5c1b5a5 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/TestHttpClientConfigurations.java +++ b/aws/src/test/java/org/apache/iceberg/aws/TestHttpClientConfigurations.java @@ -22,6 +22,8 @@ import java.util.Map; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import org.mockito.Mockito; import software.amazon.awssdk.http.apache.ApacheHttpClient; import software.amazon.awssdk.http.apache.ProxyConfiguration; @@ -137,4 +139,39 @@ public void testApacheDefaultConfigurations() { Mockito.verify(spyApacheHttpClientBuilder, Mockito.never()) .proxyConfiguration(Mockito.any(ProxyConfiguration.class)); } + + @ParameterizedTest + @ValueSource( + strings = { + HttpClientProperties.PROXY_USE_SYSTEM_PROPERTY_VALUES, + HttpClientProperties.PROXY_USE_ENVIRONMENT_VARIABLE_VALUES + }) + public void testApacheProxyFlagTriggersProxyConfig(String propertyKey) { + Map properties = Maps.newHashMap(); + properties.put(propertyKey, "false"); + ApacheHttpClient.Builder spy = Mockito.spy(ApacheHttpClient.builder()); + + ApacheHttpClientConfigurations.create(properties).configureApacheHttpClientBuilder(spy); + + Mockito.verify(spy).proxyConfiguration(Mockito.any(ProxyConfiguration.class)); + } + + @ParameterizedTest + @ValueSource( + strings = { + HttpClientProperties.PROXY_USE_SYSTEM_PROPERTY_VALUES, + HttpClientProperties.PROXY_USE_ENVIRONMENT_VARIABLE_VALUES + }) + public void testUrlConnectionProxyFlagTriggersProxyConfig(String propertyKey) { + Map properties = Maps.newHashMap(); + properties.put(propertyKey, "false"); + UrlConnectionHttpClient.Builder spy = Mockito.spy(UrlConnectionHttpClient.builder()); + + 
UrlConnectionHttpClientConfigurations.create(properties) + .configureUrlConnectionHttpClientBuilder(spy); + + Mockito.verify(spy) + .proxyConfiguration( + Mockito.any(software.amazon.awssdk.http.urlconnection.ProxyConfiguration.class)); + } } diff --git a/aws/src/test/java/org/apache/iceberg/aws/TestRESTSigV4AuthSession.java b/aws/src/test/java/org/apache/iceberg/aws/TestRESTSigV4AuthSession.java index 1b2aaf2e1c01..9e996ca60089 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/TestRESTSigV4AuthSession.java +++ b/aws/src/test/java/org/apache/iceberg/aws/TestRESTSigV4AuthSession.java @@ -35,7 +35,10 @@ import org.apache.iceberg.rest.requests.CreateNamespaceRequest; import org.junit.jupiter.api.Test; import org.mockito.Mockito; +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider; import software.amazon.awssdk.auth.signer.Aws4Signer; +import software.amazon.awssdk.regions.Region; +import software.amazon.awssdk.utils.SdkAutoCloseable; class TestRESTSigV4AuthSession { @@ -306,4 +309,43 @@ void close() { session.close(); Mockito.verify(delegate).close(); } + + @Test + void closeWithCloseableCredentialsProvider() { + AuthSession delegate = Mockito.mock(AuthSession.class); + CloseableAwsCredentialsProvider credentialsProvider = + Mockito.mock(CloseableAwsCredentialsProvider.class); + closeWithCloseableCredentialsProvider(delegate, credentialsProvider); + } + + @Test + void closeSuppressesFailure() { + AuthSession delegate = Mockito.mock(AuthSession.class); + Mockito.doThrow(new RuntimeException("delegate close failed")).when(delegate).close(); + CloseableAwsCredentialsProvider credentialsProvider = + Mockito.mock(CloseableAwsCredentialsProvider.class); + Mockito.doThrow(new RuntimeException("credentials provider close failed")) + .when(credentialsProvider) + .close(); + closeWithCloseableCredentialsProvider(delegate, credentialsProvider); + } + + private void closeWithCloseableCredentialsProvider( + AuthSession delegate, 
CloseableAwsCredentialsProvider credentialsProvider) { + AwsProperties properties = Mockito.mock(AwsProperties.class); + when(properties.restSigningRegion()).thenReturn(Region.US_WEST_2); + when(properties.restSigningName()).thenReturn("execute-api"); + when(properties.restCredentialsProvider()).thenReturn(credentialsProvider); + + RESTSigV4AuthSession session = new RESTSigV4AuthSession(signer, delegate, properties); + session.close(); + + Mockito.verify(delegate).close(); + Mockito.verify(credentialsProvider).close(); + } + + interface CloseableAwsCredentialsProvider extends AwsCredentialsProvider, SdkAutoCloseable { + @Override + void close(); + } } diff --git a/aws/src/test/java/org/apache/iceberg/aws/dynamodb/TestDynamoDbCatalog.java b/aws/src/test/java/org/apache/iceberg/aws/dynamodb/TestDynamoDbCatalog.java index b602cea303d8..e172831a2428 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/dynamodb/TestDynamoDbCatalog.java +++ b/aws/src/test/java/org/apache/iceberg/aws/dynamodb/TestDynamoDbCatalog.java @@ -49,14 +49,25 @@ public class TestDynamoDbCatalog { public void before() { dynamo = Mockito.mock(DynamoDbClient.class); dynamoCatalog = new DynamoDbCatalog(); - dynamoCatalog.initialize(CATALOG_NAME, WAREHOUSE_PATH, new AwsProperties(), dynamo, null); + dynamoCatalog.initialize( + CATALOG_NAME, + WAREHOUSE_PATH, + new AwsProperties(), + dynamo, + null, + false /* uniqTableLocation */); } @Test public void testConstructorWarehousePathWithEndSlash() { DynamoDbCatalog catalogWithSlash = new DynamoDbCatalog(); catalogWithSlash.initialize( - CATALOG_NAME, WAREHOUSE_PATH + "/", new AwsProperties(), dynamo, null); + CATALOG_NAME, + WAREHOUSE_PATH + "/", + new AwsProperties(), + dynamo, + null, + false /* uniqTableLocation */); Mockito.doReturn(GetItemResponse.builder().item(Maps.newHashMap()).build()) .when(dynamo) .getItem(any(GetItemRequest.class)); @@ -103,4 +114,49 @@ public void testDefaultWarehouseLocationNoNamespace() { 
.isInstanceOf(NoSuchNamespaceException.class) .hasMessageContaining("Cannot find default warehouse location:"); } + + @Test + public void testDefaultWarehouseLocationUniqueWithoutDbUri() throws Exception { + try (DynamoDbCatalog catalog = new DynamoDbCatalog()) { + catalog.initialize( + CATALOG_NAME, + WAREHOUSE_PATH, + new AwsProperties(), + dynamo, + null, + true /* uniqTableLocation */); + Mockito.doReturn(GetItemResponse.builder().item(Maps.newHashMap()).build()) + .when(dynamo) + .getItem(any(GetItemRequest.class)); + + String defaultWarehouseLocation = catalog.defaultWarehouseLocation(TABLE_IDENTIFIER); + assertThat(defaultWarehouseLocation).matches(WAREHOUSE_PATH + "/db.db/table-[a-z0-9]{32}"); + } + } + + @Test + public void testDefaultWarehouseLocationUniqueWithDbUri() throws Exception { + try (DynamoDbCatalog catalog = new DynamoDbCatalog()) { + catalog.initialize( + CATALOG_NAME, + WAREHOUSE_PATH, + new AwsProperties(), + dynamo, + null, + true /* uniqTableLocation */); + String dbUri = "s3://bucket2/db"; + Mockito.doReturn( + GetItemResponse.builder() + .item( + ImmutableMap.of( + toPropertyCol(DynamoDbCatalog.defaultLocationProperty()), + AttributeValue.builder().s(dbUri).build())) + .build()) + .when(dynamo) + .getItem(any(GetItemRequest.class)); + + String defaultWarehouseLocation = catalog.defaultWarehouseLocation(TABLE_IDENTIFIER); + assertThat(defaultWarehouseLocation).matches("s3://bucket2/db/table-[a-z0-9]{32}"); + } + } } diff --git a/aws/src/test/java/org/apache/iceberg/aws/glue/TestGlueCatalog.java b/aws/src/test/java/org/apache/iceberg/aws/glue/TestGlueCatalog.java index 2042948eb3c9..82f7e84d563b 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/glue/TestGlueCatalog.java +++ b/aws/src/test/java/org/apache/iceberg/aws/glue/TestGlueCatalog.java @@ -194,6 +194,28 @@ public void testDefaultWarehouseLocationCustomCatalogId() { Mockito.argThat((GetDatabaseRequest req) -> req.catalogId().equals(catalogId))); } + @Test + public void 
testDefaultWarehouseLocationUnique() { + GlueCatalog catalog = new GlueCatalog(); + catalog.initialize( + CATALOG_NAME, + WAREHOUSE_PATH, + new AwsProperties(), + new S3FileIOProperties(), + glue, + LockManagers.defaultLockManager(), + true /* uniqTableLocation */); + + Mockito.doReturn( + GetDatabaseResponse.builder() + .database(Database.builder().name("db").locationUri("s3://bucket2/db").build()) + .build()) + .when(glue) + .getDatabase(Mockito.any(GetDatabaseRequest.class)); + String location = catalog.defaultWarehouseLocation(TableIdentifier.of("db", "table")); + assertThat(location).matches("s3://bucket2/db/table-[a-z0-9]{32}"); + } + @Test public void testListTables() { Mockito.doReturn( diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOCredentialRefresh.java b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOCredentialRefresh.java index 0a8b0e084873..170857ca84b4 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOCredentialRefresh.java +++ b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOCredentialRefresh.java @@ -29,6 +29,7 @@ import java.util.Map; import java.util.concurrent.TimeUnit; import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.TestHelpers; import org.apache.iceberg.aws.AwsProperties; import org.apache.iceberg.io.StorageCredential; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -265,4 +266,83 @@ public void credentialRefreshWithinFiveMinuteWindow() { }); } } + + @Test + public void refreshedCredentialsAreKryoSerializable() throws Exception { + // Verify that an S3FileIO whose credentials have been refreshed at runtime can still be + // round-tripped through Kryo. The internal storageCredentials list must be backed by a + // collection that Kryo can serialize and deserialize. 
+ String nearExpiryMs = Long.toString(Instant.now().plus(3, ChronoUnit.MINUTES).toEpochMilli()); + + StorageCredential initialCredential = + StorageCredential.create( + "s3://bucket/path", + ImmutableMap.of( + S3FileIOProperties.ACCESS_KEY_ID, + "initialAccessKey", + S3FileIOProperties.SECRET_ACCESS_KEY, + "initialSecretKey", + S3FileIOProperties.SESSION_TOKEN, + "initialToken", + S3FileIOProperties.SESSION_TOKEN_EXPIRES_AT_MS, + nearExpiryMs)); + + String refreshedExpiryMs = + Long.toString(Instant.now().plus(1, ChronoUnit.HOURS).toEpochMilli()); + LoadCredentialsResponse refreshResponse = + ImmutableLoadCredentialsResponse.builder() + .addCredentials( + ImmutableCredential.builder() + .prefix("s3://bucket/path") + .config( + ImmutableMap.of( + S3FileIOProperties.ACCESS_KEY_ID, + "refreshedAccessKey", + S3FileIOProperties.SECRET_ACCESS_KEY, + "refreshedSecretKey", + S3FileIOProperties.SESSION_TOKEN, + "refreshedToken", + S3FileIOProperties.SESSION_TOKEN_EXPIRES_AT_MS, + refreshedExpiryMs)) + .build()) + .build(); + + HttpRequest mockRequest = request("/v1/credentials").withMethod(HttpMethod.GET.name()); + mockServer + .when(mockRequest) + .respond( + response(LoadCredentialsResponseParser.toJson(refreshResponse)).withStatusCode(200)); + + Map properties = + ImmutableMap.of( + AwsProperties.CLIENT_FACTORY, + StaticClientFactory.class.getName(), + VendedCredentialsProvider.URI, + CREDENTIALS_URI, + CatalogProperties.URI, + CATALOG_URI, + "init-creation-stacktrace", + "false"); + + StaticClientFactory.client = null; + try (S3FileIO fileIO = new S3FileIO()) { + fileIO.initialize(properties); + fileIO.setCredentials(List.of(initialCredential)); + + fileIO.client(); + + // Wait for the refresh to update the in-memory credentials + Awaitility.await() + .atMost(10, TimeUnit.SECONDS) + .untilAsserted( + () -> + assertThat(fileIO.credentials().get(0).config()) + .containsEntry(S3FileIOProperties.ACCESS_KEY_ID, "refreshedAccessKey")); + + // Round-trip through Kryo and 
verify the credentials still match + try (S3FileIO deserialized = TestHelpers.KryoHelpers.roundTripSerialize(fileIO)) { + assertThat(deserialized.credentials()).isEqualTo(fileIO.credentials()); + } + } + } } diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOProperties.java b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOProperties.java index 1666de1f1d08..953f73d45d4a 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOProperties.java +++ b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOProperties.java @@ -566,4 +566,25 @@ public void testApplyRetryConfiguration() { RetryPolicy retryPolicy = builder.overrideConfiguration().retryPolicy().get(); assertThat(retryPolicy.numRetries()).as("retries was not set").isEqualTo(999); } + + @Test + public void testChunkedEncodingEnabledDefaultValue() { + Map properties = Maps.newHashMap(); + S3FileIOProperties s3FileIOProperties = new S3FileIOProperties(properties); + + assertThat(s3FileIOProperties.isChunkedEncodingEnabled()) + .as("chunked encoding should be enabled by default") + .isTrue(); + } + + @Test + public void testChunkedEncodingDisabled() { + Map properties = Maps.newHashMap(); + properties.put(S3FileIOProperties.CHUNKED_ENCODING_ENABLED, "false"); + S3FileIOProperties s3FileIOProperties = new S3FileIOProperties(properties); + + assertThat(s3FileIOProperties.isChunkedEncodingEnabled()) + .as("chunked encoding should be disabled when explicitly set to false") + .isFalse(); + } } diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3V4RestSignerClient.java b/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3V4RestSignerClient.java index 0bcc77e29fae..aadbf036b567 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3V4RestSignerClient.java +++ b/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3V4RestSignerClient.java @@ -18,13 +18,14 @@ */ package org.apache.iceberg.aws.s3.signer; -import static 
org.apache.iceberg.aws.s3.signer.S3V4RestSignerClient.S3_SIGNER_URI; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.InstanceOfAssertFactories.type; import static org.mockito.Mockito.when; import java.util.Map; import java.util.stream.Stream; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.rest.RESTCatalogProperties; import org.apache.iceberg.rest.RESTClient; import org.apache.iceberg.rest.auth.AuthProperties; import org.apache.iceberg.rest.auth.AuthSession; @@ -119,12 +120,21 @@ void authSessionOAuth2(Map properties, String expectedScope, Str public static Stream validOAuth2Properties() { return Stream.of( // No OAuth2 data - Arguments.of(Map.of(S3_SIGNER_URI, "https://signer.com"), "sign", null), + Arguments.of( + Map.of( + RESTCatalogProperties.SIGNER_URI, + "https://signer.com", + RESTCatalogProperties.SIGNER_ENDPOINT, + "v1/sign/s3"), + "sign", + null), // Token only Arguments.of( Map.of( - S3_SIGNER_URI, + RESTCatalogProperties.SIGNER_URI, "https://signer.com", + RESTCatalogProperties.SIGNER_ENDPOINT, + "v1/sign/s3", AuthProperties.AUTH_TYPE, AuthProperties.AUTH_TYPE_OAUTH2, OAuth2Properties.TOKEN, @@ -134,8 +144,10 @@ public static Stream validOAuth2Properties() { // Credential only: expect a token to be fetched Arguments.of( Map.of( - S3_SIGNER_URI, + RESTCatalogProperties.SIGNER_URI, "https://signer.com", + RESTCatalogProperties.SIGNER_ENDPOINT, + "v1/sign/s3", AuthProperties.AUTH_TYPE, AuthProperties.AUTH_TYPE_OAUTH2, OAuth2Properties.CREDENTIAL, @@ -145,8 +157,10 @@ public static Stream validOAuth2Properties() { // Token and credential: should use token as is, not fetch a new one Arguments.of( Map.of( - S3_SIGNER_URI, + RESTCatalogProperties.SIGNER_URI, "https://signer.com", + RESTCatalogProperties.SIGNER_ENDPOINT, + "v1/sign/s3", AuthProperties.AUTH_TYPE, AuthProperties.AUTH_TYPE_OAUTH2, OAuth2Properties.TOKEN, @@ -158,8 +172,10 @@ public static Stream validOAuth2Properties() { // 
Custom scope Arguments.of( Map.of( - S3_SIGNER_URI, + RESTCatalogProperties.SIGNER_URI, "https://signer.com", + RESTCatalogProperties.SIGNER_ENDPOINT, + "v1/sign/s3", AuthProperties.AUTH_TYPE, AuthProperties.AUTH_TYPE_OAUTH2, OAuth2Properties.CREDENTIAL, @@ -169,4 +185,63 @@ public static Stream validOAuth2Properties() { "custom", "token")); } + + @ParameterizedTest + @MethodSource("legacySignerProperties") + void legacySignerProperties( + Map properties, String expectedBaseSignerUri, String expectedEndpoint) + throws Exception { + try (S3V4RestSignerClient client = + ImmutableS3V4RestSignerClient.builder().properties(properties).build()) { + assertThat(client.baseSignerUri()).isEqualTo(expectedBaseSignerUri); + assertThat(client.endpoint()).isEqualTo(expectedEndpoint); + } + } + + @SuppressWarnings("deprecation") + public static Stream legacySignerProperties() { + return Stream.of( + // Only legacy properties + Arguments.of( + Map.of( + CatalogProperties.URI, + "https://catalog.com", + S3V4RestSignerClient.S3_SIGNER_URI, + "https://legacy-signer.com", + S3V4RestSignerClient.S3_SIGNER_ENDPOINT, + "v1/legacy/sign"), + "https://legacy-signer.com", + "https://legacy-signer.com/v1/legacy/sign"), + // Only new properties + Arguments.of( + Map.of( + CatalogProperties.URI, + "https://catalog.com", + RESTCatalogProperties.SIGNER_URI, + "https://new-signer.com", + RESTCatalogProperties.SIGNER_ENDPOINT, + "v1/new/sign"), + "https://new-signer.com", + "https://new-signer.com/v1/new/sign"), + // Mixed properties: legacy properties take precedence + Arguments.of( + Map.of( + CatalogProperties.URI, + "https://catalog.com", + RESTCatalogProperties.SIGNER_URI, + "https://new-signer.com", + RESTCatalogProperties.SIGNER_ENDPOINT, + "v1/new/sign", + S3V4RestSignerClient.S3_SIGNER_URI, + "https://legacy-signer.com", + S3V4RestSignerClient.S3_SIGNER_ENDPOINT, + "v1/legacy/sign"), + "https://legacy-signer.com", + "https://legacy-signer.com/v1/legacy/sign"), + // No signer properties: 
the catalog URI and the deprecated default endpoint are used + Arguments.of( + Map.of(CatalogProperties.URI, "https://catalog.com"), + "https://catalog.com", + "https://catalog.com/" + S3V4RestSignerClient.S3_SIGNER_DEFAULT_ENDPOINT)); + } } diff --git a/azure-bundle/LICENSE b/azure-bundle/LICENSE index e8c049f4c33b..b0964f5e65ba 100644 --- a/azure-bundle/LICENSE +++ b/azure-bundle/LICENSE @@ -207,8 +207,7 @@ This product bundles Azure SDK for Java. Project URL: https://github.com/Azure/azure-sdk-for-java License: MIT -| The MIT License (MIT) -| + | Copyright (c) 2015 Microsoft | | Permission is hereby granted, free of charge, to any person obtaining a copy @@ -238,6 +237,91 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). + +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). + +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). 
+ +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- @@ -245,27 +329,53 @@ This product bundles Microsoft Authentication Library for Java. Project URL: https://github.com/AzureAD/microsoft-authentication-library-for-java License: MIT -| MIT License -| -| Copyright (c) Microsoft Corporation. All rights reserved. 
-| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in all -| copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -| SOFTWARE + +| Copyright (c) Microsoft Corporation. All rights reserved. +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE + +-------------------------------------------------------------------------------- + +This product bundles MSAL4J Persistence Extension. + +Project URL: https://github.com/AzureAD/microsoft-authentication-library-for-java +License: MIT + +| Copyright (c) Microsoft Corporation. All rights reserved. +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE -------------------------------------------------------------------------------- @@ -276,6 +386,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles Apache Tomcat Native (netty-tcnative-classes and netty-tcnative-boringssl-static, bundled by Reactor Netty). + +Project URL: https://tomcat.apache.org/native-doc/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles Reactor Core. Project URL: https://github.com/reactor/reactor-core @@ -290,9 +407,16 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Reactor AddOns. +This product bundles Reactor Pool (bundled by Reactor Netty). + +Project URL: https://github.com/reactor/reactor-pool +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles Aalto XML (bundled by Azure SDK for Java). -Project URL: https://github.com/reactor/reactor-addons +Project URL: https://github.com/FasterXML/aalto-xml License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -307,9 +431,8 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles Reactive Streams. 
Project URL: http://www.reactive-streams.org/ -License: MIT -| MIT No Attribution -| +License: MIT-0 + | Copyright 2014 Reactive Streams | | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so. diff --git a/azure-bundle/NOTICE b/azure-bundle/NOTICE index 07f87c0bc05c..12325baf97d2 100644 --- a/azure-bundle/NOTICE +++ b/azure-bundle/NOTICE @@ -8,7 +8,6 @@ The Apache Software Foundation (http://www.apache.org/). -------------------------------------------------------------------------------- This product bundles Jackson JSON Processor with the following in its NOTICE file: -| | # Jackson JSON processor | | Jackson is a high-performance, Free/Open Source JSON processing library. @@ -16,6 +15,10 @@ This product bundles Jackson JSON Processor with the following in its NOTICE fil | been in development since 2007. | It is currently developed by a community of developers. | +| ## Copyright +| +| Copyright 2007-, Tatu Saloranta (tatu.saloranta@iki.fi) +| | ## Licensing | | Jackson 2.x core and extension components are licensed under Apache License 2.0 @@ -26,7 +29,17 @@ This product bundles Jackson JSON Processor with the following in its NOTICE fil | A list of contributors may be found from CREDITS(-2.x) file, which is included | in some artifacts (usually source distributions); but is always available | from the source code management (SCM) system project uses. -| +| +| ## FastDoubleParser +| +| jackson-core bundles a shaded copy of FastDoubleParser <https://github.com/wrandelshofer/FastDoubleParser>. +| That code is available under an MIT license <https://github.com/wrandelshofer/FastDoubleParser/blob/main/LICENSE> +| under the following copyright. +| +| Copyright © 2023 Werner Randelshofer, Switzerland. MIT License.
+| +| See FastDoubleParser-NOTICE for details of other source code included in FastDoubleParser +| and the licenses and copyrights that apply to that code. -------------------------------------------------------------------------------- diff --git a/azure-bundle/build.gradle b/azure-bundle/build.gradle index 0bdc30fdaa7e..fde8adbfc539 100644 --- a/azure-bundle/build.gradle +++ b/azure-bundle/build.gradle @@ -23,6 +23,12 @@ project(":iceberg-azure-bundle") { tasks.jar.dependsOn tasks.shadowJar + configurations { + implementation { + exclude group: 'org.slf4j' + } + } + dependencies { implementation platform(libs.azuresdk.bom) implementation "com.azure:azure-storage-file-datalake" @@ -40,10 +46,6 @@ project(":iceberg-azure-bundle") { include 'NOTICE' } - dependencies { - exclude(dependency('org.slf4j:slf4j-api')) - } - // relocate Azure-specific versions relocate 'io.netty', 'org.apache.iceberg.azure.shaded.io.netty' relocate 'com.fasterxml.jackson', 'org.apache.iceberg.azure.shaded.com.fasterxml.jackson' @@ -52,4 +54,6 @@ project(":iceberg-azure-bundle") { jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/azure-bundle/runtime-deps.txt b/azure-bundle/runtime-deps.txt new file mode 100644 index 000000000000..2e5198f49842 --- /dev/null +++ b/azure-bundle/runtime-deps.txt @@ -0,0 +1,43 @@ +com.azure:azure-core-http-netty:1.16.3 +com.azure:azure-core:1.57.1 +com.azure:azure-identity:1.18.2 +com.azure:azure-json:1.5.1 +com.azure:azure-security-keyvault-keys:4.10.6 +com.azure:azure-storage-blob:12.33.3 +com.azure:azure-storage-common:12.32.2 +com.azure:azure-storage-file-datalake:12.26.3 +com.azure:azure-storage-internal-avro:12.18.2 +com.azure:azure-xml:1.2.1 +com.fasterxml.jackson.core:jackson-annotations:2.18.4 +com.fasterxml.jackson.core:jackson-core:2.18.4.1 +com.fasterxml.jackson.core:jackson-databind:2.18.4 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.18.4 +com.microsoft.azure:msal4j-persistence-extension:1.3.0 
+com.microsoft.azure:msal4j:1.23.1 +io.netty:netty-buffer:4.1.130.Final +io.netty:netty-codec-dns:4.1.128.Final +io.netty:netty-codec-http2:4.1.130.Final +io.netty:netty-codec-http:4.1.130.Final +io.netty:netty-codec-socks:4.1.130.Final +io.netty:netty-codec:4.1.130.Final +io.netty:netty-common:4.1.130.Final +io.netty:netty-handler-proxy:4.1.130.Final +io.netty:netty-handler:4.1.130.Final +io.netty:netty-resolver-dns-classes-macos:4.1.128.Final +io.netty:netty-resolver-dns-native-macos:4.1.128.Final +io.netty:netty-resolver-dns:4.1.128.Final +io.netty:netty-resolver:4.1.130.Final +io.netty:netty-tcnative-boringssl-static:2.0.74.Final +io.netty:netty-tcnative-classes:2.0.74.Final +io.netty:netty-transport-classes-epoll:4.1.130.Final +io.netty:netty-transport-classes-kqueue:4.1.130.Final +io.netty:netty-transport-native-epoll:4.1.130.Final +io.netty:netty-transport-native-kqueue:4.1.130.Final +io.netty:netty-transport-native-unix-common:4.1.130.Final +io.netty:netty-transport:4.1.130.Final +io.projectreactor.netty:reactor-netty-core:1.2.13 +io.projectreactor.netty:reactor-netty-http:1.2.13 +io.projectreactor:reactor-core:3.7.14 +net.java.dev.jna:jna-platform:5.17.0 +net.java.dev.jna:jna:5.17.0 +org.reactivestreams:reactive-streams:1.0.4 diff --git a/azure/src/integration/java/org/apache/iceberg/azure/adlsv2/TestADLSFileIO.java b/azure/src/integration/java/org/apache/iceberg/azure/adlsv2/TestADLSFileIO.java index 5e343782ab1c..621813a25574 100644 --- a/azure/src/integration/java/org/apache/iceberg/azure/adlsv2/TestADLSFileIO.java +++ b/azure/src/integration/java/org/apache/iceberg/azure/adlsv2/TestADLSFileIO.java @@ -20,6 +20,7 @@ import static org.apache.iceberg.azure.AzureProperties.ADLS_SAS_TOKEN_PREFIX; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.mockito.ArgumentMatchers.anyBoolean; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.any; 
@@ -31,6 +32,7 @@ import com.azure.core.http.rest.PagedIterable; import com.azure.core.http.rest.Response; +import com.azure.storage.blob.models.BlobStorageException; import com.azure.storage.file.datalake.DataLakeFileClient; import com.azure.storage.file.datalake.DataLakeFileSystemClient; import com.azure.storage.file.datalake.DataLakeFileSystemClientBuilder; @@ -42,6 +44,7 @@ import java.util.Iterator; import org.apache.iceberg.TestHelpers; import org.apache.iceberg.azure.AzureProperties; +import org.apache.iceberg.exceptions.NotFoundException; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.FileInfo; import org.apache.iceberg.io.InputFile; @@ -78,6 +81,23 @@ public void testFileOperations() throws IOException { assertThat(fileClient.exists()).isFalse(); } + @Test + public void readMissingLocation() { + String path = "path/to/file"; + String location = AZURITE_CONTAINER.location(path); + ADLSFileIO io = createFileIO(); + DataLakeFileClient fileClient = AZURITE_CONTAINER.fileClient(path); + assertThat(fileClient.exists()).isFalse(); + + InputFile inputFile = io.newInputFile(location); + + assertThatThrownBy(inputFile::newStream) + .isInstanceOf(NotFoundException.class) + .hasCauseInstanceOf(BlobStorageException.class) + .hasMessage( + "Location does not exist: abfs://container@account.dfs.core.windows.net/path/to/file"); + } + @Test public void testBulkDeleteFiles() { String path1 = "path/to/file1"; diff --git a/azure/src/integration/java/org/apache/iceberg/azure/adlsv2/TestADLSInputStream.java b/azure/src/integration/java/org/apache/iceberg/azure/adlsv2/TestADLSInputStream.java index 8464e57516ce..1edf48eaec35 100644 --- a/azure/src/integration/java/org/apache/iceberg/azure/adlsv2/TestADLSInputStream.java +++ b/azure/src/integration/java/org/apache/iceberg/azure/adlsv2/TestADLSInputStream.java @@ -39,6 +39,10 @@ public class TestADLSInputStream extends AzuriteTestBase { private final Random random = new Random(1); private final AzureProperties 
azureProperties = new AzureProperties(); + private String location() { + return AZURITE_CONTAINER.location(FILE_PATH); + } + private DataLakeFileClient fileClient() { return AZURITE_CONTAINER.fileClient(FILE_PATH); } @@ -55,7 +59,8 @@ public void testRead() throws Exception { setupData(data); try (SeekableInputStream in = - new ADLSInputStream(fileClient(), null, azureProperties, MetricsContext.nullMetrics())) { + new ADLSInputStream( + location(), fileClient(), null, azureProperties, MetricsContext.nullMetrics())) { int readSize = 1024; readAndCheck(in, in.getPos(), readSize, data, false); @@ -90,7 +95,8 @@ public void testReadSingle() throws Exception { setupData(data); try (SeekableInputStream in = - new ADLSInputStream(fileClient(), null, azureProperties, MetricsContext.nullMetrics())) { + new ADLSInputStream( + location(), fileClient(), null, azureProperties, MetricsContext.nullMetrics())) { assertThat(in.read()).isEqualTo(i0); assertThat(in.read()).isEqualTo(i1); } @@ -131,7 +137,8 @@ public void testRangeRead() throws Exception { setupData(expected); try (RangeReadable in = - new ADLSInputStream(fileClient(), null, azureProperties, MetricsContext.nullMetrics())) { + new ADLSInputStream( + location(), fileClient(), null, azureProperties, MetricsContext.nullMetrics())) { // first 1k position = 0; offset = 0; @@ -164,7 +171,8 @@ private void readAndCheckRanges( public void testClose() throws Exception { setupData(randomData(2)); SeekableInputStream closed = - new ADLSInputStream(fileClient(), null, azureProperties, MetricsContext.nullMetrics()); + new ADLSInputStream( + location(), fileClient(), null, azureProperties, MetricsContext.nullMetrics()); closed.close(); assertThatThrownBy(() -> closed.seek(0)) .isInstanceOf(IllegalStateException.class) @@ -178,7 +186,8 @@ public void testSeek() throws Exception { setupData(data); try (SeekableInputStream in = - new ADLSInputStream(fileClient(), null, azureProperties, MetricsContext.nullMetrics())) { + new 
ADLSInputStream( + location(), fileClient(), null, azureProperties, MetricsContext.nullMetrics())) { in.seek(data.length / 2); byte[] actual = new byte[data.length / 2]; @@ -193,7 +202,8 @@ public void testSeek() throws Exception { public void testSeekNegative() throws Exception { setupData(randomData(2)); SeekableInputStream in = - new ADLSInputStream(fileClient(), null, azureProperties, MetricsContext.nullMetrics()); + new ADLSInputStream( + location(), fileClient(), null, azureProperties, MetricsContext.nullMetrics()); assertThatThrownBy(() -> in.seek(-3)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot seek: position -3 is negative"); diff --git a/azure/src/main/java/org/apache/iceberg/azure/AzureProperties.java b/azure/src/main/java/org/apache/iceberg/azure/AzureProperties.java index 73e99e029221..383bec30111b 100644 --- a/azure/src/main/java/org/apache/iceberg/azure/AzureProperties.java +++ b/azure/src/main/java/org/apache/iceberg/azure/AzureProperties.java @@ -21,7 +21,6 @@ import com.azure.core.credential.AccessToken; import com.azure.core.credential.TokenCredential; import com.azure.core.credential.TokenRequestContext; -import com.azure.security.keyvault.keys.cryptography.models.KeyWrapAlgorithm; import com.azure.storage.common.StorageSharedKeyCredential; import com.azure.storage.file.datalake.DataLakeFileSystemClientBuilder; import java.io.Serializable; @@ -53,6 +52,9 @@ public class AzureProperties implements Serializable { public static final String AZURE_KEYVAULT_KEY_WRAP_ALGORITHM = "azure.keyvault.key-wrap-algorithm"; + // Must match KeyWrapAlgorithm.RSA_OAEP_256.getValue() from azure-security-keyvault-keys + private static final String DEFAULT_KEY_WRAP_ALGORITHM = "RSA-OAEP-256"; + /** * Configure the ADLS token credential provider used to get {@link TokenCredential}. 
A fully * qualified concrete class with package that implements the {@link AdlsTokenCredentialProvider} @@ -136,8 +138,7 @@ public AzureProperties(Map properties) { this.keyWrapAlgorithm = properties.getOrDefault( - AzureProperties.AZURE_KEYVAULT_KEY_WRAP_ALGORITHM, - KeyWrapAlgorithm.RSA_OAEP_256.getValue()); + AzureProperties.AZURE_KEYVAULT_KEY_WRAP_ALGORITHM, DEFAULT_KEY_WRAP_ALGORITHM); } public Optional adlsReadBlockSize() { @@ -204,8 +205,8 @@ public Mono getToken(TokenRequestContext request) { } } - public KeyWrapAlgorithm keyWrapAlgorithm() { - return KeyWrapAlgorithm.fromString(this.keyWrapAlgorithm); + public String keyWrapAlgorithm() { + return this.keyWrapAlgorithm; } public Optional keyVaultUrl() { diff --git a/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSInputFile.java b/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSInputFile.java index 95e57bf04d32..5b07534f1368 100644 --- a/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSInputFile.java +++ b/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSInputFile.java @@ -55,6 +55,6 @@ public long getLength() { @Override public SeekableInputStream newStream() { - return new ADLSInputStream(fileClient(), fileSize, azureProperties(), metrics()); + return new ADLSInputStream(location(), fileClient(), fileSize, azureProperties(), metrics()); } } diff --git a/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSInputStream.java b/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSInputStream.java index 55ecade4486a..b1a2d3abfa32 100644 --- a/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSInputStream.java +++ b/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSInputStream.java @@ -18,14 +18,18 @@ */ package org.apache.iceberg.azure.adlsv2; +import com.azure.storage.blob.models.BlobErrorCode; +import com.azure.storage.blob.models.BlobStorageException; import com.azure.storage.file.datalake.DataLakeFileClient; import 
com.azure.storage.file.datalake.models.DataLakeFileOpenInputStreamResult; +import com.azure.storage.file.datalake.models.DataLakeStorageException; import com.azure.storage.file.datalake.models.FileRange; import com.azure.storage.file.datalake.options.DataLakeFileInputStreamOptions; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; import org.apache.iceberg.azure.AzureProperties; +import org.apache.iceberg.exceptions.NotFoundException; import org.apache.iceberg.io.FileIOMetricsContext; import org.apache.iceberg.io.IOUtil; import org.apache.iceberg.io.RangeReadable; @@ -46,6 +50,7 @@ class ADLSInputStream extends SeekableInputStream implements RangeReadable { private static final int SKIP_SIZE = 1024 * 1024; private final StackTraceElement[] createStack; + private final String location; private final DataLakeFileClient fileClient; private Long fileSize; private final AzureProperties azureProperties; @@ -59,10 +64,12 @@ class ADLSInputStream extends SeekableInputStream implements RangeReadable { private final Counter readOperations; ADLSInputStream( + String location, DataLakeFileClient fileClient, Long fileSize, AzureProperties azureProperties, MetricsContext metrics) { + this.location = location; this.fileClient = fileClient; this.fileSize = fileSize; this.azureProperties = azureProperties; @@ -184,6 +191,7 @@ private DataLakeFileOpenInputStreamResult openRange(FileRange range) { try { return fileClient.openInputStream(getInputOptions(range)); } catch (RuntimeException e) { + throwNotFoundIfNotPresent(e, location); LOG.error( "Failed to open input stream for file {}, range {}", fileClient.getFilePath(), range, e); throw e; @@ -209,4 +217,20 @@ protected void finalize() throws Throwable { LOG.warn("Unclosed input stream created by:\n\t{}", trace); } } + + private static void throwNotFoundIfNotPresent(Throwable throwable, String location) { + if (isFileNotFoundException(throwable)) { + throw new NotFoundException(throwable, "Location 
does not exist: %s", location); + } + } + + private static boolean isFileNotFoundException(Throwable exception) { + if (exception instanceof BlobStorageException blobStorageException) { + return BlobErrorCode.BLOB_NOT_FOUND.equals(blobStorageException.getErrorCode()); + } + if (exception instanceof DataLakeStorageException dataLakeStorageException) { + return "PathNotFound".equals(dataLakeStorageException.getErrorCode()); + } + return false; + } } diff --git a/azure/src/main/java/org/apache/iceberg/azure/keymanagement/AzureKeyManagementClient.java b/azure/src/main/java/org/apache/iceberg/azure/keymanagement/AzureKeyManagementClient.java index 66bf0678bce9..498c432212c5 100644 --- a/azure/src/main/java/org/apache/iceberg/azure/keymanagement/AzureKeyManagementClient.java +++ b/azure/src/main/java/org/apache/iceberg/azure/keymanagement/AzureKeyManagementClient.java @@ -80,7 +80,8 @@ private ClientState state() { keyClientBuilder .credential(AdlsTokenCredentialProviders.from(allProperties).credential()) .buildClient(); - KeyWrapAlgorithm keyWrapAlgorithm = azureProperties.keyWrapAlgorithm(); + KeyWrapAlgorithm keyWrapAlgorithm = + KeyWrapAlgorithm.fromString(azureProperties.keyWrapAlgorithm()); state = new ClientState(keyClient, keyWrapAlgorithm); } } diff --git a/azure/src/test/java/org/apache/iceberg/azure/adlsv2/TestADLSInputStream.java b/azure/src/test/java/org/apache/iceberg/azure/adlsv2/TestADLSInputStream.java index 058bf0372b05..e98061846a88 100644 --- a/azure/src/test/java/org/apache/iceberg/azure/adlsv2/TestADLSInputStream.java +++ b/azure/src/test/java/org/apache/iceberg/azure/adlsv2/TestADLSInputStream.java @@ -46,7 +46,13 @@ void before() { InternalDataLakeFileOpenInputStreamResult openInputStreamResult = new InternalDataLakeFileOpenInputStreamResult(inputStream, mock()); when(fileClient.openInputStream(any())).thenReturn(openInputStreamResult); - adlsInputStream = new ADLSInputStream(fileClient, 0L, mock(), mock()); + adlsInputStream = + new 
ADLSInputStream( + "abfs://container@account.dfs.core.windows.net/path/to/file", + fileClient, + 0L, + mock(), + mock()); } @Test diff --git a/baseline.gradle b/baseline.gradle index 4efbd89eda02..6b180effbbbf 100644 --- a/baseline.gradle +++ b/baseline.gradle @@ -157,6 +157,8 @@ subprojects { '-Xep:Slf4jThrowable:ERROR', // Added because it errors out compile, but we need to figure out if we want it '-Xep:StrictUnusedVariable:OFF', + // This rule doesn't enforce the use of method references. That's handled by checkstyle. + '-Xep:StringCaseLocaleUsage:ERROR', // Enforce safe string splitting '-Xep:StringSplitter:ERROR', '-Xep:TypeParameterShadowing:OFF', diff --git a/bigquery/src/test/java/org/apache/iceberg/gcp/bigquery/TestBigQueryCatalog.java b/bigquery/src/test/java/org/apache/iceberg/gcp/bigquery/TestBigQueryCatalog.java index cdeaa1ef1e63..23441d0db184 100644 --- a/bigquery/src/test/java/org/apache/iceberg/gcp/bigquery/TestBigQueryCatalog.java +++ b/bigquery/src/test/java/org/apache/iceberg/gcp/bigquery/TestBigQueryCatalog.java @@ -24,6 +24,7 @@ import static org.assertj.core.api.Assertions.assertThat; import java.io.File; +import java.io.IOException; import java.util.List; import java.util.Map; import org.apache.iceberg.CatalogProperties; @@ -169,6 +170,18 @@ public void testRenameTableMissingSourceTable() { super.testRenameTableMissingSourceTable(); } + @Disabled("BigQuery Metastore does not support rename tables") + @Test + public void createTableInUniqueLocation() { + super.createTableInUniqueLocation(); + } + + @Disabled("BigQuery Metastore does not support rename tables") + @Test + public void dropAfterRenameDoesntCorruptTable() throws IOException { + super.dropAfterRenameDoesntCorruptTable(); + } + @Test public void testIsValidIdentifierWithValidSingleLevelNamespace() { assertThat(catalog.isValidIdentifier(TableIdentifier.of("dataset1", "table1"))).isTrue(); diff --git a/build.gradle b/build.gradle index 1369ea8b7e23..ccfc5abee0cf 100644 --- 
a/build.gradle +++ b/build.gradle @@ -38,7 +38,7 @@ buildscript { classpath 'org.revapi:gradle-revapi:1.8.0' classpath 'com.gorylenko.gradle-git-properties:gradle-git-properties:2.5.7' classpath 'com.palantir.gradle.gitversion:gradle-git-version:4.3.0' - classpath 'org.openapitools:openapi-generator-gradle-plugin:7.20.0' + classpath 'org.openapitools:openapi-generator-gradle-plugin:7.22.0' } } @@ -122,6 +122,15 @@ allprojects { } } +tasks.register('checkAllRuntimeDeps') { + description = 'Validates runtime dependency baselines for all subprojects that have them' + group = 'verification' + + dependsOn subprojects.collect { subproject -> + subproject.tasks.matching { it.name == 'checkRuntimeDeps' } + } +} + subprojects { if (it.name == 'iceberg-bom') { // the BOM does not build anything, the code below expects "source code" @@ -327,6 +336,7 @@ project(':iceberg-api') { testImplementation libs.avro.avro testImplementation libs.esotericsoftware.kryo testImplementation libs.awaitility + testImplementation libs.junit.pioneer } tasks.processTestResources.dependsOn rootProject.tasks.buildInfo @@ -389,7 +399,8 @@ project(':iceberg-core') { testImplementation libs.jetty.servlet testImplementation libs.jakarta.servlet - testImplementation libs.jetty.server + testImplementation libs.jetty.compression.server + testImplementation libs.jetty.compression.gzip testImplementation libs.mockserver.netty testImplementation libs.mockserver.client.java testImplementation libs.sqlite.jdbc @@ -401,6 +412,13 @@ project(':iceberg-core') { exclude group: 'junit' } testImplementation libs.awaitility + + // Lock BouncyCastle versions to avoid version mismatches + // when these dependencies are added transitively. + // Required for TLS tests with MockServer. 
+ testImplementation libs.bouncycastle.bcpkix + testImplementation libs.bouncycastle.bcutil + testImplementation libs.bouncycastle.bcprov } } @@ -446,6 +464,8 @@ project(':iceberg-data') { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts') + testImplementation(testFixtures(project(':iceberg-parquet'))) } test { @@ -466,8 +486,8 @@ project(':iceberg-aliyun') { implementation project(':iceberg-common') compileOnly libs.aliyun.sdk.oss - implementation libs.aliyun.credentials.java - implementation libs.aliyun.tea + compileOnly libs.aliyun.credentials.java + compileOnly libs.aliyun.tea compileOnly libs.jaxb.api compileOnly libs.activation compileOnly libs.jaxb.runtime @@ -543,6 +563,8 @@ project(':iceberg-aws') { testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation libs.awaitility testImplementation libs.jetty.servlet + testImplementation libs.jetty.compression.server + testImplementation libs.jetty.compression.gzip testImplementation libs.mockito.junit.jupiter } @@ -567,7 +589,8 @@ project(':iceberg-aws') { jvmArgs += project.property('extraJvmArgs') } - def s3SignerSpec = "$projectDir/src/main/resources/s3-signer-open-api.yaml" + // TODO delete once s3-signer-open-api.yaml is removed + def s3SignerSpec = layout.projectDirectory.file("src/main/resources/s3-signer-open-api.yaml") tasks.register('validateS3SignerSpec', org.openapitools.generator.gradle.plugin.tasks.ValidateTask) { inputSpec.set(s3SignerSpec) recommend.set(true) @@ -713,10 +736,10 @@ project(':iceberg-bigquery') { implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow') - implementation platform(libs.google.libraries.bom) + compileOnly platform(libs.google.libraries.bom) compileOnly "com.google.cloud:google-cloud-storage" - implementation 
"com.google.cloud:google-cloud-bigquery" - implementation "com.google.cloud:google-cloud-core" + compileOnly "com.google.cloud:google-cloud-bigquery" + compileOnly "com.google.cloud:google-cloud-core" testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') @@ -925,6 +948,13 @@ project(':iceberg-parquet') { exclude group: 'org.apache.avro', module: 'avro' } + testFixturesApi(libs.parquet.hadoop) { + exclude group: 'org.apache.avro', module: 'avro' + // already shaded by Parquet + exclude group: 'it.unimi.dsi' + exclude group: 'org.codehaus.jackson' + } + testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data') @@ -1131,7 +1161,8 @@ project(':iceberg-open-api') { testFixturesImplementation libs.slf4j.simple testFixturesImplementation libs.jetty.servlet - testFixturesImplementation libs.jetty.server + testFixturesImplementation libs.jetty.compression.server + testFixturesImplementation libs.jetty.compression.gzip testFixturesImplementation libs.sqlite.jdbc testFixturesCompileOnly libs.apiguardian @@ -1156,7 +1187,7 @@ project(':iceberg-open-api') { .collectEntries { k, v -> { [(k):v, (k.replaceFirst("rck.", "")):v] }} // strip prefix } - def restCatalogSpec = "$projectDir/rest-catalog-open-api.yaml" + def restCatalogSpec = layout.projectDirectory.file("rest-catalog-open-api.yaml") tasks.register('validateRESTCatalogSpec', org.openapitools.generator.gradle.plugin.tasks.ValidateTask) { inputSpec.set(restCatalogSpec) recommend.set(true) diff --git a/core/src/main/java/org/apache/iceberg/stats/BaseContentStats.java b/core/src/main/java/org/apache/iceberg/BaseContentStats.java similarity index 98% rename from core/src/main/java/org/apache/iceberg/stats/BaseContentStats.java rename to 
core/src/main/java/org/apache/iceberg/BaseContentStats.java index be56c411b6a7..45900b03e299 100644 --- a/core/src/main/java/org/apache/iceberg/stats/BaseContentStats.java +++ b/core/src/main/java/org/apache/iceberg/BaseContentStats.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.iceberg.stats; +package org.apache.iceberg; import java.io.Serializable; import java.util.List; @@ -24,7 +24,6 @@ import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; -import org.apache.iceberg.Schema; import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; @@ -33,14 +32,14 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; -public class BaseContentStats implements ContentStats, Serializable { +class BaseContentStats implements ContentStats, Serializable { private final List> fieldStats; private final Map> fieldStatsById; private final Types.StructType statsStruct; /** Used by Avro reflection to instantiate this class when reading manifest files. 
*/ - public BaseContentStats(Types.StructType projection) { + BaseContentStats(Types.StructType projection) { this.statsStruct = projection; this.fieldStats = Lists.newArrayListWithCapacity(projection.fields().size()); this.fieldStatsById = Maps.newLinkedHashMapWithExpectedSize(projection.fields().size()); diff --git a/core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java b/core/src/main/java/org/apache/iceberg/BaseFieldStats.java similarity index 98% rename from core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java rename to core/src/main/java/org/apache/iceberg/BaseFieldStats.java index 470303179bf5..11da570b8faa 100644 --- a/core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java +++ b/core/src/main/java/org/apache/iceberg/BaseFieldStats.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.iceberg.stats; +package org.apache.iceberg; import java.nio.ByteBuffer; import java.nio.CharBuffer; @@ -28,7 +28,7 @@ import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; -public class BaseFieldStats extends SupportsIndexProjection implements FieldStats { +class BaseFieldStats extends SupportsIndexProjection implements FieldStats { private static final int[] IDENTITY_MAPPING = identityMapping(); private final int fieldId; private final Type type; diff --git a/core/src/main/java/org/apache/iceberg/BaseFile.java b/core/src/main/java/org/apache/iceberg/BaseFile.java index a02e0eff55a2..3c31c50f099f 100644 --- a/core/src/main/java/org/apache/iceberg/BaseFile.java +++ b/core/src/main/java/org/apache/iceberg/BaseFile.java @@ -45,7 +45,7 @@ abstract class BaseFile extends SupportsIndexProjection StructLike, SpecificData.SchemaConstructable, Serializable { - private static final FileContent[] FILE_CONTENT_VALUES = FileContent.values(); + static final Types.StructType EMPTY_STRUCT_TYPE = Types.StructType.of(); static final PartitionData EMPTY_PARTITION_DATA 
= new PartitionData(EMPTY_STRUCT_TYPE) { @@ -316,7 +316,7 @@ public void put(int i, Object value) { protected void internalSet(int pos, T value) { switch (pos) { case 0: - this.content = value != null ? FILE_CONTENT_VALUES[(Integer) value] : FileContent.DATA; + this.content = value != null ? FileContent.fromId((Integer) value) : FileContent.DATA; return; case 1: // always coerce to String for Serializable diff --git a/core/src/main/java/org/apache/iceberg/CatalogProperties.java b/core/src/main/java/org/apache/iceberg/CatalogProperties.java index 59744e50924f..6b85ccbc87bc 100644 --- a/core/src/main/java/org/apache/iceberg/CatalogProperties.java +++ b/core/src/main/java/org/apache/iceberg/CatalogProperties.java @@ -158,6 +158,15 @@ private CatalogProperties() {} public static final String APP_NAME = "app-name"; public static final String USER = "user"; + /** + * Requests that the catalog provide unique locations for new tables. + * + *

Relevant only for catalogs which support unique table locations. + */ + public static final String UNIQUE_TABLE_LOCATION = "unique-table-location"; + + public static final boolean UNIQUE_TABLE_LOCATION_DEFAULT = false; + public static final String AUTH_SESSION_TIMEOUT_MS = "auth.session-timeout-ms"; public static final long AUTH_SESSION_TIMEOUT_MS_DEFAULT = TimeUnit.HOURS.toMillis(1); diff --git a/api/src/main/java/org/apache/iceberg/stats/ContentStats.java b/core/src/main/java/org/apache/iceberg/ContentStats.java similarity index 92% rename from api/src/main/java/org/apache/iceberg/stats/ContentStats.java rename to core/src/main/java/org/apache/iceberg/ContentStats.java index b39db2565163..623a8eb39baf 100644 --- a/api/src/main/java/org/apache/iceberg/stats/ContentStats.java +++ b/core/src/main/java/org/apache/iceberg/ContentStats.java @@ -16,13 +16,12 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.iceberg.stats; +package org.apache.iceberg; import java.util.List; -import org.apache.iceberg.StructLike; import org.apache.iceberg.types.Types; -public interface ContentStats extends StructLike { +interface ContentStats extends StructLike { /** A list of all the {@link FieldStats} */ List> fieldStats(); diff --git a/core/src/main/java/org/apache/iceberg/DeletionVector.java b/core/src/main/java/org/apache/iceberg/DeletionVector.java new file mode 100644 index 000000000000..0fc8f259f075 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/DeletionVector.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import org.apache.iceberg.types.Types; + +/** + * Metadata about a deletion vector. + * + *

Tracks where a DV blob can be read. The DV blob follows the format defined by the + * deletion-vector-v1 blob type in the Puffin spec. + */ +interface DeletionVector { + Types.NestedField LOCATION = + Types.NestedField.required( + 155, "location", Types.StringType.get(), "Location of the file containing the DV"); + Types.NestedField OFFSET = + Types.NestedField.required( + 144, "offset", Types.LongType.get(), "Offset in the file where the DV content starts"); + Types.NestedField SIZE_IN_BYTES = + Types.NestedField.required( + 145, + "size_in_bytes", + Types.LongType.get(), + "Length of the referenced DV content stored in the file"); + Types.NestedField CARDINALITY = + Types.NestedField.required( + 156, + "cardinality", + Types.LongType.get(), + "Number of set bits (deleted rows) in the vector"); + + static Types.StructType schema() { + return Types.StructType.of(LOCATION, OFFSET, SIZE_IN_BYTES, CARDINALITY); + } + + /** Returns the location of the file containing the deletion vector. */ + String location(); + + /** Returns the offset in the file where the deletion vector content starts. */ + long offset(); + + /** Returns the size in bytes of the deletion vector content. */ + long sizeInBytes(); + + /** Returns the number of set bits (deleted rows) in the vector. */ + long cardinality(); + + /** Copies this deletion vector. */ + DeletionVector copy(); +} diff --git a/core/src/main/java/org/apache/iceberg/DeletionVectorStruct.java b/core/src/main/java/org/apache/iceberg/DeletionVectorStruct.java new file mode 100644 index 000000000000..0eb7c2fe1eb6 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/DeletionVectorStruct.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import java.io.Serializable; +import org.apache.iceberg.avro.SupportsIndexProjection; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.Types; + +/** Mutable {@link StructLike} implementation of {@link DeletionVector}. */ +class DeletionVectorStruct extends SupportsIndexProjection implements DeletionVector, Serializable { + private static final Types.StructType BASE_TYPE = + Types.StructType.of( + DeletionVector.LOCATION, + DeletionVector.OFFSET, + DeletionVector.SIZE_IN_BYTES, + DeletionVector.CARDINALITY); + + private String location = null; + private long offset = -1L; + private long sizeInBytes = -1L; + private long cardinality = -1L; + + DeletionVectorStruct(Types.StructType type) { + super(BASE_TYPE, type); + } + + private DeletionVectorStruct(DeletionVectorStruct toCopy) { + super(toCopy); + this.location = toCopy.location; + this.offset = toCopy.offset; + this.sizeInBytes = toCopy.sizeInBytes; + this.cardinality = toCopy.cardinality; + } + + private DeletionVectorStruct(String location, long offset, long sizeInBytes, long cardinality) { + super(BASE_TYPE, BASE_TYPE); + this.location = location; + this.offset = offset; + this.sizeInBytes = sizeInBytes; + this.cardinality = cardinality; + } + + @Override + public 
String location() { + return location; + } + + @Override + public long offset() { + return offset; + } + + @Override + public long sizeInBytes() { + return sizeInBytes; + } + + @Override + public long cardinality() { + return cardinality; + } + + @Override + public DeletionVectorStruct copy() { + return new DeletionVectorStruct(this); + } + + @Override + protected T internalGet(int pos, Class javaClass) { + return javaClass.cast(getByPos(pos)); + } + + private Object getByPos(int pos) { + switch (pos) { + case 0: + return location; + case 1: + return offset; + case 2: + return sizeInBytes; + case 3: + return cardinality; + default: + throw new UnsupportedOperationException("Unknown field ordinal: " + pos); + } + } + + @Override + protected void internalSet(int pos, T value) { + switch (pos) { + case 0: + // always coerce to String for Serializable + this.location = value.toString(); + break; + case 1: + this.offset = (Long) value; + break; + case 2: + this.sizeInBytes = (Long) value; + break; + case 3: + this.cardinality = (Long) value; + break; + default: + // ignore the object, it must be from a newer version of the format + } + } + + static Builder builder() { + return new Builder(); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("location", location) + .add("offset", offset) + .add("size_in_bytes", sizeInBytes) + .add("cardinality", cardinality) + .toString(); + } + + static class Builder { + private String location = null; + private long offset = -1L; + private long sizeInBytes = -1L; + private long cardinality = -1L; + + Builder location(String dvLocation) { + this.location = dvLocation; + return this; + } + + Builder offset(long dvOffset) { + this.offset = dvOffset; + return this; + } + + Builder sizeInBytes(long dvSizeInBytes) { + this.sizeInBytes = dvSizeInBytes; + return this; + } + + Builder cardinality(long dvCardinality) { + this.cardinality = dvCardinality; + return this; + } + + DeletionVectorStruct 
build() { + Preconditions.checkArgument(location != null, "Invalid location: null"); + Preconditions.checkArgument(offset >= 0, "Invalid offset: %s (must be >= 0)", offset); + Preconditions.checkArgument( + sizeInBytes >= 0, "Invalid size in bytes: %s (must be >= 0)", sizeInBytes); + Preconditions.checkArgument( + cardinality >= 0, "Invalid cardinality: %s (must be >= 0)", cardinality); + return new DeletionVectorStruct(location, offset, sizeInBytes, cardinality); + } + } +} diff --git a/core/src/main/java/org/apache/iceberg/EntryStatus.java b/core/src/main/java/org/apache/iceberg/EntryStatus.java new file mode 100644 index 000000000000..ceabeb562415 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/EntryStatus.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +/** Status of an entry in a manifest file. */ +enum EntryStatus { + EXISTING(0), + ADDED(1), + DELETED(2), + /** Indicates an entry that has been replaced by a column update or DV change. Added in v4. 
*/ + REPLACED(3); + + private static final EntryStatus[] VALUES = EntryStatus.values(); + + private final int id; + + EntryStatus(int id) { + this.id = id; + } + + public int id() { + return id; + } + + static EntryStatus fromId(int id) { + return VALUES[id]; + } +} diff --git a/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java b/core/src/main/java/org/apache/iceberg/FieldStatistic.java similarity index 93% rename from api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java rename to core/src/main/java/org/apache/iceberg/FieldStatistic.java index 72058e5253ab..85712384254c 100644 --- a/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java +++ b/core/src/main/java/org/apache/iceberg/FieldStatistic.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.iceberg.stats; +package org.apache.iceberg; import static org.apache.iceberg.types.Types.NestedField.optional; @@ -25,12 +25,12 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; -public enum FieldStatistic { +enum FieldStatistic { VALUE_COUNT(1, "value_count"), NULL_VALUE_COUNT(2, "null_value_count"), NAN_VALUE_COUNT(3, "nan_value_count"), - AVG_VALUE_SIZE(4, "avg_value_size"), - MAX_VALUE_SIZE(5, "max_value_size"), + AVG_VALUE_SIZE(4, "avg_value_size_in_bytes"), + MAX_VALUE_SIZE(5, "max_value_size_in_bytes"), LOWER_BOUND(6, "lower_bound"), UPPER_BOUND(7, "upper_bound"), EXACT_BOUNDS(8, "exact_bounds"); @@ -125,13 +125,13 @@ public static Types.StructType fieldStatsFor(Types.NestedField field, int baseFi baseFieldId + AVG_VALUE_SIZE.offset(), AVG_VALUE_SIZE.fieldName(), Types.IntegerType.get(), - "Avg value size of variable-length types (String, Binary)")); + "Avg value size in bytes of variable-length types (String, Binary)")); fields.add( optional( baseFieldId + MAX_VALUE_SIZE.offset(), MAX_VALUE_SIZE.fieldName(), Types.IntegerType.get(), - "Max value size of variable-length types (String, 
Binary)")); + "Max value size in bytes of variable-length types (String, Binary)")); } fields.add( diff --git a/api/src/main/java/org/apache/iceberg/stats/FieldStats.java b/core/src/main/java/org/apache/iceberg/FieldStats.java similarity index 92% rename from api/src/main/java/org/apache/iceberg/stats/FieldStats.java rename to core/src/main/java/org/apache/iceberg/FieldStats.java index 6411b479af49..e42d774c7cee 100644 --- a/api/src/main/java/org/apache/iceberg/stats/FieldStats.java +++ b/core/src/main/java/org/apache/iceberg/FieldStats.java @@ -16,12 +16,11 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.iceberg.stats; +package org.apache.iceberg; -import org.apache.iceberg.StructLike; import org.apache.iceberg.types.Type; -public interface FieldStats extends StructLike { +interface FieldStats extends StructLike { /** The field ID of the statistic */ int fieldId(); diff --git a/core/src/main/java/org/apache/iceberg/FileCleanupStrategy.java b/core/src/main/java/org/apache/iceberg/FileCleanupStrategy.java index b55280a6537f..573aef057ff6 100644 --- a/core/src/main/java/org/apache/iceberg/FileCleanupStrategy.java +++ b/core/src/main/java/org/apache/iceberg/FileCleanupStrategy.java @@ -26,7 +26,9 @@ import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.SupportsBulkOperations; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.util.Tasks; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -78,13 +80,15 @@ public abstract void cleanFiles( ExpireSnapshots.CleanupLevel cleanupLevel); private static final Schema MANIFEST_PROJECTION = - ManifestFile.schema() - .select( - "manifest_path", - "manifest_length", - "partition_spec_id", - "added_snapshot_id", - "deleted_data_files_count"); + 
TypeUtil.select( + ManifestFile.schema(), + ImmutableSet.of( + ManifestFile.PATH.fieldId(), + ManifestFile.LENGTH.fieldId(), + ManifestFile.SPEC_ID.fieldId(), + ManifestFile.SNAPSHOT_ID.fieldId(), + ManifestFile.ADDED_FILES_COUNT.fieldId(), + ManifestFile.DELETED_FILES_COUNT.fieldId())); protected CloseableIterable readManifests(Snapshot snapshot) { if (snapshot.manifestListLocation() != null) { diff --git a/core/src/main/java/org/apache/iceberg/GenericManifestEntry.java b/core/src/main/java/org/apache/iceberg/GenericManifestEntry.java index b2ce5fa2aa11..f154c982d1c7 100644 --- a/core/src/main/java/org/apache/iceberg/GenericManifestEntry.java +++ b/core/src/main/java/org/apache/iceberg/GenericManifestEntry.java @@ -26,7 +26,6 @@ class GenericManifestEntry> implements ManifestEntry, IndexedRecord, SpecificData.SchemaConstructable, StructLike { - private static final Status[] STATUS_VALUES = Status.values(); private final org.apache.avro.Schema schema; private Status status = Status.EXISTING; private Long snapshotId = null; @@ -159,7 +158,7 @@ public void setFileSequenceNumber(long newFileSequenceNumber) { public void put(int i, Object v) { switch (i) { case 0: - this.status = STATUS_VALUES[(Integer) v]; + this.status = Status.fromId((Integer) v); return; case 1: this.snapshotId = (Long) v; diff --git a/core/src/main/java/org/apache/iceberg/GenericManifestFile.java b/core/src/main/java/org/apache/iceberg/GenericManifestFile.java index ac93222d01b5..9624484ffe0c 100644 --- a/core/src/main/java/org/apache/iceberg/GenericManifestFile.java +++ b/core/src/main/java/org/apache/iceberg/GenericManifestFile.java @@ -40,8 +40,6 @@ public class GenericManifestFile extends SupportsIndexProjection implements ManifestFile, StructLike, IndexedRecord, SchemaConstructable, Serializable { private static final Schema AVRO_SCHEMA = AvroSchemaUtil.convert(ManifestFile.schema(), "manifest_file"); - private static final ManifestContent[] MANIFEST_CONTENT_VALUES = ManifestContent.values(); 
- private transient Schema avroSchema; // not final for Java serialization // data fields @@ -343,7 +341,7 @@ protected void internalSet(int basePos, T value) { return; case 3: this.content = - value != null ? MANIFEST_CONTENT_VALUES[(Integer) value] : ManifestContent.DATA; + value != null ? ManifestContent.fromId((Integer) value) : ManifestContent.DATA; return; case 4: this.sequenceNumber = value != null ? (Long) value : 0; diff --git a/core/src/main/java/org/apache/iceberg/ManifestEntry.java b/core/src/main/java/org/apache/iceberg/ManifestEntry.java index 4dce92cf5c2f..635231069ffc 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestEntry.java +++ b/core/src/main/java/org/apache/iceberg/ManifestEntry.java @@ -30,6 +30,8 @@ enum Status { ADDED(1), DELETED(2); + private static final Status[] VALUES = Status.values(); + private final int id; Status(int id) { @@ -39,6 +41,10 @@ enum Status { public int id() { return id; } + + static Status fromId(int id) { + return VALUES[id]; + } } // ids for data-file columns are assigned from 1000 diff --git a/core/src/main/java/org/apache/iceberg/ManifestInfo.java b/core/src/main/java/org/apache/iceberg/ManifestInfo.java new file mode 100644 index 000000000000..e87287911426 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/ManifestInfo.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import java.nio.ByteBuffer; +import org.apache.iceberg.types.Types; + +/** Summary information about a manifest referenced by a root manifest entry. */ +interface ManifestInfo { + Types.NestedField ADDED_FILES_COUNT = + Types.NestedField.required( + 504, "added_files_count", Types.IntegerType.get(), "Number of files added"); + Types.NestedField EXISTING_FILES_COUNT = + Types.NestedField.required( + 505, "existing_files_count", Types.IntegerType.get(), "Number of existing files"); + Types.NestedField DELETED_FILES_COUNT = + Types.NestedField.required( + 506, "deleted_files_count", Types.IntegerType.get(), "Number of deleted files"); + Types.NestedField REPLACED_FILES_COUNT = + Types.NestedField.required( + 520, "replaced_files_count", Types.IntegerType.get(), "Number of replaced files"); + Types.NestedField ADDED_ROWS_COUNT = + Types.NestedField.required( + 512, "added_rows_count", Types.LongType.get(), "Number of rows in added files"); + Types.NestedField EXISTING_ROWS_COUNT = + Types.NestedField.required( + 513, "existing_rows_count", Types.LongType.get(), "Number of rows in existing files"); + Types.NestedField DELETED_ROWS_COUNT = + Types.NestedField.required( + 514, "deleted_rows_count", Types.LongType.get(), "Number of rows in deleted files"); + Types.NestedField REPLACED_ROWS_COUNT = + Types.NestedField.required( + 521, "replaced_rows_count", Types.LongType.get(), "Number of rows in replaced files"); + Types.NestedField MIN_SEQUENCE_NUMBER = + Types.NestedField.required( + 516, + "min_sequence_number", + 
Types.LongType.get(), + "Minimum sequence number of files in this manifest"); + Types.NestedField DV = + Types.NestedField.optional( + 522, "dv", Types.BinaryType.get(), "Deletion vector for manifest entries"); + Types.NestedField DV_CARDINALITY = + Types.NestedField.optional( + 523, + "dv_cardinality", + Types.LongType.get(), + "Number of entries marked as deleted in the DV"); + + static Types.StructType schema() { + return Types.StructType.of( + ADDED_FILES_COUNT, + EXISTING_FILES_COUNT, + DELETED_FILES_COUNT, + REPLACED_FILES_COUNT, + ADDED_ROWS_COUNT, + EXISTING_ROWS_COUNT, + DELETED_ROWS_COUNT, + REPLACED_ROWS_COUNT, + MIN_SEQUENCE_NUMBER, + DV, + DV_CARDINALITY); + } + + /** Returns the number of files added by this manifest. */ + int addedFilesCount(); + + /** Returns the number of existing files referenced by this manifest. */ + int existingFilesCount(); + + /** Returns the number of deleted files in this manifest. */ + int deletedFilesCount(); + + /** Returns the number of replaced files in this manifest. */ + int replacedFilesCount(); + + /** Returns the number of rows in added files. */ + long addedRowsCount(); + + /** Returns the number of rows in existing files. */ + long existingRowsCount(); + + /** Returns the number of rows in deleted files. */ + long deletedRowsCount(); + + /** Returns the number of rows in replaced files. */ + long replacedRowsCount(); + + /** Returns the minimum sequence number of files in this manifest. */ + long minSequenceNumber(); + + /** Returns the deletion vector bitmap, or null if not present. */ + ByteBuffer dv(); + + /** Returns the number of entries marked as deleted in the DV, or null if not present. */ + Long dvCardinality(); + + /** Copies this manifest info. 
*/ + ManifestInfo copy(); +} diff --git a/core/src/main/java/org/apache/iceberg/ManifestInfoStruct.java b/core/src/main/java/org/apache/iceberg/ManifestInfoStruct.java new file mode 100644 index 000000000000..922047bffedd --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/ManifestInfoStruct.java @@ -0,0 +1,384 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.util.Arrays; +import org.apache.iceberg.avro.SupportsIndexProjection; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ByteBuffers; + +/** Mutable {@link StructLike} implementation of {@link ManifestInfo}. 
*/ +class ManifestInfoStruct extends SupportsIndexProjection implements ManifestInfo, Serializable { + private static final Types.StructType BASE_TYPE = + Types.StructType.of( + ManifestInfo.ADDED_FILES_COUNT, + ManifestInfo.EXISTING_FILES_COUNT, + ManifestInfo.DELETED_FILES_COUNT, + ManifestInfo.REPLACED_FILES_COUNT, + ManifestInfo.ADDED_ROWS_COUNT, + ManifestInfo.EXISTING_ROWS_COUNT, + ManifestInfo.DELETED_ROWS_COUNT, + ManifestInfo.REPLACED_ROWS_COUNT, + ManifestInfo.MIN_SEQUENCE_NUMBER, + ManifestInfo.DV, + ManifestInfo.DV_CARDINALITY); + + private int addedFilesCount = -1; + private int existingFilesCount = -1; + private int deletedFilesCount = -1; + private int replacedFilesCount = -1; + private long addedRowsCount = -1L; + private long existingRowsCount = -1L; + private long deletedRowsCount = -1L; + private long replacedRowsCount = -1L; + private long minSequenceNumber = -1L; + private byte[] dv = null; + private Long dvCardinality = null; + + ManifestInfoStruct(Types.StructType type) { + super(BASE_TYPE, type); + } + + private ManifestInfoStruct(ManifestInfoStruct toCopy) { + super(toCopy); + this.addedFilesCount = toCopy.addedFilesCount; + this.existingFilesCount = toCopy.existingFilesCount; + this.deletedFilesCount = toCopy.deletedFilesCount; + this.replacedFilesCount = toCopy.replacedFilesCount; + this.addedRowsCount = toCopy.addedRowsCount; + this.existingRowsCount = toCopy.existingRowsCount; + this.deletedRowsCount = toCopy.deletedRowsCount; + this.replacedRowsCount = toCopy.replacedRowsCount; + this.minSequenceNumber = toCopy.minSequenceNumber; + this.dv = toCopy.dv != null ? 
Arrays.copyOf(toCopy.dv, toCopy.dv.length) : null; + this.dvCardinality = toCopy.dvCardinality; + } + + private ManifestInfoStruct( + int addedFilesCount, + int existingFilesCount, + int deletedFilesCount, + int replacedFilesCount, + long addedRowsCount, + long existingRowsCount, + long deletedRowsCount, + long replacedRowsCount, + long minSequenceNumber, + byte[] dv, + Long dvCardinality) { + super(BASE_TYPE, BASE_TYPE); + this.addedFilesCount = addedFilesCount; + this.existingFilesCount = existingFilesCount; + this.deletedFilesCount = deletedFilesCount; + this.replacedFilesCount = replacedFilesCount; + this.addedRowsCount = addedRowsCount; + this.existingRowsCount = existingRowsCount; + this.deletedRowsCount = deletedRowsCount; + this.replacedRowsCount = replacedRowsCount; + this.minSequenceNumber = minSequenceNumber; + this.dv = dv; + this.dvCardinality = dvCardinality; + } + + @Override + public int addedFilesCount() { + return addedFilesCount; + } + + @Override + public int existingFilesCount() { + return existingFilesCount; + } + + @Override + public int deletedFilesCount() { + return deletedFilesCount; + } + + @Override + public int replacedFilesCount() { + return replacedFilesCount; + } + + @Override + public long addedRowsCount() { + return addedRowsCount; + } + + @Override + public long existingRowsCount() { + return existingRowsCount; + } + + @Override + public long deletedRowsCount() { + return deletedRowsCount; + } + + @Override + public long replacedRowsCount() { + return replacedRowsCount; + } + + @Override + public long minSequenceNumber() { + return minSequenceNumber; + } + + @Override + public ByteBuffer dv() { + return dv != null ? 
ByteBuffer.wrap(dv) : null; + } + + @Override + public Long dvCardinality() { + return dvCardinality; + } + + @Override + public ManifestInfoStruct copy() { + return new ManifestInfoStruct(this); + } + + @Override + protected T internalGet(int pos, Class javaClass) { + return javaClass.cast(getByPos(pos)); + } + + private Object getByPos(int pos) { + switch (pos) { + case 0: + return addedFilesCount; + case 1: + return existingFilesCount; + case 2: + return deletedFilesCount; + case 3: + return replacedFilesCount; + case 4: + return addedRowsCount; + case 5: + return existingRowsCount; + case 6: + return deletedRowsCount; + case 7: + return replacedRowsCount; + case 8: + return minSequenceNumber; + case 9: + return dv(); + case 10: + return dvCardinality; + default: + throw new UnsupportedOperationException("Unknown field ordinal: " + pos); + } + } + + @Override + protected void internalSet(int pos, T value) { + switch (pos) { + case 0: + this.addedFilesCount = (Integer) value; + break; + case 1: + this.existingFilesCount = (Integer) value; + break; + case 2: + this.deletedFilesCount = (Integer) value; + break; + case 3: + this.replacedFilesCount = (Integer) value; + break; + case 4: + this.addedRowsCount = (Long) value; + break; + case 5: + this.existingRowsCount = (Long) value; + break; + case 6: + this.deletedRowsCount = (Long) value; + break; + case 7: + this.replacedRowsCount = (Long) value; + break; + case 8: + this.minSequenceNumber = (Long) value; + break; + case 9: + this.dv = ByteBuffers.toByteArray((ByteBuffer) value); + break; + case 10: + this.dvCardinality = (Long) value; + break; + default: + // ignore the object, it must be from a newer version of the format + } + } + + static Builder builder() { + return new Builder(); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("added_files_count", addedFilesCount) + .add("existing_files_count", existingFilesCount) + .add("deleted_files_count", deletedFilesCount) 
+ .add("replaced_files_count", replacedFilesCount) + .add("added_rows_count", addedRowsCount) + .add("existing_rows_count", existingRowsCount) + .add("deleted_rows_count", deletedRowsCount) + .add("replaced_rows_count", replacedRowsCount) + .add("min_sequence_number", minSequenceNumber) + .add("dv", dv == null ? "null" : "(binary)") + .add("dv_cardinality", dvCardinality == null ? "null" : dvCardinality) + .toString(); + } + + static class Builder { + private int addedFilesCount = -1; + private int existingFilesCount = -1; + private int deletedFilesCount = -1; + private int replacedFilesCount = -1; + private long addedRowsCount = -1L; + private long existingRowsCount = -1L; + private long deletedRowsCount = -1L; + private long replacedRowsCount = -1L; + private long minSequenceNumber = -1L; + private byte[] dv = null; + private Long dvCardinality = null; + + Builder addedFilesCount(int count) { + this.addedFilesCount = count; + return this; + } + + Builder existingFilesCount(int count) { + this.existingFilesCount = count; + return this; + } + + Builder deletedFilesCount(int count) { + this.deletedFilesCount = count; + return this; + } + + Builder replacedFilesCount(int count) { + this.replacedFilesCount = count; + return this; + } + + Builder addedRowsCount(long count) { + this.addedRowsCount = count; + return this; + } + + Builder existingRowsCount(long count) { + this.existingRowsCount = count; + return this; + } + + Builder deletedRowsCount(long count) { + this.deletedRowsCount = count; + return this; + } + + Builder replacedRowsCount(long count) { + this.replacedRowsCount = count; + return this; + } + + Builder minSequenceNumber(long sequenceNumber) { + this.minSequenceNumber = sequenceNumber; + return this; + } + + Builder dv(ByteBuffer buffer) { + this.dv = buffer != null ? 
ByteBuffers.toByteArray(buffer) : null; + return this; + } + + Builder dv(byte[] buffer) { + this.dv = buffer; + return this; + } + + Builder dvCardinality(Long cardinality) { + this.dvCardinality = cardinality; + return this; + } + + ManifestInfoStruct build() { + Preconditions.checkArgument( + addedFilesCount >= 0, "Invalid added files count: %s (must be >= 0)", addedFilesCount); + Preconditions.checkArgument( + existingFilesCount >= 0, + "Invalid existing files count: %s (must be >= 0)", + existingFilesCount); + Preconditions.checkArgument( + deletedFilesCount >= 0, + "Invalid deleted files count: %s (must be >= 0)", + deletedFilesCount); + Preconditions.checkArgument( + replacedFilesCount >= 0, + "Invalid replaced files count: %s (must be >= 0)", + replacedFilesCount); + Preconditions.checkArgument( + addedRowsCount >= 0, "Invalid added rows count: %s (must be >= 0)", addedRowsCount); + Preconditions.checkArgument( + existingRowsCount >= 0, + "Invalid existing rows count: %s (must be >= 0)", + existingRowsCount); + Preconditions.checkArgument( + deletedRowsCount >= 0, "Invalid deleted rows count: %s (must be >= 0)", deletedRowsCount); + Preconditions.checkArgument( + replacedRowsCount >= 0, + "Invalid replaced rows count: %s (must be >= 0)", + replacedRowsCount); + Preconditions.checkArgument( + minSequenceNumber >= 0, + "Invalid min sequence number: %s (must be >= 0)", + minSequenceNumber); + Preconditions.checkArgument( + (dv == null) == (dvCardinality == null), + "Invalid DV and cardinality: must both be null or non-null"); + Preconditions.checkArgument( + dvCardinality == null || dvCardinality > 0, + "Invalid DV cardinality: %s (must be positive)", + dvCardinality); + return new ManifestInfoStruct( + addedFilesCount, + existingFilesCount, + deletedFilesCount, + replacedFilesCount, + addedRowsCount, + existingRowsCount, + deletedRowsCount, + replacedRowsCount, + minSequenceNumber, + dv, + dvCardinality); + } + } +} diff --git 
a/core/src/main/java/org/apache/iceberg/ManifestReader.java b/core/src/main/java/org/apache/iceberg/ManifestReader.java index 668a3764de1d..09bbe8b0cc6b 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestReader.java +++ b/core/src/main/java/org/apache/iceberg/ManifestReader.java @@ -43,6 +43,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.PartitionSet; import org.slf4j.Logger; @@ -68,6 +69,11 @@ public class ManifestReader> extends CloseableGroup "upper_bounds", "record_count"); + private static final Schema STATUS_ONLY_PROJECTION = + TypeUtil.select( + ManifestEntry.getSchema(Types.StructType.of()), + ImmutableSet.of(ManifestEntry.STATUS.fieldId())); + protected enum FileType { DATA_FILES(GenericDataFile.class), DELETE_FILES(GenericDeleteFile.class); @@ -157,9 +163,7 @@ private static > Map readMetadata(Input Map metadata; try { try (CloseableIterable> headerReader = - InternalData.read(FileFormat.AVRO, inputFile) - .project(ManifestEntry.getSchema(Types.StructType.of()).select("status")) - .build()) { + InternalData.read(FileFormat.AVRO, inputFile).project(STATUS_ONLY_PROJECTION).build()) { if (headerReader instanceof AvroIterable) { metadata = ((AvroIterable>) headerReader).getMetadata(); diff --git a/core/src/main/java/org/apache/iceberg/MergingSnapshotProducer.java b/core/src/main/java/org/apache/iceberg/MergingSnapshotProducer.java index eed4e56dc05a..e072382543b7 100644 --- a/core/src/main/java/org/apache/iceberg/MergingSnapshotProducer.java +++ b/core/src/main/java/org/apache/iceberg/MergingSnapshotProducer.java @@ -289,7 +289,11 @@ private void addInternal(DeleteFile file) { protected void validateNewDeleteFile(DeleteFile file) { Preconditions.checkNotNull(file, "Invalid 
delete file: null"); - switch (formatVersion()) { + validateDeleteFileForVersion(file, formatVersion()); + } + + private static void validateDeleteFileForVersion(DeleteFile file, int formatVersion) { + switch (formatVersion) { case 1: throw new IllegalArgumentException("Deletes are supported in V2 and above"); case 2: @@ -303,11 +307,11 @@ protected void validateNewDeleteFile(DeleteFile file) { Preconditions.checkArgument( file.content() == FileContent.EQUALITY_DELETES || ContentFileUtil.isDV(file), "Must use DVs for position deletes in V%s: %s", - formatVersion(), + formatVersion, file.location()); break; default: - throw new IllegalArgumentException("Unsupported format version: " + formatVersion()); + throw new IllegalArgumentException("Unsupported format version: " + formatVersion); } } @@ -959,8 +963,16 @@ protected Map summary() { return summaryBuilder.build(); } + // guard buffered deletes against concurrent format upgrade + private void validateDeleteFilesForVersion(int currentFormatVersion) { + for (DeleteFile file : v2Deletes) { + validateDeleteFileForVersion(file, currentFormatVersion); + } + } + @Override public List apply(TableMetadata base, Snapshot snapshot) { + validateDeleteFilesForVersion(base.formatVersion()); // filter any existing manifests List filtered = filterManager.filterManifests( diff --git a/core/src/main/java/org/apache/iceberg/MetricsConfig.java b/core/src/main/java/org/apache/iceberg/MetricsConfig.java index 593dbc570b8a..2b55bcbeab22 100644 --- a/core/src/main/java/org/apache/iceberg/MetricsConfig.java +++ b/core/src/main/java/org/apache/iceberg/MetricsConfig.java @@ -223,7 +223,7 @@ public Set map( * @param order sort order columns, will be promoted to truncate(16) * @return metrics configuration */ - private static MetricsConfig from(Map props, Schema schema, SortOrder order) { + public static MetricsConfig from(Map props, Schema schema, SortOrder order) { int maxInferredDefaultColumns = maxInferredColumnDefaults(props); Map 
columnModes = Maps.newHashMap(); diff --git a/core/src/main/java/org/apache/iceberg/MetricsUtil.java b/core/src/main/java/org/apache/iceberg/MetricsUtil.java index 944e833b31d7..72c57a8bebcf 100644 --- a/core/src/main/java/org/apache/iceberg/MetricsUtil.java +++ b/core/src/main/java/org/apache/iceberg/MetricsUtil.java @@ -34,9 +34,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.stats.BaseContentStats; -import org.apache.iceberg.stats.BaseFieldStats; -import org.apache.iceberg.stats.ContentStats; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.TypeUtil; @@ -482,7 +479,7 @@ public void set(int pos, T value) { } } - public static ContentStats fromMetrics(Schema schema, Metrics metrics) { + static ContentStats fromMetrics(Schema schema, Metrics metrics) { if (null == metrics) { return null; } diff --git a/core/src/main/java/org/apache/iceberg/PartitionsTable.java b/core/src/main/java/org/apache/iceberg/PartitionsTable.java index 09c6e7893b7e..10366db5a55d 100644 --- a/core/src/main/java/org/apache/iceberg/PartitionsTable.java +++ b/core/src/main/java/org/apache/iceberg/PartitionsTable.java @@ -27,7 +27,9 @@ import org.apache.iceberg.expressions.ManifestEvaluator; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.types.Comparators; +import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ParallelIterable; import org.apache.iceberg.util.PartitionUtil; @@ -37,6 +39,58 @@ /** A {@link Table} implementation that exposes a table's partitions as rows. 
*/ public class PartitionsTable extends BaseMetadataTable { + private static final int PARTITION_FIELD_ID = 1; + + private static final Types.NestedField SPEC_ID = + Types.NestedField.required(4, "spec_id", Types.IntegerType.get()); + private static final Types.NestedField RECORD_COUNT = + Types.NestedField.required( + 2, "record_count", Types.LongType.get(), "Count of records in data files"); + private static final Types.NestedField FILE_COUNT = + Types.NestedField.required(3, "file_count", Types.IntegerType.get(), "Count of data files"); + private static final Types.NestedField TOTAL_DATA_FILE_SIZE_IN_BYTES = + Types.NestedField.required( + 11, + "total_data_file_size_in_bytes", + Types.LongType.get(), + "Total size in bytes of data files"); + private static final Types.NestedField POSITION_DELETE_RECORD_COUNT = + Types.NestedField.required( + 5, + "position_delete_record_count", + Types.LongType.get(), + "Count of records in position delete files"); + private static final Types.NestedField POSITION_DELETE_FILE_COUNT = + Types.NestedField.required( + 6, + "position_delete_file_count", + Types.IntegerType.get(), + "Count of position delete files"); + private static final Types.NestedField EQUALITY_DELETE_RECORD_COUNT = + Types.NestedField.required( + 7, + "equality_delete_record_count", + Types.LongType.get(), + "Count of records in equality delete files"); + private static final Types.NestedField EQUALITY_DELETE_FILE_COUNT = + Types.NestedField.required( + 8, + "equality_delete_file_count", + Types.IntegerType.get(), + "Count of equality delete files"); + private static final Types.NestedField LAST_UPDATED_AT = + Types.NestedField.optional( + 9, + "last_updated_at", + Types.TimestampType.withZone(), + "Commit time of snapshot that last updated this partition"); + private static final Types.NestedField LAST_UPDATED_SNAPSHOT_ID = + Types.NestedField.optional( + 10, + "last_updated_snapshot_id", + Types.LongType.get(), + "Id of snapshot that last updated this 
partition"); + private final Schema schema; private final boolean unpartitionedTable; @@ -50,47 +104,18 @@ public class PartitionsTable extends BaseMetadataTable { this.schema = new Schema( - Types.NestedField.required(1, "partition", Partitioning.partitionType(table)), - Types.NestedField.required(4, "spec_id", Types.IntegerType.get()), - Types.NestedField.required( - 2, "record_count", Types.LongType.get(), "Count of records in data files"), - Types.NestedField.required( - 3, "file_count", Types.IntegerType.get(), "Count of data files"), - Types.NestedField.required( - 11, - "total_data_file_size_in_bytes", - Types.LongType.get(), - "Total size in bytes of data files"), Types.NestedField.required( - 5, - "position_delete_record_count", - Types.LongType.get(), - "Count of records in position delete files"), - Types.NestedField.required( - 6, - "position_delete_file_count", - Types.IntegerType.get(), - "Count of position delete files"), - Types.NestedField.required( - 7, - "equality_delete_record_count", - Types.LongType.get(), - "Count of records in equality delete files"), - Types.NestedField.required( - 8, - "equality_delete_file_count", - Types.IntegerType.get(), - "Count of equality delete files"), - Types.NestedField.optional( - 9, - "last_updated_at", - Types.TimestampType.withZone(), - "Commit time of snapshot that last updated this partition"), - Types.NestedField.optional( - 10, - "last_updated_snapshot_id", - Types.LongType.get(), - "Id of snapshot that last updated this partition")); + PARTITION_FIELD_ID, "partition", Partitioning.partitionType(table)), + SPEC_ID, + RECORD_COUNT, + FILE_COUNT, + TOTAL_DATA_FILE_SIZE_IN_BYTES, + POSITION_DELETE_RECORD_COUNT, + POSITION_DELETE_FILE_COUNT, + EQUALITY_DELETE_RECORD_COUNT, + EQUALITY_DELETE_FILE_COUNT, + LAST_UPDATED_AT, + LAST_UPDATED_SNAPSHOT_ID); this.unpartitionedTable = Partitioning.partitionType(table).fields().isEmpty(); } @@ -102,16 +127,18 @@ public TableScan newScan() { @Override public Schema 
schema() { if (unpartitionedTable) { - return schema.select( - "record_count", - "file_count", - "total_data_file_size_in_bytes", - "position_delete_record_count", - "position_delete_file_count", - "equality_delete_record_count", - "equality_delete_file_count", - "last_updated_at", - "last_updated_snapshot_id"); + return TypeUtil.select( + schema, + ImmutableSet.of( + RECORD_COUNT.fieldId(), + FILE_COUNT.fieldId(), + TOTAL_DATA_FILE_SIZE_IN_BYTES.fieldId(), + POSITION_DELETE_RECORD_COUNT.fieldId(), + POSITION_DELETE_FILE_COUNT.fieldId(), + EQUALITY_DELETE_RECORD_COUNT.fieldId(), + EQUALITY_DELETE_FILE_COUNT.fieldId(), + LAST_UPDATED_AT.fieldId(), + LAST_UPDATED_SNAPSHOT_ID.fieldId())); } return schema; } diff --git a/api/src/main/java/org/apache/iceberg/stats/StatsUtil.java b/core/src/main/java/org/apache/iceberg/StatsUtil.java similarity index 98% rename from api/src/main/java/org/apache/iceberg/stats/StatsUtil.java rename to core/src/main/java/org/apache/iceberg/StatsUtil.java index 2ff52f92bdda..39fef3d372d3 100644 --- a/api/src/main/java/org/apache/iceberg/stats/StatsUtil.java +++ b/core/src/main/java/org/apache/iceberg/StatsUtil.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.iceberg.stats; +package org.apache.iceberg; import static org.apache.iceberg.types.Types.NestedField.optional; @@ -25,7 +25,6 @@ import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; -import org.apache.iceberg.Schema; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.types.TypeUtil; @@ -33,7 +32,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class StatsUtil { +class StatsUtil { private static final Logger LOG = LoggerFactory.getLogger(StatsUtil.class); // the number of reserved field IDs from the reserved field ID space as defined in // https://iceberg.apache.org/spec/#reserved-field-ids diff --git a/core/src/main/java/org/apache/iceberg/TableProperties.java b/core/src/main/java/org/apache/iceberg/TableProperties.java index 1f778984af17..021ef95d9122 100644 --- a/core/src/main/java/org/apache/iceberg/TableProperties.java +++ b/core/src/main/java/org/apache/iceberg/TableProperties.java @@ -135,6 +135,10 @@ private TableProperties() {} "write.delete.parquet.page-size-bytes"; public static final int PARQUET_PAGE_SIZE_BYTES_DEFAULT = 1024 * 1024; // 1 MB + public static final String PARQUET_PAGE_VERSION = "write.parquet.page-version"; + public static final String DELETE_PARQUET_PAGE_VERSION = "write.delete.parquet.page-version"; + public static final String PARQUET_PAGE_VERSION_DEFAULT = "v1"; + public static final String PARQUET_PAGE_ROW_LIMIT = "write.parquet.page-row-limit"; public static final String DELETE_PARQUET_PAGE_ROW_LIMIT = "write.delete.parquet.page-row-limit"; public static final int PARQUET_PAGE_ROW_LIMIT_DEFAULT = 20_000; @@ -154,6 +158,12 @@ private TableProperties() {} "write.delete.parquet.compression-level"; public static final String PARQUET_COMPRESSION_LEVEL_DEFAULT = null; + public static final String PARQUET_SHRED_VARIANTS = "write.parquet.shred-variants"; + 
public static final boolean PARQUET_SHRED_VARIANTS_DEFAULT = false; + public static final String PARQUET_VARIANT_BUFFER_SIZE = + "write.parquet.variant-inference-buffer-size"; + public static final int PARQUET_VARIANT_BUFFER_SIZE_DEFAULT = 100; + public static final String PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT = "write.parquet.row-group-check-min-record-count"; public static final String DELETE_PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT = @@ -346,6 +356,10 @@ private TableProperties() {} public static final String SPARK_WRITE_ACCEPT_ANY_SCHEMA = "write.spark.accept-any-schema"; public static final boolean SPARK_WRITE_ACCEPT_ANY_SCHEMA_DEFAULT = false; + public static final String SPARK_WRITE_AUTO_SCHEMA_EVOLUTION = + "write.spark.auto-schema-evolution.enabled"; + public static final boolean SPARK_WRITE_AUTO_SCHEMA_EVOLUTION_DEFAULT = true; + public static final String SPARK_WRITE_ADVISORY_PARTITION_SIZE_BYTES = "write.spark.advisory-partition-size-bytes"; diff --git a/core/src/main/java/org/apache/iceberg/TrackedFile.java b/core/src/main/java/org/apache/iceberg/TrackedFile.java new file mode 100644 index 000000000000..d9ae100ac651 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/TrackedFile.java @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.List; +import java.util.Set; +import org.apache.iceberg.types.Types; + +/** A file tracked by a manifest. */ +interface TrackedFile { + Types.NestedField TRACKING = + Types.NestedField.required( + 147, "tracking", Tracking.schema(), "Tracking information for this entry"); + Types.NestedField CONTENT_TYPE = + Types.NestedField.required( + 134, + "content_type", + Types.IntegerType.get(), + "Type of content: 0=DATA, 2=EQUALITY_DELETES, 3=DATA_MANIFEST, 4=DELETE_MANIFEST"); + Types.NestedField LOCATION = + Types.NestedField.required(100, "location", Types.StringType.get(), "Location of the file"); + Types.NestedField FILE_FORMAT = + Types.NestedField.required( + 101, + "file_format", + Types.StringType.get(), + "String file format name: avro, orc, or parquet"); + Types.NestedField RECORD_COUNT = + Types.NestedField.required( + 103, "record_count", Types.LongType.get(), "Number of records in this file"); + Types.NestedField FILE_SIZE_IN_BYTES = + Types.NestedField.required( + 104, "file_size_in_bytes", Types.LongType.get(), "Total file size in bytes"); + Types.NestedField SPEC_ID = + Types.NestedField.optional( + 141, "spec_id", Types.IntegerType.get(), "Spec ID used to partition the file"); + + int CONTENT_STATS_ID = 146; + String CONTENT_STATS_NAME = "content_stats"; + String CONTENT_STATS_DOC = "Content statistics for this entry"; + + Types.NestedField SORT_ORDER_ID = + Types.NestedField.optional( + 140, "sort_order_id", Types.IntegerType.get(), "ID of the sort order for this file"); + Types.NestedField DELETION_VECTOR = + Types.NestedField.optional( + 148, "deletion_vector", DeletionVector.schema(), "Deletion vector for the data file"); + Types.NestedField MANIFEST_INFO = + Types.NestedField.optional( + 150, + "manifest_info", + 
ManifestInfo.schema(), + "Metadata fields specific to manifest files"); + Types.NestedField KEY_METADATA = + Types.NestedField.optional( + 131, + "key_metadata", + Types.BinaryType.get(), + "Implementation-specific key metadata for encryption"); + Types.NestedField SPLIT_OFFSETS = + Types.NestedField.optional( + 132, + "split_offsets", + Types.ListType.ofRequired(133, Types.LongType.get()), + "Split offsets for the data file"); + Types.NestedField EQUALITY_IDS = + Types.NestedField.optional( + 135, + "equality_ids", + Types.ListType.ofRequired(136, Types.IntegerType.get()), + "Field ids used to determine row equality in equality delete files"); + + static Types.StructType schemaWithContentStats(Types.StructType contentStatsType) { + return Types.StructType.of( + TRACKING, + CONTENT_TYPE, + LOCATION, + FILE_FORMAT, + RECORD_COUNT, + FILE_SIZE_IN_BYTES, + SPEC_ID, + Types.NestedField.optional( + CONTENT_STATS_ID, CONTENT_STATS_NAME, contentStatsType, CONTENT_STATS_DOC), + SORT_ORDER_ID, + DELETION_VECTOR, + MANIFEST_INFO, + KEY_METADATA, + SPLIT_OFFSETS, + EQUALITY_IDS); + } + + /** Returns the tracking information for this entry. */ + Tracking tracking(); + + /** Returns the type of content stored by this entry. */ + FileContent contentType(); + + /** Returns the location of the file. */ + String location(); + + /** Returns the format of the file. */ + FileFormat fileFormat(); + + /** Returns the number of records in this file. */ + long recordCount(); + + /** Returns the total file size in bytes. */ + long fileSizeInBytes(); + + /** Returns the ID of the partition spec used to partition this file, or null. */ + Integer specId(); + + /** Returns the content stats for this entry. */ + ContentStats contentStats(); + + /** Returns the ID of the sort order for this file, or null. */ + Integer sortOrderId(); + + /** Returns the deletion vector for this entry, or null if there is no deletion vector. 
*/ + DeletionVector deletionVector(); + + /** Returns the manifest summary information, or null for non-manifest entries. */ + ManifestInfo manifestInfo(); + + /** Returns encryption key metadata, or null if the file is not encrypted. */ + ByteBuffer keyMetadata(); + + /** Returns the list of recommended split locations, or null. */ + List splitOffsets(); + + /** Returns the set of field IDs used for equality comparison in equality delete files. */ + List equalityIds(); + + /** Copies this tracked file. */ + TrackedFile copy(); + + /** + * Copies this tracked file with stats only for specific columns. + * + * @param requestedColumnIds table field IDs for which to keep stats + */ + TrackedFile copyWithStats(Set requestedColumnIds); + + /** Copies this tracked file without stats. */ + default TrackedFile copyWithoutStats() { + return copyWithStats(Collections.emptySet()); + } +} diff --git a/core/src/main/java/org/apache/iceberg/TrackedFileStruct.java b/core/src/main/java/org/apache/iceberg/TrackedFileStruct.java new file mode 100644 index 000000000000..ba9fd362038a --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/TrackedFileStruct.java @@ -0,0 +1,328 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.List; +import java.util.Set; +import org.apache.iceberg.avro.SupportsIndexProjection; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.iceberg.util.ByteBuffers; + +/** Mutable {@link StructLike} implementation of {@link TrackedFile}. */ +class TrackedFileStruct extends SupportsIndexProjection implements TrackedFile, Serializable { + private static final Types.StructType EMPTY_STRUCT_TYPE = Types.StructType.of(); + + private static final Types.StructType BASE_TYPE = + Types.StructType.of( + TrackedFile.TRACKING, + TrackedFile.CONTENT_TYPE, + TrackedFile.LOCATION, + TrackedFile.FILE_FORMAT, + TrackedFile.RECORD_COUNT, + TrackedFile.FILE_SIZE_IN_BYTES, + TrackedFile.SPEC_ID, + Types.NestedField.optional( + TrackedFile.CONTENT_STATS_ID, + TrackedFile.CONTENT_STATS_NAME, + EMPTY_STRUCT_TYPE, + TrackedFile.CONTENT_STATS_DOC), + TrackedFile.SORT_ORDER_ID, + TrackedFile.DELETION_VECTOR, + TrackedFile.MANIFEST_INFO, + TrackedFile.KEY_METADATA, + TrackedFile.SPLIT_OFFSETS, + TrackedFile.EQUALITY_IDS); + + private FileContent contentType = null; + private String location = null; + private FileFormat fileFormat = null; + private long recordCount = -1L; + private long fileSizeInBytes = -1L; + private Integer specId = null; + + // optional fields + private Tracking tracking = null; + private ContentStats contentStats = null; + private Integer sortOrderId = null; + private DeletionVector deletionVector = null; + private ManifestInfo manifestInfo = null; + private byte[] keyMetadata = null; + private long[] splitOffsets = null; + private int[] equalityIds = null; + + /** Used by internal readers to instantiate this class with a projection schema. 
*/ + TrackedFileStruct(Types.StructType projection) { + super(BASE_TYPE, projection); + } + + /** No-projection constructor for direct construction. */ + TrackedFileStruct() { + super(BASE_TYPE.fields().size()); + } + + /** Constructor that accepts required fields. */ + TrackedFileStruct( + Tracking tracking, + FileContent contentType, + String location, + FileFormat fileFormat, + long recordCount, + long fileSizeInBytes) { + super(BASE_TYPE.fields().size()); + this.tracking = tracking; + this.contentType = contentType; + this.location = location; + this.fileFormat = fileFormat; + this.recordCount = recordCount; + this.fileSizeInBytes = fileSizeInBytes; + } + + /** Copy constructor. */ + private TrackedFileStruct(TrackedFileStruct toCopy, boolean withStats, Set statsIds) { + super(toCopy); + this.contentType = toCopy.contentType; + this.location = toCopy.location; + this.fileFormat = toCopy.fileFormat; + this.recordCount = toCopy.recordCount; + this.fileSizeInBytes = toCopy.fileSizeInBytes; + this.specId = toCopy.specId; + + this.tracking = toCopy.tracking != null ? toCopy.tracking.copy() : null; + + this.sortOrderId = toCopy.sortOrderId; + this.deletionVector = toCopy.deletionVector != null ? toCopy.deletionVector.copy() : null; + + if (withStats && toCopy.contentStats != null) { + ContentStats filtered = BaseContentStats.buildFrom(toCopy.contentStats, statsIds).build(); + this.contentStats = filtered.fieldStats().isEmpty() ? null : filtered; + } else { + this.contentStats = null; + } + + this.manifestInfo = toCopy.manifestInfo != null ? toCopy.manifestInfo.copy() : null; + this.keyMetadata = + toCopy.keyMetadata != null + ? Arrays.copyOf(toCopy.keyMetadata, toCopy.keyMetadata.length) + : null; + this.splitOffsets = + toCopy.splitOffsets != null + ? Arrays.copyOf(toCopy.splitOffsets, toCopy.splitOffsets.length) + : null; + this.equalityIds = + toCopy.equalityIds != null + ? 
Arrays.copyOf(toCopy.equalityIds, toCopy.equalityIds.length) + : null; + } + + @Override + public Tracking tracking() { + return tracking; + } + + @Override + public FileContent contentType() { + return contentType; + } + + @Override + public String location() { + return location; + } + + @Override + public FileFormat fileFormat() { + return fileFormat; + } + + @Override + public long recordCount() { + return recordCount; + } + + @Override + public long fileSizeInBytes() { + return fileSizeInBytes; + } + + @Override + public Integer specId() { + return specId; + } + + @Override + public ContentStats contentStats() { + return contentStats; + } + + @Override + public Integer sortOrderId() { + return sortOrderId; + } + + @Override + public DeletionVector deletionVector() { + return deletionVector; + } + + @Override + public ManifestInfo manifestInfo() { + return manifestInfo; + } + + @Override + public ByteBuffer keyMetadata() { + return keyMetadata != null ? ByteBuffer.wrap(keyMetadata) : null; + } + + @Override + public List splitOffsets() { + return splitOffsets != null ? ArrayUtil.toUnmodifiableLongList(splitOffsets) : null; + } + + @Override + public List equalityIds() { + return equalityIds != null ? ArrayUtil.toUnmodifiableIntList(equalityIds) : null; + } + + @Override + public TrackedFile copy() { + return new TrackedFileStruct(this, true, null); + } + + @Override + public TrackedFile copyWithStats(Set requestedColumnIds) { + return new TrackedFileStruct(this, true, requestedColumnIds); + } + + @Override + protected T internalGet(int pos, Class javaClass) { + return javaClass.cast(getByPos(pos)); + } + + private Object getByPos(int pos) { + switch (pos) { + case 0: + return tracking; + case 1: + return contentType != null ? contentType.id() : null; + case 2: + return location; + case 3: + return fileFormat != null ? 
fileFormat.toString() : null; + case 4: + return recordCount; + case 5: + return fileSizeInBytes; + case 6: + return specId; + case 7: + return contentStats; + case 8: + return sortOrderId; + case 9: + return deletionVector; + case 10: + return manifestInfo; + case 11: + return keyMetadata(); + case 12: + return splitOffsets(); + case 13: + return equalityIds(); + default: + throw new UnsupportedOperationException("Unknown field ordinal: " + pos); + } + } + + @Override + protected void internalSet(int pos, T value) { + switch (pos) { + case 0: + this.tracking = (Tracking) value; + break; + case 1: + this.contentType = FileContent.fromId((Integer) value); + break; + case 2: + // always coerce to String for Serializable + this.location = value.toString(); + break; + case 3: + this.fileFormat = FileFormat.fromString(value.toString()); + break; + case 4: + this.recordCount = (Long) value; + break; + case 5: + this.fileSizeInBytes = (Long) value; + break; + case 6: + this.specId = (Integer) value; + break; + case 7: + this.contentStats = (ContentStats) value; + break; + case 8: + this.sortOrderId = (Integer) value; + break; + case 9: + this.deletionVector = (DeletionVector) value; + break; + case 10: + this.manifestInfo = (ManifestInfo) value; + break; + case 11: + this.keyMetadata = ByteBuffers.toByteArray((ByteBuffer) value); + break; + case 12: + this.splitOffsets = ArrayUtil.toLongArray((List) value); + break; + case 13: + this.equalityIds = ArrayUtil.toIntArray((List) value); + break; + default: + // ignore the object, it must be from a newer version of the format + } + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("content", contentType != null ? 
contentType.lowerCaseName() : null) + .add("location", location) + .add("file_format", fileFormat) + .add("record_count", recordCount) + .add("file_size_in_bytes", fileSizeInBytes) + .add("spec_id", specId()) + .add("tracking", tracking) + .add("content_stats", contentStats) + .add("sort_order_id", sortOrderId) + .add("deletion_vector", deletionVector) + .add("manifest_info", manifestInfo) + .add("key_metadata", keyMetadata == null ? "null" : "(redacted)") + .add("split_offsets", splitOffsets == null ? "null" : splitOffsets()) + .add("equality_ids", equalityIds == null ? "null" : equalityIds()) + .toString(); + } +} diff --git a/core/src/main/java/org/apache/iceberg/Tracking.java b/core/src/main/java/org/apache/iceberg/Tracking.java new file mode 100644 index 000000000000..8003ed82ea9c --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/Tracking.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import java.nio.ByteBuffer; +import org.apache.iceberg.types.Types; + +/** Tracking information for a manifest entry. 
*/ +interface Tracking { + Types.NestedField STATUS = + Types.NestedField.required( + 0, + "status", + Types.IntegerType.get(), + "Entry status: 0=existing, 1=added, 2=deleted, 3=replaced"); + Types.NestedField SNAPSHOT_ID = + Types.NestedField.optional( + 1, + "snapshot_id", + Types.LongType.get(), + "Snapshot ID where the file was added or deleted"); + Types.NestedField SEQUENCE_NUMBER = + Types.NestedField.optional( + 3, "sequence_number", Types.LongType.get(), "Data sequence number of the file"); + Types.NestedField FILE_SEQUENCE_NUMBER = + Types.NestedField.optional( + 4, + "file_sequence_number", + Types.LongType.get(), + "File sequence number indicating when the file was added"); + Types.NestedField DV_SNAPSHOT_ID = + Types.NestedField.optional( + 5, + "dv_snapshot_id", + Types.LongType.get(), + "Snapshot ID where the DV was added; null if there is no DV"); + Types.NestedField FIRST_ROW_ID = + Types.NestedField.optional( + 142, "first_row_id", Types.LongType.get(), "ID of the first row in the data file"); + Types.NestedField DELETED_POSITIONS = + Types.NestedField.optional( + 6, + "deleted_positions", + Types.BinaryType.get(), + "Bitmap of positions deleted in this snapshot"); + Types.NestedField REPLACED_POSITIONS = + Types.NestedField.optional( + 7, + "replaced_positions", + Types.BinaryType.get(), + "Bitmap of positions replaced in this snapshot"); + + static Types.StructType schema() { + return Types.StructType.of( + STATUS, + SNAPSHOT_ID, + SEQUENCE_NUMBER, + FILE_SEQUENCE_NUMBER, + DV_SNAPSHOT_ID, + FIRST_ROW_ID, + DELETED_POSITIONS, + REPLACED_POSITIONS); + } + + /** Returns the status of the entry. */ + EntryStatus status(); + + /** Returns whether this entry is live. */ + default boolean isLive() { + return status() == EntryStatus.ADDED || status() == EntryStatus.EXISTING; + } + + /** Returns the snapshot ID where the file was added or deleted. */ + Long snapshotId(); + + /** Returns the data sequence number of the file. 
*/ + Long dataSequenceNumber(); + + /** Returns the file sequence number indicating when the file was added. */ + Long fileSequenceNumber(); + + /** Returns the snapshot ID where the DV was added; null if there is no DV. */ + Long dvSnapshotId(); + + /** Returns the ID of the first row in the data file. */ + Long firstRowId(); + + /** Returns the bitmap of positions deleted in this snapshot. */ + ByteBuffer deletedPositions(); + + /** Returns the bitmap of positions replaced in this snapshot. */ + ByteBuffer replacedPositions(); + + /** Returns the manifest location this entry was read from, or null. */ + String manifestLocation(); + + /** Returns the ordinal position of this entry within the manifest. */ + long manifestPos(); + + /** Copies this tracking information. */ + Tracking copy(); +} diff --git a/core/src/main/java/org/apache/iceberg/TrackingStruct.java b/core/src/main/java/org/apache/iceberg/TrackingStruct.java new file mode 100644 index 000000000000..65513c8d4a7c --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/TrackingStruct.java @@ -0,0 +1,343 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.Objects; +import org.apache.iceberg.avro.SupportsIndexProjection; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ByteBuffers; + +/** Mutable {@link StructLike} implementation of {@link Tracking}. */ +class TrackingStruct extends SupportsIndexProjection implements Tracking, Serializable { + private static final Types.StructType BASE_TYPE = + Types.StructType.of( + Tracking.STATUS, + Tracking.SNAPSHOT_ID, + Tracking.SEQUENCE_NUMBER, + Tracking.FILE_SEQUENCE_NUMBER, + Tracking.DV_SNAPSHOT_ID, + Tracking.FIRST_ROW_ID, + Tracking.DELETED_POSITIONS, + Tracking.REPLACED_POSITIONS, + MetadataColumns.ROW_POSITION); + + private EntryStatus status = null; + private Long snapshotId = null; + private Long dataSequenceNumber = null; + private Long fileSequenceNumber = null; + private Long dvSnapshotId = null; + private Long firstRowId = null; + private byte[] deletedPositions = null; + private byte[] replacedPositions = null; + + // set by manifest readers, not written to manifests + private String manifestLocation = null; + private long manifestPos = -1L; + + TrackingStruct(Types.StructType type) { + super(BASE_TYPE, type); + } + + private TrackingStruct(TrackingStruct toCopy) { + super(toCopy); + this.status = toCopy.status; + this.snapshotId = toCopy.snapshotId; + this.dataSequenceNumber = toCopy.dataSequenceNumber; + this.fileSequenceNumber = toCopy.fileSequenceNumber; + this.dvSnapshotId = toCopy.dvSnapshotId; + this.firstRowId = toCopy.firstRowId; + this.deletedPositions = + toCopy.deletedPositions != null + ? Arrays.copyOf(toCopy.deletedPositions, toCopy.deletedPositions.length) + : null; + this.replacedPositions = + toCopy.replacedPositions != null + ? 
Arrays.copyOf(toCopy.replacedPositions, toCopy.replacedPositions.length) + : null; + this.manifestLocation = toCopy.manifestLocation; + this.manifestPos = toCopy.manifestPos; + } + + private TrackingStruct( + EntryStatus status, + Long snapshotId, + Long dataSequenceNumber, + Long fileSequenceNumber, + Long dvSnapshotId, + Long firstRowId, + byte[] deletedPositions, + byte[] replacedPositions) { + super(BASE_TYPE, BASE_TYPE); + this.status = status; + this.snapshotId = snapshotId; + this.dataSequenceNumber = dataSequenceNumber; + this.fileSequenceNumber = fileSequenceNumber; + this.dvSnapshotId = dvSnapshotId; + this.firstRowId = firstRowId; + this.deletedPositions = deletedPositions; + this.replacedPositions = replacedPositions; + } + + void inheritFrom(Tracking manifestTracking) { + if (manifestTracking != null) { + if (snapshotId == null) { + this.snapshotId = manifestTracking.snapshotId(); + } + + // manifests do not distinguish between data and file sequence numbers + Preconditions.checkArgument( + Objects.equals( + manifestTracking.dataSequenceNumber(), manifestTracking.fileSequenceNumber()), + "Manifest data and file sequence numbers must be equal, got %s and %s", + manifestTracking.dataSequenceNumber(), + manifestTracking.fileSequenceNumber()); + + if (status == EntryStatus.ADDED) { + if (dataSequenceNumber == null) { + this.dataSequenceNumber = manifestTracking.fileSequenceNumber(); + } + + if (fileSequenceNumber == null) { + this.fileSequenceNumber = manifestTracking.fileSequenceNumber(); + } + } + } + } + + void setManifestLocation(String location) { + this.manifestLocation = location; + } + + @Override + public EntryStatus status() { + return status; + } + + @Override + public Long snapshotId() { + return snapshotId; + } + + @Override + public Long dataSequenceNumber() { + return dataSequenceNumber; + } + + @Override + public Long fileSequenceNumber() { + return fileSequenceNumber; + } + + @Override + public Long dvSnapshotId() { + return dvSnapshotId; 
+ } + + @Override + public Long firstRowId() { + return firstRowId; + } + + @Override + public ByteBuffer deletedPositions() { + return deletedPositions != null ? ByteBuffer.wrap(deletedPositions) : null; + } + + @Override + public ByteBuffer replacedPositions() { + return replacedPositions != null ? ByteBuffer.wrap(replacedPositions) : null; + } + + @Override + public String manifestLocation() { + return manifestLocation; + } + + @Override + public long manifestPos() { + return manifestPos; + } + + @Override + public TrackingStruct copy() { + return new TrackingStruct(this); + } + + @Override + protected T internalGet(int pos, Class javaClass) { + return javaClass.cast(getByPos(pos)); + } + + private Object getByPos(int pos) { + switch (pos) { + case 0: + return status != null ? status.id() : null; + case 1: + return snapshotId(); + case 2: + return dataSequenceNumber(); + case 3: + return fileSequenceNumber(); + case 4: + return dvSnapshotId; + case 5: + return firstRowId; + case 6: + return deletedPositions(); + case 7: + return replacedPositions(); + case 8: + return manifestPos; + default: + throw new UnsupportedOperationException("Unknown field ordinal: " + pos); + } + } + + @Override + protected void internalSet(int pos, T value) { + switch (pos) { + case 0: + this.status = EntryStatus.fromId((Integer) value); + break; + case 1: + this.snapshotId = (Long) value; + break; + case 2: + this.dataSequenceNumber = (Long) value; + break; + case 3: + this.fileSequenceNumber = (Long) value; + break; + case 4: + this.dvSnapshotId = (Long) value; + break; + case 5: + this.firstRowId = (Long) value; + break; + case 6: + this.deletedPositions = ByteBuffers.toByteArray((ByteBuffer) value); + break; + case 7: + this.replacedPositions = ByteBuffers.toByteArray((ByteBuffer) value); + break; + case 8: + this.manifestPos = (long) value; + break; + default: + // ignore the object, it must be from a newer version of the format + } + } + + static Builder builder() { + return new 
Builder(); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("status", status) + .add("snapshot_id", snapshotId == null ? "null" : snapshotId) + .add("data_sequence_number", dataSequenceNumber == null ? "null" : dataSequenceNumber) + .add("file_sequence_number", fileSequenceNumber == null ? "null" : fileSequenceNumber) + .add("dv_snapshot_id", dvSnapshotId == null ? "null" : dvSnapshotId) + .add("first_row_id", firstRowId == null ? "null" : firstRowId) + .add("deleted_positions", deletedPositions == null ? "null" : "(binary)") + .add("replaced_positions", replacedPositions == null ? "null" : "(binary)") + .toString(); + } + + static class Builder { + private EntryStatus status = null; + private Long snapshotId = null; + private Long dataSequenceNumber = null; + private Long fileSequenceNumber = null; + private Long dvSnapshotId = null; + private Long firstRowId = null; + private byte[] deletedPositions = null; + private byte[] replacedPositions = null; + + Builder status(EntryStatus entryStatus) { + this.status = entryStatus; + return this; + } + + Builder snapshotId(Long id) { + this.snapshotId = id; + return this; + } + + Builder dataSequenceNumber(Long sequenceNumber) { + this.dataSequenceNumber = sequenceNumber; + return this; + } + + Builder fileSequenceNumber(Long sequenceNumber) { + this.fileSequenceNumber = sequenceNumber; + return this; + } + + Builder dvSnapshotId(Long id) { + this.dvSnapshotId = id; + return this; + } + + Builder firstRowId(Long rowId) { + this.firstRowId = rowId; + return this; + } + + Builder deletedPositions(ByteBuffer positions) { + this.deletedPositions = positions != null ? ByteBuffers.toByteArray(positions) : null; + return this; + } + + Builder deletedPositions(byte[] positions) { + this.deletedPositions = positions; + return this; + } + + Builder replacedPositions(ByteBuffer positions) { + this.replacedPositions = positions != null ? 
ByteBuffers.toByteArray(positions) : null; + return this; + } + + Builder replacedPositions(byte[] positions) { + this.replacedPositions = positions; + return this; + } + + TrackingStruct build() { + Preconditions.checkArgument(status != null, "Invalid status: null"); + return new TrackingStruct( + status, + snapshotId, + dataSequenceNumber, + fileSequenceNumber, + dvSnapshotId, + firstRowId, + deletedPositions, + replacedPositions); + } + } +} diff --git a/core/src/main/java/org/apache/iceberg/V2Metadata.java b/core/src/main/java/org/apache/iceberg/V2Metadata.java index 832e5c383fe5..803905f6b42e 100644 --- a/core/src/main/java/org/apache/iceberg/V2Metadata.java +++ b/core/src/main/java/org/apache/iceberg/V2Metadata.java @@ -93,6 +93,7 @@ private Object get(int pos) { case 2: return wrapped.partitionSpecId(); case 3: + checkContentType(wrapped.content()); return wrapped.content().id(); case 4: if (wrapped.sequenceNumber() == ManifestWriter.UNASSIGNED_SEQ) { @@ -428,6 +429,7 @@ public T get(int pos, Class javaClass) { private Object get(int pos) { switch (pos) { case 0: + checkContentType(wrapped.content()); return wrapped.content().id(); case 1: return wrapped.location(); @@ -589,4 +591,20 @@ public F copyWithoutStats() { throw new UnsupportedOperationException("Cannot copy IndexedDataFile wrapper"); } } + + private static void checkContentType(ManifestContent content) { + Preconditions.checkArgument( + content == ManifestContent.DATA || content == ManifestContent.DELETES, + "Unsupported manifest content type for v2: %s", + content); + } + + private static void checkContentType(FileContent content) { + Preconditions.checkArgument( + content == FileContent.DATA + || content == FileContent.POSITION_DELETES + || content == FileContent.EQUALITY_DELETES, + "Unsupported file content type for v2: %s", + content); + } } diff --git a/core/src/main/java/org/apache/iceberg/V3Metadata.java b/core/src/main/java/org/apache/iceberg/V3Metadata.java index 
8529d68501d4..4e67d9977e64 100644 --- a/core/src/main/java/org/apache/iceberg/V3Metadata.java +++ b/core/src/main/java/org/apache/iceberg/V3Metadata.java @@ -94,6 +94,7 @@ private Object get(int pos) { case 2: return wrapped.partitionSpecId(); case 3: + checkContentType(wrapped.content()); return wrapped.content().id(); case 4: if (wrapped.sequenceNumber() == ManifestWriter.UNASSIGNED_SEQ) { @@ -454,6 +455,7 @@ public T get(int pos, Class javaClass) { private Object get(int pos) { switch (pos) { case 0: + checkContentType(wrapped.content()); return wrapped.content().id(); case 1: return wrapped.location(); @@ -523,4 +525,20 @@ public Long pos() { return null; } } + + private static void checkContentType(ManifestContent content) { + Preconditions.checkArgument( + content == ManifestContent.DATA || content == ManifestContent.DELETES, + "Unsupported manifest content type for v3: %s", + content); + } + + private static void checkContentType(FileContent content) { + Preconditions.checkArgument( + content == FileContent.DATA + || content == FileContent.POSITION_DELETES + || content == FileContent.EQUALITY_DELETES, + "Unsupported file content type for v3: %s", + content); + } } diff --git a/core/src/main/java/org/apache/iceberg/deletes/RoaringPositionBitmap.java b/core/src/main/java/org/apache/iceberg/deletes/RoaringPositionBitmap.java index 3f7613c6ea58..037a38b114a6 100644 --- a/core/src/main/java/org/apache/iceberg/deletes/RoaringPositionBitmap.java +++ b/core/src/main/java/org/apache/iceberg/deletes/RoaringPositionBitmap.java @@ -79,14 +79,45 @@ public void set(long pos) { } /** - * Sets a range of positions in the bitmap. + * Sets a range of positions in the bitmap. If {@code posStartInclusive} equals {@code + * posEndExclusive}, this method does nothing. 
* * @param posStartInclusive the start position of the range (inclusive) * @param posEndExclusive the end position of the range (exclusive) + * @throws IllegalArgumentException if posStartInclusive > posEndExclusive */ public void setRange(long posStartInclusive, long posEndExclusive) { - for (long pos = posStartInclusive; pos < posEndExclusive; pos++) { - set(pos); + Preconditions.checkArgument( + posStartInclusive <= posEndExclusive, + "Start position must not exceed end position: [%s, %s)", + posStartInclusive, + posEndExclusive); + + if (posStartInclusive == posEndExclusive) { + return; + } + + validatePosition(posStartInclusive); + validatePosition(posEndExclusive - 1); + + int startKey = key(posStartInclusive); + int endKey = key(posEndExclusive - 1); + allocateBitmapsIfNeeded(endKey + 1); + + if (startKey == endKey) { + long lowStart = Integer.toUnsignedLong(pos32Bits(posStartInclusive)); + long lowEnd = Integer.toUnsignedLong(pos32Bits(posEndExclusive - 1)) + 1; + bitmaps[startKey].add(lowStart, lowEnd); + } else { + long firstLowStart = Integer.toUnsignedLong(pos32Bits(posStartInclusive)); + bitmaps[startKey].add(firstLowStart, 1L << 32); + + for (int key = startKey + 1; key < endKey; key++) { + bitmaps[key].add(0L, 1L << 32); + } + + long lastLowEnd = Integer.toUnsignedLong(pos32Bits(posEndExclusive - 1)) + 1; + bitmaps[endKey].add(0L, lastLowEnd); } } diff --git a/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java b/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java index 4a6b5a6cf40f..e1e93aa1fd07 100644 --- a/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java +++ b/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java @@ -170,6 +170,11 @@ public static FileWriterBuilder, S> equalityDelet * records that identify rows to be deleted by file path and position, producing a {@link * DeleteFile} that can be used for table operations. * + *

Note: This method is only applicable to format-version 2 tables. Format-version 3 + * tables use deletion vectors, which are always written in Puffin format. Registered {@link + * FormatModel} implementations for {@link PositionDelete} are not consulted for format-version 3+ + * tables. + * * @param format the file format used for writing * @param outputFile destination for the written data * @return a configured delete write builder for creating a {@link PositionDeleteWriter} diff --git a/core/src/main/java/org/apache/iceberg/inmemory/InMemoryCatalog.java b/core/src/main/java/org/apache/iceberg/inmemory/InMemoryCatalog.java index 975b5a39dfe3..2234d418de40 100644 --- a/core/src/main/java/org/apache/iceberg/inmemory/InMemoryCatalog.java +++ b/core/src/main/java/org/apache/iceberg/inmemory/InMemoryCatalog.java @@ -48,6 +48,8 @@ import org.apache.iceberg.relocated.com.google.common.base.Objects; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.LocationUtil; +import org.apache.iceberg.util.PropertyUtil; import org.apache.iceberg.view.BaseMetastoreViewCatalog; import org.apache.iceberg.view.BaseViewOperations; import org.apache.iceberg.view.ViewMetadata; @@ -71,6 +73,7 @@ public class InMemoryCatalog extends BaseMetastoreViewCatalog private String catalogName; private String warehouseLocation; private CloseableGroup closeableGroup; + private boolean uniqueTableLocation; private Map catalogProperties; public InMemoryCatalog() { @@ -88,6 +91,11 @@ public String name() { public void initialize(String name, Map properties) { this.catalogName = name != null ? 
name : InMemoryCatalog.class.getSimpleName(); this.catalogProperties = ImmutableMap.copyOf(properties); + this.uniqueTableLocation = + PropertyUtil.propertyAsBoolean( + properties, + CatalogProperties.UNIQUE_TABLE_LOCATION, + CatalogProperties.UNIQUE_TABLE_LOCATION_DEFAULT); String warehouse = properties.getOrDefault(CatalogProperties.WAREHOUSE_LOCATION, ""); this.warehouseLocation = warehouse.replaceAll("/*$", ""); @@ -104,8 +112,8 @@ protected TableOperations newTableOps(TableIdentifier tableIdentifier) { @Override protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { - return SLASH.join( - defaultNamespaceLocation(tableIdentifier.namespace()), tableIdentifier.name()); + String tableLocation = LocationUtil.tableLocation(tableIdentifier, uniqueTableLocation); + return SLASH.join(defaultNamespaceLocation(tableIdentifier.namespace()), tableLocation); } private String defaultNamespaceLocation(Namespace namespace) { @@ -211,6 +219,13 @@ public boolean dropNamespace(Namespace namespace) throws NamespaceNotEmptyExcept return false; } + List childNamespaces = listNamespaces(namespace); + if (!childNamespaces.isEmpty()) { + throw new NamespaceNotEmptyException( + "Namespace %s is not empty. Contains %d child namespace(s).", + namespace, childNamespaces.size()); + } + List tableIdentifiers = listTables(namespace); if (!tableIdentifiers.isEmpty()) { throw new NamespaceNotEmptyException( diff --git a/core/src/main/java/org/apache/iceberg/io/BufferedFileAppender.java b/core/src/main/java/org/apache/iceberg/io/BufferedFileAppender.java new file mode 100644 index 000000000000..8f8ef8f33b76 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/io/BufferedFileAppender.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.io; + +import java.io.IOException; +import java.util.List; +import java.util.function.Function; +import java.util.function.UnaryOperator; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +/** + * A FileAppender that buffers the first N rows, then creates a delegate appender via a factory. + * + *

The factory receives the buffered rows and is responsible for creating the real appender. Row + * replay is handled internally. All subsequent {@link #add} calls delegate directly to the real + * appender. + * + *

If fewer than {@code bufferSize} rows are written before close, the factory is called with + * whatever rows were buffered. If no rows were written, the factory is not called and no file is + * created on disk. In this case, {@link #metrics()} returns {@code new Metrics(0L)} and {@link + * #length()} returns {@code 0L}. + * + * @param the row type + */ +public class BufferedFileAppender implements FileAppender { + private final int bufferRowCount; + private final Function, FileAppender> appenderFactory; + private final UnaryOperator copyFunc; + private List buffer; + private FileAppender delegate; + private boolean closed = false; + + /** + * @param bufferRowCount number of rows to buffer before creating the delegate appender + * @param appenderFactory given the buffered rows, creates the delegate appender + */ + public BufferedFileAppender( + int bufferRowCount, Function, FileAppender> appenderFactory) { + this(bufferRowCount, appenderFactory, UnaryOperator.identity()); + } + + /** + * @param bufferRowCount number of rows to buffer before creating the delegate appender + * @param appenderFactory given the buffered rows, creates the delegate appender + * @param copyFunc copies a row before buffering (needed when row objects are reused, e.g. 
Spark + * InternalRow) + */ + public BufferedFileAppender( + int bufferRowCount, + Function, FileAppender> appenderFactory, + UnaryOperator copyFunc) { + Preconditions.checkArgument( + bufferRowCount > 0, "bufferRowCount must be > 0, got %s", bufferRowCount); + Preconditions.checkNotNull(appenderFactory, "appenderFactory must not be null"); + Preconditions.checkNotNull(copyFunc, "copyFunc must not be null"); + this.bufferRowCount = bufferRowCount; + this.appenderFactory = appenderFactory; + this.copyFunc = copyFunc; + this.buffer = Lists.newArrayListWithCapacity(bufferRowCount); + } + + @Override + public void add(D datum) { + Preconditions.checkState(!closed, "Cannot add to a closed appender"); + if (delegate != null) { + delegate.add(datum); + } else { + buffer.add(copyFunc.apply(datum)); + if (buffer.size() >= bufferRowCount) { + initialize(); + } + } + } + + @Override + public Metrics metrics() { + Preconditions.checkState(closed, "Cannot return metrics for unclosed appender"); + if (delegate == null) { + return new Metrics(0L); + } + + return delegate.metrics(); + } + + @Override + public long length() { + if (delegate != null) { + return delegate.length(); + } + + // No bytes written to disk yet; data is buffered in memory + return 0L; + } + + @Override + public List splitOffsets() { + if (delegate != null) { + return delegate.splitOffsets(); + } + + return null; + } + + @Override + public void close() throws IOException { + if (!closed) { + try { + if (delegate == null && buffer != null && !buffer.isEmpty()) { + initialize(); + } + + if (delegate != null) { + delegate.close(); + } + } finally { + this.closed = true; + this.buffer = null; + } + } + } + + private void initialize() { + delegate = appenderFactory.apply(buffer); + Preconditions.checkState(delegate != null, "appenderFactory must not return null"); + try { + buffer.forEach(delegate::add); + } finally { + buffer = null; + } + } +} diff --git 
a/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java b/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java index 55c00319a0cc..2d24e5598ac7 100644 --- a/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java +++ b/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java @@ -88,6 +88,7 @@ public class JdbcCatalog extends BaseMetastoreViewCatalog private Object conf; private JdbcClientPool connections; private Map catalogProperties; + private boolean uniqueTableLocation; private final Function, FileIO> ioBuilder; private final Function, JdbcClientPool> clientPoolBuilder; private boolean initializeCatalogTables; @@ -120,6 +121,11 @@ public void initialize(String name, Map properties) { this.warehouseLocation = LocationUtil.stripTrailingSlash(inputWarehouseLocation); this.catalogProperties = ImmutableMap.copyOf(properties); + this.uniqueTableLocation = + PropertyUtil.propertyAsBoolean( + properties, + CatalogProperties.UNIQUE_TABLE_LOCATION, + CatalogProperties.UNIQUE_TABLE_LOCATION_DEFAULT); if (name != null) { this.catalogName = name; @@ -287,7 +293,8 @@ protected ViewOperations newViewOps(TableIdentifier viewIdentifier) { @Override protected String defaultWarehouseLocation(TableIdentifier table) { - return SLASH.join(defaultNamespaceLocation(table.namespace()), table.name()); + String tableLocation = LocationUtil.tableLocation(table, uniqueTableLocation); + return SLASH.join(defaultNamespaceLocation(table.namespace()), tableLocation); } @Override @@ -536,6 +543,13 @@ public boolean dropNamespace(Namespace namespace) throws NamespaceNotEmptyExcept return false; } + List childNamespaces = listNamespaces(namespace); + if (childNamespaces != null && !childNamespaces.isEmpty()) { + throw new NamespaceNotEmptyException( + "Namespace %s is not empty. 
Contains %d child namespace(s).", + namespace, childNamespaces.size()); + } + List tableIdentifiers = listTables(namespace); if (tableIdentifiers != null && !tableIdentifiers.isEmpty()) { throw new NamespaceNotEmptyException( diff --git a/core/src/main/java/org/apache/iceberg/rest/Endpoint.java b/core/src/main/java/org/apache/iceberg/rest/Endpoint.java index c2369a0fa57d..d56a14d18954 100644 --- a/core/src/main/java/org/apache/iceberg/rest/Endpoint.java +++ b/core/src/main/java/org/apache/iceberg/rest/Endpoint.java @@ -66,6 +66,8 @@ public class Endpoint { Endpoint.create("POST", ResourcePaths.V1_TABLE_METRICS); public static final Endpoint V1_TABLE_CREDENTIALS = Endpoint.create("GET", ResourcePaths.V1_TABLE_CREDENTIALS); + public static final Endpoint V1_TABLE_REMOTE_SIGN = + Endpoint.create("POST", ResourcePaths.V1_TABLE_REMOTE_SIGN); // table scan plan endpoints public static final Endpoint V1_SUBMIT_TABLE_SCAN_PLAN = diff --git a/core/src/main/java/org/apache/iceberg/rest/ErrorHandlers.java b/core/src/main/java/org/apache/iceberg/rest/ErrorHandlers.java index 791eb732bb7c..334bfde8abfc 100644 --- a/core/src/main/java/org/apache/iceberg/rest/ErrorHandlers.java +++ b/core/src/main/java/org/apache/iceberg/rest/ErrorHandlers.java @@ -30,6 +30,7 @@ import org.apache.iceberg.exceptions.NoSuchPlanTaskException; import org.apache.iceberg.exceptions.NoSuchTableException; import org.apache.iceberg.exceptions.NoSuchViewException; +import org.apache.iceberg.exceptions.NoSuchWarehouseException; import org.apache.iceberg.exceptions.NotAuthorizedException; import org.apache.iceberg.exceptions.NotFoundException; import org.apache.iceberg.exceptions.RESTException; @@ -92,6 +93,10 @@ public static Consumer defaultErrorHandler() { return DefaultErrorHandler.INSTANCE; } + public static Consumer configErrorHandler() { + return ConfigErrorHandler.INSTANCE; + } + public static Consumer oauthErrorHandler() { return OAuthErrorHandler.INSTANCE; } @@ -295,6 +300,20 @@ public void 
accept(ErrorResponse error) { } } + /** Request error handler for config endpoint. */ + private static class ConfigErrorHandler extends DefaultErrorHandler { + private static final ErrorHandler INSTANCE = new ConfigErrorHandler(); + + @Override + public void accept(ErrorResponse error) { + if (error.code() == 404 && error.type() != null) { + throw new NoSuchWarehouseException("%s", error.message()); + } + + super.accept(error); + } + } + /** * Request error handler that handles the common cases that are included with all responses, such * as 400, 500, etc. diff --git a/core/src/main/java/org/apache/iceberg/rest/HTTPClient.java b/core/src/main/java/org/apache/iceberg/rest/HTTPClient.java index 86eceba21c95..46d9177b9571 100644 --- a/core/src/main/java/org/apache/iceberg/rest/HTTPClient.java +++ b/core/src/main/java/org/apache/iceberg/rest/HTTPClient.java @@ -18,7 +18,6 @@ */ package org.apache.iceberg.rest; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; import java.io.IOException; @@ -29,6 +28,7 @@ import java.util.concurrent.ConcurrentMap; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; +import javax.net.ssl.HostnameVerifier; import org.apache.hc.client5.http.auth.AuthScope; import org.apache.hc.client5.http.auth.CredentialsProvider; import org.apache.hc.client5.http.auth.UsernamePasswordCredentials; @@ -43,6 +43,7 @@ import org.apache.hc.client5.http.io.HttpClientConnectionManager; import org.apache.hc.client5.http.protocol.HttpClientContext; import org.apache.hc.client5.http.ssl.DefaultClientTlsStrategy; +import org.apache.hc.client5.http.ssl.HostnameVerificationPolicy; import org.apache.hc.core5.http.ContentType; import org.apache.hc.core5.http.Header; import org.apache.hc.core5.http.HttpHeaders; @@ -340,32 +341,23 @@ protected T execute( return null; } - String responseBody = extractResponseBodyAsString(response); - if 
(!isSuccessful(response)) { // The provided error handler is expected to throw, but a RESTException is thrown if not. + String responseBody = extractResponseBodyAsString(response); throwFailure(response, responseBody, errorHandler); } - if (responseBody == null) { + if (response.getEntity() == null) { throw new RESTException( "Invalid (null) response body for request (expected %s): method=%s, path=%s, status=%d", responseType.getSimpleName(), req.method(), req.path(), response.getCode()); } - try { - ObjectReader reader = objectReaderCache.computeIfAbsent(responseType, mapper::readerFor); - if (parserContext != null && !parserContext.isEmpty()) { - reader = reader.with(parserContext.toInjectableValues()); - } - return reader.readValue(responseBody); - } catch (JsonProcessingException e) { - throw new RESTException( - e, - "Received a success response code of %d, but failed to parse response body into %s", - response.getCode(), - responseType.getSimpleName()); + ObjectReader reader = objectReaderCache.computeIfAbsent(responseType, mapper::readerFor); + if (parserContext != null && !parserContext.isEmpty()) { + reader = reader.with(parserContext.toInjectableValues()); } + return reader.readValue(response.getEntity().getContent()); } catch (IOException e) { throw new RESTException(e, "Error occurred while processing %s request", req.method()); } @@ -410,13 +402,19 @@ static HttpClientConnectionManager configureConnectionManager(Map()) .addDeserializer(LoadCredentialsResponse.class, new LoadCredentialsResponseDeserializer<>()) .addDeserializer( - ImmutableLoadCredentialsResponse.class, new LoadCredentialsResponseDeserializer<>()); + ImmutableLoadCredentialsResponse.class, new LoadCredentialsResponseDeserializer<>()) + .addSerializer(RemoteSignRequest.class, new RemoteSignRequestSerializer<>()) + .addSerializer(ImmutableRemoteSignRequest.class, new RemoteSignRequestSerializer<>()) + .addDeserializer(RemoteSignRequest.class, new RemoteSignRequestDeserializer<>()) + 
.addDeserializer(ImmutableRemoteSignRequest.class, new RemoteSignRequestDeserializer<>()) + .addSerializer(RemoteSignResponse.class, new RemoteSignResponseSerializer<>()) + .addSerializer(ImmutableRemoteSignResponse.class, new RemoteSignResponseSerializer<>()) + .addDeserializer(RemoteSignResponse.class, new RemoteSignResponseDeserializer<>()) + .addDeserializer(ImmutableRemoteSignResponse.class, new RemoteSignResponseDeserializer<>()); mapper.registerModule(module); } @@ -650,4 +664,39 @@ boolean isCaseSensitive() { return caseSensitive; } } + + static class RemoteSignRequestSerializer extends JsonSerializer { + @Override + public void serialize(T request, JsonGenerator gen, SerializerProvider serializers) + throws IOException { + RemoteSignRequestParser.toJson(request, gen); + } + } + + static class RemoteSignRequestDeserializer + extends JsonDeserializer { + @Override + public T deserialize(JsonParser p, DeserializationContext context) throws IOException { + JsonNode jsonNode = p.getCodec().readTree(p); + return (T) RemoteSignRequestParser.fromJson(jsonNode); + } + } + + static class RemoteSignResponseSerializer + extends JsonSerializer { + @Override + public void serialize(T response, JsonGenerator gen, SerializerProvider serializers) + throws IOException { + RemoteSignResponseParser.toJson(response, gen); + } + } + + static class RemoteSignResponseDeserializer + extends JsonDeserializer { + @Override + public T deserialize(JsonParser p, DeserializationContext context) throws IOException { + JsonNode jsonNode = p.getCodec().readTree(p); + return (T) RemoteSignResponseParser.fromJson(jsonNode); + } + } } diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java b/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java index cbdf17a8ebbe..ec30d9de897e 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java @@ -265,7 +265,7 @@ public 
void initialize(String name, Map unresolved) { PropertyUtil.propertyAsString( mergedProps, RESTCatalogProperties.SNAPSHOT_LOADING_MODE, - RESTCatalogProperties.SNAPSHOT_LOADING_MODE_DEFAULT) + RESTCatalogProperties.SNAPSHOT_LOADING_MODE_DEFAULT.name()) .toUpperCase(Locale.US)); this.reporter = CatalogUtil.loadMetricsReporter(mergedProps); @@ -279,7 +279,7 @@ public void initialize(String name, Map unresolved) { PropertyUtil.propertyAsString( mergedProps, RESTCatalogProperties.NAMESPACE_SEPARATOR, - RESTUtil.NAMESPACE_SEPARATOR_URLENCODED_UTF_8); + RESTCatalogProperties.NAMESPACE_SEPARATOR_DEFAULT); this.tableCache = createTableCache(mergedProps); this.closeables.addCloseable(this.tableCache); @@ -615,7 +615,7 @@ private RESTTable restTableForScanPlanning( RESTCatalogProperties.ScanPlanningMode effectiveMode = effectiveModeConfig != null ? RESTCatalogProperties.ScanPlanningMode.fromString(effectiveModeConfig) - : RESTCatalogProperties.ScanPlanningMode.CLIENT; + : RESTCatalogProperties.SCAN_PLANNING_MODE_DEFAULT; if (effectiveMode == RESTCatalogProperties.ScanPlanningMode.SERVER) { Preconditions.checkState( @@ -1338,7 +1338,7 @@ private static ConfigResponse fetchConfig( queryParams.build(), ConfigResponse.class, RESTUtil.configHeaders(properties), - ErrorHandlers.defaultErrorHandler()); + ErrorHandlers.configErrorHandler()); configResponse.validate(); return configResponse; } diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java b/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java index 74fe9ebd7d4e..9fa273ca169f 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java @@ -22,6 +22,7 @@ import com.github.benmanes.caffeine.cache.Caffeine; import com.github.benmanes.caffeine.cache.RemovalListener; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicReference; @@ -47,9 
+48,11 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.rest.credentials.Credential; import org.apache.iceberg.rest.requests.PlanTableScanRequest; +import org.apache.iceberg.rest.responses.ErrorResponse; import org.apache.iceberg.rest.responses.FetchPlanningResultResponse; import org.apache.iceberg.rest.responses.PlanTableScanResponse; import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.PropertyUtil; import org.apache.iceberg.util.Tasks; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -59,7 +62,6 @@ class RESTTableScan extends DataTableScan { private static final long MIN_SLEEP_MS = 1000; // Initial delay private static final long MAX_SLEEP_MS = 60 * 1000; // Max backoff delay (1 minute) private static final int MAX_RETRIES = 10; // Max number of poll retries - private static final long MAX_WAIT_TIME_MS = 5 * 60 * 1000; // Total maximum duration (5 minutes) private static final double SCALE_FACTOR = 2.0; // Exponential scale factor private static final String DEFAULT_FILE_IO_IMPL = "org.apache.iceberg.io.ResolvingFileIO"; private static final Cache FILEIO_TRACKER = @@ -217,11 +219,7 @@ private CloseableIterable planTableScan(PlanTableScanRequest planT Endpoint.check(supportedEndpoints, Endpoint.V1_FETCH_TABLE_SCAN_PLAN); return fetchPlanningResult(); case FAILED: - throw new IllegalStateException( - String.format("Received status: %s for planId: %s", PlanStatus.FAILED, planId)); - case CANCELLED: - throw new IllegalStateException( - String.format("Received status: %s for planId: %s", PlanStatus.CANCELLED, planId)); + throw new IllegalStateException(failureMessage(planId, response.errorResponse())); default: throw new IllegalStateException( String.format("Invalid planStatus: %s for planId: %s", planStatus, planId)); @@ -249,38 +247,72 @@ private FileIO scanFileIO(List storageCredentials) { } private CloseableIterable fetchPlanningResult() { - AtomicReference result = new 
AtomicReference<>(); - Tasks.foreach(planId) - .exponentialBackoff(MIN_SLEEP_MS, MAX_SLEEP_MS, MAX_WAIT_TIME_MS, SCALE_FACTOR) - .retry(MAX_RETRIES) - .onlyRetryOn(NotCompleteException.class) - .onFailure( - (id, err) -> { - LOG.warn("Planning failed for plan ID: {}", id, err); - cleanupPlanResources(); - }) - .throwFailureWhenFinished() - .run( - id -> { - FetchPlanningResultResponse response = - client.get( - resourcePaths.plan(tableIdentifier, id), - headers, - FetchPlanningResultResponse.class, - headers, - ErrorHandlers.planErrorHandler(), - parserContext); + long maxWaitTimeMs = + PropertyUtil.propertyAsLong( + catalogProperties, + RESTCatalogProperties.REST_SCAN_PLANNING_POLL_TIMEOUT_MS, + RESTCatalogProperties.REST_SCAN_PLANNING_POLL_TIMEOUT_MS_DEFAULT); + Preconditions.checkArgument( + maxWaitTimeMs > 0, + "Invalid value for %s: %s (must be positive)", + RESTCatalogProperties.REST_SCAN_PLANNING_POLL_TIMEOUT_MS, + maxWaitTimeMs); - if (response.planStatus() == PlanStatus.SUBMITTED) { - throw new NotCompleteException(); - } else if (response.planStatus() != PlanStatus.COMPLETED) { - throw new IllegalStateException( - String.format( - "Invalid planStatus: %s for planId: %s", response.planStatus(), id)); - } + AtomicReference result = new AtomicReference<>(); + try { + Tasks.foreach(planId) + .exponentialBackoff(MIN_SLEEP_MS, MAX_SLEEP_MS, maxWaitTimeMs, SCALE_FACTOR) + .retry(MAX_RETRIES) + .onlyRetryOn(NotCompleteException.class) + .onFailure( + (id, err) -> { + LOG.warn("Planning failed for plan ID: {}", id, err); + cleanupPlanResources(); + }) + .throwFailureWhenFinished() + .run( + id -> { + FetchPlanningResultResponse response = + client.get( + resourcePaths.plan(tableIdentifier, id), + headers, + FetchPlanningResultResponse.class, + headers, + ErrorHandlers.planErrorHandler(), + parserContext); - result.set(response); - }); + switch (response.planStatus()) { + case COMPLETED: + result.set(response); + break; + case SUBMITTED: + throw new 
NotCompleteException(); + case FAILED: + throw new IllegalStateException(failureMessage(id, response.errorResponse())); + case CANCELLED: + throw new IllegalStateException( + String.format( + Locale.ROOT, "Remote scan planning cancelled for planId: %s", id)); + default: + throw new IllegalStateException( + String.format( + Locale.ROOT, + "Invalid planStatus: %s for planId: %s", + response.planStatus(), + id)); + } + }); + } catch (NotCompleteException e) { + throw new RemotePlanTimeoutException( + String.format( + Locale.ROOT, + "Remote scan planning for planId: %s did not complete within configured limits" + + " (timeout=%d ms, maxRetries=%d)", + planId, + maxWaitTimeMs, + MAX_RETRIES), + e); + } FetchPlanningResultResponse response = result.get(); @@ -290,6 +322,21 @@ private CloseableIterable fetchPlanningResult() { return scanTasksIterable(response.planTasks(), response.fileScanTasks()); } + private static String failureMessage(String planId, ErrorResponse error) { + // If a FAILED response lacks the expected error payload, still return a useful error + // message instead of throwing. + String type = error != null ? error.type() : "unknown"; + int code = error != null ? error.code() : 0; + String message = error != null ? error.message() : "unknown"; + return String.format( + Locale.ROOT, + "Remote scan planning failed for planId: %s: %s (code=%d): %s", + planId, + type, + code, + message); + } + private CloseableIterable scanTasksIterable( List planTasks, List fileScanTasks) { if (planTasks != null && !planTasks.isEmpty()) { diff --git a/core/src/main/java/org/apache/iceberg/rest/RemotePlanTimeoutException.java b/core/src/main/java/org/apache/iceberg/rest/RemotePlanTimeoutException.java new file mode 100644 index 000000000000..e0f01aadd612 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/rest/RemotePlanTimeoutException.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.rest; + +/** Thrown when server-side scan planning does not complete before the client deadline. */ +class RemotePlanTimeoutException extends RuntimeException { + RemotePlanTimeoutException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/core/src/main/java/org/apache/iceberg/rest/ResourcePaths.java b/core/src/main/java/org/apache/iceberg/rest/ResourcePaths.java index 0fc55c1a44d8..be2fde22053d 100644 --- a/core/src/main/java/org/apache/iceberg/rest/ResourcePaths.java +++ b/core/src/main/java/org/apache/iceberg/rest/ResourcePaths.java @@ -35,6 +35,8 @@ public class ResourcePaths { public static final String V1_TABLE = "/v1/{prefix}/namespaces/{namespace}/tables/{table}"; public static final String V1_TABLE_CREDENTIALS = "/v1/{prefix}/namespaces/{namespace}/tables/{table}/credentials"; + public static final String V1_TABLE_REMOTE_SIGN = + "/v1/{prefix}/namespaces/{namespace}/tables/{table}/sign"; public static final String V1_TABLE_REGISTER = "/v1/{prefix}/namespaces/{namespace}/register"; public static final String V1_TABLE_METRICS = "/v1/{prefix}/namespaces/{namespace}/tables/{table}/metrics"; @@ -57,7 +59,7 @@ public static ResourcePaths forCatalogProperties(Map properties) PropertyUtil.propertyAsString( 
properties, RESTCatalogProperties.NAMESPACE_SEPARATOR, - RESTUtil.NAMESPACE_SEPARATOR_URLENCODED_UTF_8)); + RESTCatalogProperties.NAMESPACE_SEPARATOR_DEFAULT)); } public static String config() { @@ -130,6 +132,17 @@ public String metrics(TableIdentifier identifier) { "metrics"); } + public String remoteSign(TableIdentifier identifier) { + return SLASH.join( + "v1", + prefix, + "namespaces", + pathEncode(identifier.namespace()), + "tables", + RESTUtil.encodeString(identifier.name()), + "sign"); + } + public String commitTransaction() { return SLASH.join("v1", prefix, "transactions", "commit"); } diff --git a/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java b/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java index c2b47e6e944f..7a244bff70f6 100644 --- a/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java +++ b/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java @@ -529,6 +529,7 @@ public Pair refresh(RESTClient client) { .from(config()) .token(response.token()) .tokenType(response.issuedTokenType()) + .expiresAtMillis(OAuth2Util.expiresAtMillis(response.token())) .build(); Map currentHeaders = this.headers; this.headers = RESTUtil.merge(currentHeaders, authHeaders(config.token())); @@ -618,6 +619,7 @@ public static AuthSession fromAccessToken( .from(parent.config()) .token(token) .tokenType(OAuth2Properties.ACCESS_TOKEN_TYPE) + .expiresAtMillis(OAuth2Util.expiresAtMillis(token)) .build()); long startTimeMillis = System.currentTimeMillis(); @@ -699,6 +701,7 @@ private static AuthSession fromTokenResponse( .token(response.token()) .tokenType(issuedTokenType) .credential(credential) + .expiresAtMillis(OAuth2Util.expiresAtMillis(response.token())) .build()); Long expiresAtMillis = session.expiresAtMillis(); diff --git a/core/src/main/java/org/apache/iceberg/rest/auth/TLSConfigurer.java b/core/src/main/java/org/apache/iceberg/rest/auth/TLSConfigurer.java index da22b5cb4c6a..4b3ed3a74c24 100644 --- 
a/core/src/main/java/org/apache/iceberg/rest/auth/TLSConfigurer.java +++ b/core/src/main/java/org/apache/iceberg/rest/auth/TLSConfigurer.java @@ -19,9 +19,9 @@ package org.apache.iceberg.rest.auth; import java.util.Map; +import javax.annotation.Nullable; import javax.net.ssl.HostnameVerifier; import javax.net.ssl.SSLContext; -import org.apache.hc.client5.http.ssl.HttpsSupport; import org.apache.hc.core5.ssl.SSLContexts; public interface TLSConfigurer { @@ -32,8 +32,16 @@ default SSLContext sslContext() { return SSLContexts.createDefault(); } + /** + * Returns a custom {@link HostnameVerifier} to use for hostname verification, or {@code null} to + * use the default JSSE built-in hostname verifier. + * + *

If a non-null verifier is returned, only the custom verifier is executed and the JSSE + * built-in hostname verifier won't be executed. + */ + @Nullable default HostnameVerifier hostnameVerifier() { - return HttpsSupport.getDefaultHostnameVerifier(); + return null; } default String[] supportedProtocols() { diff --git a/core/src/main/java/org/apache/iceberg/rest/requests/RemoteSignRequest.java b/core/src/main/java/org/apache/iceberg/rest/requests/RemoteSignRequest.java new file mode 100644 index 000000000000..561007c480eb --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/rest/requests/RemoteSignRequest.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.rest.requests; + +import java.net.URI; +import java.util.List; +import java.util.Map; +import javax.annotation.Nullable; +import org.apache.iceberg.rest.RESTRequest; +import org.immutables.value.Value; + +@Value.Immutable +public interface RemoteSignRequest extends RESTRequest { + String region(); + + String method(); + + URI uri(); + + Map> headers(); + + Map properties(); + + @Value.Default + @Nullable + default String body() { + return null; + } + + @Nullable + String provider(); + + @Override + default void validate() {} +} diff --git a/core/src/main/java/org/apache/iceberg/rest/requests/RemoteSignRequestParser.java b/core/src/main/java/org/apache/iceberg/rest/requests/RemoteSignRequestParser.java new file mode 100644 index 000000000000..61b44cc177d1 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/rest/requests/RemoteSignRequestParser.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.rest.requests; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.JsonNode; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.JsonUtil; + +public class RemoteSignRequestParser { + + private static final String REGION = "region"; + private static final String METHOD = "method"; + private static final String URI = "uri"; + private static final String HEADERS = "headers"; + private static final String PROPERTIES = "properties"; + private static final String BODY = "body"; + private static final String PROVIDER = "provider"; + + private RemoteSignRequestParser() {} + + public static String toJson(RemoteSignRequest request) { + return toJson(request, false); + } + + public static String toJson(RemoteSignRequest request, boolean pretty) { + return JsonUtil.generate(gen -> toJson(request, gen), pretty); + } + + public static void toJson(RemoteSignRequest request, JsonGenerator gen) throws IOException { + Preconditions.checkArgument(null != request, "Invalid remote sign request: null"); + + gen.writeStartObject(); + + gen.writeStringField(REGION, request.region()); + gen.writeStringField(METHOD, request.method()); + gen.writeStringField(URI, request.uri().toString()); + headersToJson(HEADERS, request.headers(), gen); + + if (!request.properties().isEmpty()) { + JsonUtil.writeStringMap(PROPERTIES, request.properties(), gen); + } + + if (request.body() != null && !request.body().isEmpty()) { + gen.writeStringField(BODY, request.body()); + } + + if (request.provider() != null) { + gen.writeStringField(PROVIDER, request.provider()); + } + + gen.writeEndObject(); + } + + public static RemoteSignRequest fromJson(String json) { + return 
JsonUtil.parse(json, RemoteSignRequestParser::fromJson); + } + + public static RemoteSignRequest fromJson(JsonNode json) { + Preconditions.checkArgument(null != json, "Cannot parse remote sign request from null object"); + Preconditions.checkArgument( + json.isObject(), "Cannot parse remote sign request from non-object: %s", json); + + String region = JsonUtil.getString(REGION, json); + String method = JsonUtil.getString(METHOD, json); + java.net.URI uri = java.net.URI.create(JsonUtil.getString(URI, json)); + Map> headers = headersFromJson(HEADERS, json); + + ImmutableRemoteSignRequest.Builder builder = + ImmutableRemoteSignRequest.builder() + .region(region) + .method(method) + .uri(uri) + .headers(headers); + + if (json.has(PROPERTIES)) { + builder.properties(JsonUtil.getStringMap(PROPERTIES, json)); + } + + if (json.has(BODY)) { + builder.body(JsonUtil.getString(BODY, json)); + } + + if (json.has(PROVIDER)) { + builder.provider(JsonUtil.getString(PROVIDER, json)); + } + + return builder.build(); + } + + public static void headersToJson( + String property, Map> headers, JsonGenerator gen) throws IOException { + gen.writeObjectFieldStart(property); + for (Entry> entry : headers.entrySet()) { + gen.writeFieldName(entry.getKey()); + + gen.writeStartArray(); + for (String val : entry.getValue()) { + gen.writeString(val); + } + gen.writeEndArray(); + } + gen.writeEndObject(); + } + + public static Map> headersFromJson(String property, JsonNode json) { + Map> headers = Maps.newHashMap(); + JsonNode headersNode = JsonUtil.get(property, json); + headersNode + .properties() + .forEach( + entry -> { + String key = entry.getKey(); + List values = Arrays.asList(JsonUtil.getStringArray(entry.getValue())); + headers.put(key, values); + }); + return headers; + } +} diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/ErrorResponseParser.java b/core/src/main/java/org/apache/iceberg/rest/responses/ErrorResponseParser.java index 31ad0573b107..1329c074ab29 100644 --- 
a/core/src/main/java/org/apache/iceberg/rest/responses/ErrorResponseParser.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/ErrorResponseParser.java @@ -46,9 +46,12 @@ public static String toJson(ErrorResponse errorResponse, boolean pretty) { public static void toJson(ErrorResponse errorResponse, JsonGenerator generator) throws IOException { generator.writeStartObject(); + writeError(errorResponse, generator); + generator.writeEndObject(); + } + static void writeError(ErrorResponse errorResponse, JsonGenerator generator) throws IOException { generator.writeObjectFieldStart(ERROR); - generator.writeStringField(MESSAGE, errorResponse.message()); generator.writeStringField(TYPE, errorResponse.type()); generator.writeNumberField(CODE, errorResponse.code()); @@ -57,8 +60,6 @@ public static void toJson(ErrorResponse errorResponse, JsonGenerator generator) } generator.writeEndObject(); - - generator.writeEndObject(); } /** diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/FetchPlanningResultResponse.java b/core/src/main/java/org/apache/iceberg/rest/responses/FetchPlanningResultResponse.java index 59db196244f5..2e176aac653f 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/FetchPlanningResultResponse.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/FetchPlanningResultResponse.java @@ -31,10 +31,12 @@ public class FetchPlanningResultResponse extends BaseScanTaskResponse { private final PlanStatus planStatus; + private final ErrorResponse errorResponse; private final List credentials; private FetchPlanningResultResponse( PlanStatus planStatus, + ErrorResponse errorResponse, List planTasks, List fileScanTasks, List deleteFiles, @@ -42,6 +44,7 @@ private FetchPlanningResultResponse( List credentials) { super(planTasks, fileScanTasks, deleteFiles, specsById); this.planStatus = planStatus; + this.errorResponse = errorResponse; this.credentials = credentials; validate(); } @@ -50,6 +53,10 @@ public PlanStatus 
planStatus() { return planStatus; } + public ErrorResponse errorResponse() { + return errorResponse; + } + public List credentials() { return credentials != null ? credentials : ImmutableList.of(); } @@ -64,6 +71,9 @@ public void validate() { Preconditions.checkArgument( planStatus() == PlanStatus.COMPLETED || (planTasks() == null && fileScanTasks() == null), "Invalid response: tasks can only be returned in a 'completed' status"); + Preconditions.checkArgument( + planStatus() == PlanStatus.FAILED || errorResponse() == null, + "Invalid response: error can only be returned in a 'failed' status"); if (fileScanTasks() == null || fileScanTasks().isEmpty()) { Preconditions.checkArgument( (deleteFiles() == null || deleteFiles().isEmpty()), @@ -76,6 +86,7 @@ public static class Builder private Builder() {} private PlanStatus planStatus; + private ErrorResponse errorResponse; private final List credentials = Lists.newArrayList(); public Builder withPlanStatus(PlanStatus status) { @@ -83,6 +94,11 @@ public Builder withPlanStatus(PlanStatus status) { return this; } + public Builder withErrorResponse(ErrorResponse response) { + this.errorResponse = response; + return this; + } + public Builder withCredentials(List credentialsToAdd) { credentials.addAll(credentialsToAdd); return this; @@ -91,7 +107,13 @@ public Builder withCredentials(List credentialsToAdd) { @Override public FetchPlanningResultResponse build() { return new FetchPlanningResultResponse( - planStatus, planTasks(), fileScanTasks(), deleteFiles(), specsById(), credentials); + planStatus, + errorResponse, + planTasks(), + fileScanTasks(), + deleteFiles(), + specsById(), + credentials); } } } diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/FetchPlanningResultResponseParser.java b/core/src/main/java/org/apache/iceberg/rest/responses/FetchPlanningResultResponseParser.java index 4a523d3c023b..aa74049ab9f0 100644 --- 
a/core/src/main/java/org/apache/iceberg/rest/responses/FetchPlanningResultResponseParser.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/FetchPlanningResultResponseParser.java @@ -38,6 +38,7 @@ public class FetchPlanningResultResponseParser { private static final String STATUS = "status"; private static final String PLAN_TASKS = "plan-tasks"; private static final String STORAGE_CREDENTIALS = "storage-credentials"; + private static final String ERROR = "error"; private FetchPlanningResultResponseParser() {} @@ -58,6 +59,11 @@ public static void toJson(FetchPlanningResultResponse response, JsonGenerator ge "Cannot serialize fileScanTasks in fetchingPlanningResultResponse without specsById"); gen.writeStartObject(); gen.writeStringField(STATUS, response.planStatus().status()); + + if (response.errorResponse() != null) { + ErrorResponseParser.writeError(response.errorResponse(), gen); + } + if (response.planTasks() != null) { JsonUtil.writeStringArray(PLAN_TASKS, response.planTasks(), gen); } @@ -90,6 +96,11 @@ public static FetchPlanningResultResponse fromJson( json != null && !json.isEmpty(), "Invalid fetchPlanningResult response: null or empty"); PlanStatus planStatus = PlanStatus.fromName(JsonUtil.getString(STATUS, json)); + ErrorResponse errorResponse = null; + if (json.has(ERROR) && json.get(ERROR).isObject()) { + errorResponse = ErrorResponseParser.fromJson(json); + } + List planTasks = JsonUtil.getStringListOrNull(PLAN_TASKS, json); List deleteFiles = TableScanResponseParser.parseDeleteFiles(json, specsById); List fileScanTasks = @@ -98,6 +109,7 @@ public static FetchPlanningResultResponse fromJson( FetchPlanningResultResponse.Builder builder = FetchPlanningResultResponse.builder() .withPlanStatus(planStatus) + .withErrorResponse(errorResponse) .withPlanTasks(planTasks) .withFileScanTasks(fileScanTasks) .withSpecsById(specsById); diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/PlanTableScanResponse.java 
b/core/src/main/java/org/apache/iceberg/rest/responses/PlanTableScanResponse.java index 1b4bb86e65eb..d0ac222c3052 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/PlanTableScanResponse.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/PlanTableScanResponse.java @@ -33,11 +33,13 @@ public class PlanTableScanResponse extends BaseScanTaskResponse { private final PlanStatus planStatus; private final String planId; + private final ErrorResponse errorResponse; private final List credentials; private PlanTableScanResponse( PlanStatus planStatus, String planId, + ErrorResponse errorResponse, List planTasks, List fileScanTasks, List deleteFiles, @@ -46,6 +48,7 @@ private PlanTableScanResponse( super(planTasks, fileScanTasks, deleteFiles, specsById); this.planStatus = planStatus; this.planId = planId; + this.errorResponse = errorResponse; this.credentials = credentials; validate(); } @@ -58,6 +61,10 @@ public String planId() { return planId; } + public ErrorResponse errorResponse() { + return errorResponse; + } + public List credentials() { return credentials != null ? 
credentials : ImmutableList.of(); } @@ -86,6 +93,10 @@ public void validate() { planStatus() == PlanStatus.COMPLETED || (planTasks() == null && fileScanTasks() == null), "Invalid response: tasks can only be defined when status is '%s'", PlanStatus.COMPLETED.status()); + Preconditions.checkArgument( + planStatus() == PlanStatus.FAILED || errorResponse() == null, + "Invalid response: error can only be defined when status is '%s'", + PlanStatus.FAILED.status()); if (null != planId()) { Preconditions.checkArgument( planStatus() == PlanStatus.SUBMITTED || planStatus() == PlanStatus.COMPLETED, @@ -108,6 +119,7 @@ public static Builder builder() { public static class Builder extends BaseScanTaskResponse.Builder { private PlanStatus planStatus; private String planId; + private ErrorResponse errorResponse; private final List credentials = Lists.newArrayList(); /** @@ -127,6 +139,11 @@ public Builder withPlanId(String id) { return this; } + public Builder withErrorResponse(ErrorResponse response) { + this.errorResponse = response; + return this; + } + public Builder withCredentials(List credentialsToAdd) { credentials.addAll(credentialsToAdd); return this; @@ -137,6 +154,7 @@ public PlanTableScanResponse build() { return new PlanTableScanResponse( planStatus, planId, + errorResponse, planTasks(), fileScanTasks(), deleteFiles(), diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/PlanTableScanResponseParser.java b/core/src/main/java/org/apache/iceberg/rest/responses/PlanTableScanResponseParser.java index c2f47b86d3f0..8ca199397ea6 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/PlanTableScanResponseParser.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/PlanTableScanResponseParser.java @@ -39,6 +39,7 @@ public class PlanTableScanResponseParser { private static final String PLAN_ID = "plan-id"; private static final String PLAN_TASKS = "plan-tasks"; private static final String STORAGE_CREDENTIALS = "storage-credentials"; + private 
static final String ERROR = "error"; private PlanTableScanResponseParser() {} @@ -60,6 +61,10 @@ public static void toJson(PlanTableScanResponse response, JsonGenerator gen) thr gen.writeStartObject(); gen.writeStringField(STATUS, response.planStatus().status()); + if (response.errorResponse() != null) { + ErrorResponseParser.writeError(response.errorResponse(), gen); + } + if (response.planId() != null) { gen.writeStringField(PLAN_ID, response.planId()); } @@ -98,6 +103,11 @@ public static PlanTableScanResponse fromJson( "Cannot parse planTableScan response from empty or null object"); PlanStatus planStatus = PlanStatus.fromName(JsonUtil.getString(STATUS, json)); + ErrorResponse errorResponse = null; + if (json.has(ERROR) && json.get(ERROR).isObject()) { + errorResponse = ErrorResponseParser.fromJson(json); + } + String planId = JsonUtil.getStringOrNull(PLAN_ID, json); List planTasks = JsonUtil.getStringListOrNull(PLAN_TASKS, json); List deleteFiles = TableScanResponseParser.parseDeleteFiles(json, specsById); @@ -108,6 +118,7 @@ public static PlanTableScanResponse fromJson( PlanTableScanResponse.builder() .withPlanId(planId) .withPlanStatus(planStatus) + .withErrorResponse(errorResponse) .withPlanTasks(planTasks) .withFileScanTasks(fileScanTasks) .withSpecsById(specsById); diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/RemoteSignResponse.java b/core/src/main/java/org/apache/iceberg/rest/responses/RemoteSignResponse.java new file mode 100644 index 000000000000..c5009505bf4f --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/rest/responses/RemoteSignResponse.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.rest.responses; + +import java.net.URI; +import java.util.List; +import java.util.Map; +import org.apache.iceberg.rest.RESTResponse; +import org.immutables.value.Value; + +@Value.Immutable +public interface RemoteSignResponse extends RESTResponse { + URI uri(); + + Map> headers(); + + @Override + default void validate() {} +} diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/RemoteSignResponseParser.java b/core/src/main/java/org/apache/iceberg/rest/responses/RemoteSignResponseParser.java new file mode 100644 index 000000000000..f53e844c6162 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/rest/responses/RemoteSignResponseParser.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.rest.responses; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.JsonNode; +import java.io.IOException; +import java.net.URI; +import java.util.List; +import java.util.Map; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.rest.requests.RemoteSignRequestParser; +import org.apache.iceberg.util.JsonUtil; + +public class RemoteSignResponseParser { + + private static final String URI_FIELD = "uri"; + private static final String HEADERS = "headers"; + + private RemoteSignResponseParser() {} + + public static String toJson(RemoteSignResponse response) { + return toJson(response, false); + } + + public static String toJson(RemoteSignResponse response, boolean pretty) { + return JsonUtil.generate(gen -> toJson(response, gen), pretty); + } + + public static void toJson(RemoteSignResponse response, JsonGenerator gen) throws IOException { + Preconditions.checkArgument(null != response, "Invalid remote sign response: null"); + + gen.writeStartObject(); + + gen.writeStringField(URI_FIELD, response.uri().toString()); + RemoteSignRequestParser.headersToJson(HEADERS, response.headers(), gen); + + gen.writeEndObject(); + } + + public static RemoteSignResponse fromJson(String json) { + return JsonUtil.parse(json, RemoteSignResponseParser::fromJson); + } + + public static RemoteSignResponse fromJson(JsonNode json) { + Preconditions.checkArgument(null != json, "Cannot parse remote sign response from null object"); + 
Preconditions.checkArgument( + json.isObject(), "Cannot parse remote sign response from non-object: %s", json); + + URI uri = URI.create(JsonUtil.getString(URI_FIELD, json)); + Map> headers = RemoteSignRequestParser.headersFromJson(HEADERS, json); + + return ImmutableRemoteSignResponse.builder().uri(uri).headers(headers).build(); + } +} diff --git a/core/src/main/java/org/apache/iceberg/util/LocationUtil.java b/core/src/main/java/org/apache/iceberg/util/LocationUtil.java index 400307149238..4c0d401c74b9 100644 --- a/core/src/main/java/org/apache/iceberg/util/LocationUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/LocationUtil.java @@ -18,6 +18,8 @@ */ package org.apache.iceberg.util; +import java.util.UUID; +import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.base.Strings; @@ -33,4 +35,26 @@ public static String stripTrailingSlash(String path) { } return result; } + + /** + * Returns a path component derived from the {@code tableIdentifier}, used as part of the table + * location URI. + * + *

If {@code useUniqueLocation} is {@code true}, the returned component will include a random + * UUID suffix. Otherwise, the plain table name is returned. + * + * @param tableIdentifier Iceberg table identifier + * @param useUniqueLocation whether to ensure uniqueness + * @return a string representing the table name component for a location URI + */ + public static String tableLocation(TableIdentifier tableIdentifier, boolean useUniqueLocation) { + Preconditions.checkArgument(null != tableIdentifier, "Invalid identifier: null"); + + if (useUniqueLocation) { + String uniqueSuffix = UUID.randomUUID().toString().replace("-", ""); + return String.format("%s-%s", tableIdentifier.name(), uniqueSuffix); + } else { + return tableIdentifier.name(); + } + } } diff --git a/core/src/main/java/org/apache/iceberg/util/LockManagers.java b/core/src/main/java/org/apache/iceberg/util/LockManagers.java index 96622cb57f83..561d0a8975dd 100644 --- a/core/src/main/java/org/apache/iceberg/util/LockManagers.java +++ b/core/src/main/java/org/apache/iceberg/util/LockManagers.java @@ -18,11 +18,9 @@ */ package org.apache.iceberg.util; -import java.util.List; import java.util.Map; import java.util.Optional; import java.util.concurrent.Executors; -import java.util.concurrent.Future; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.ScheduledThreadPoolExecutor; @@ -108,6 +106,11 @@ public int heartbeatThreads() { return heartbeatThreads; } + /** + * Returns the shared scheduler for lock heartbeats. + * + *

Callers must not shut down this scheduler. It is shared across lock manager instances. + */ public ScheduledExecutorService scheduler() { if (scheduler == null) { synchronized (BaseLockManager.class) { @@ -159,16 +162,10 @@ public void initialize(Map properties) { @Override public void close() throws Exception { - if (scheduler != null) { - List tasks = scheduler.shutdownNow(); - tasks.forEach( - task -> { - if (task instanceof Future) { - ((Future) task).cancel(true); - } - }); - scheduler = null; - } + // The scheduler is a shared static resource used across all BaseLockManager instances. + // Individual instances must not shut it down, as other instances may still be using it. + // The scheduler uses daemon threads and will be terminated at JVM exit by the shutdown + // hook registered via MoreExecutors.getExitingScheduledExecutorService. } } diff --git a/core/src/main/java/org/apache/iceberg/util/StructLikeWrapper.java b/core/src/main/java/org/apache/iceberg/util/StructLikeWrapper.java index 28629706bf5e..2e71d2419185 100644 --- a/core/src/main/java/org/apache/iceberg/util/StructLikeWrapper.java +++ b/core/src/main/java/org/apache/iceberg/util/StructLikeWrapper.java @@ -88,7 +88,13 @@ public boolean equals(Object other) { return false; } - return comparator.compare(this.struct, that.struct) == 0; + try { + return comparator.compare(this.struct, that.struct) == 0; + } catch (RuntimeException e) { + // An exception may occur, for example, when struct is PartitionData and its type does not + // match its data. 
+ return false; + } } @Override diff --git a/core/src/test/java/org/apache/iceberg/stats/TestContentStats.java b/core/src/test/java/org/apache/iceberg/TestContentStats.java similarity index 95% rename from core/src/test/java/org/apache/iceberg/stats/TestContentStats.java rename to core/src/test/java/org/apache/iceberg/TestContentStats.java index 6baff7dfe63e..0f06276d454b 100644 --- a/core/src/test/java/org/apache/iceberg/stats/TestContentStats.java +++ b/core/src/test/java/org/apache/iceberg/TestContentStats.java @@ -16,22 +16,21 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.iceberg.stats; - -import static org.apache.iceberg.stats.FieldStatistic.AVG_VALUE_SIZE; -import static org.apache.iceberg.stats.FieldStatistic.EXACT_BOUNDS; -import static org.apache.iceberg.stats.FieldStatistic.LOWER_BOUND; -import static org.apache.iceberg.stats.FieldStatistic.MAX_VALUE_SIZE; -import static org.apache.iceberg.stats.FieldStatistic.NAN_VALUE_COUNT; -import static org.apache.iceberg.stats.FieldStatistic.NULL_VALUE_COUNT; -import static org.apache.iceberg.stats.FieldStatistic.UPPER_BOUND; -import static org.apache.iceberg.stats.FieldStatistic.VALUE_COUNT; +package org.apache.iceberg; + +import static org.apache.iceberg.FieldStatistic.AVG_VALUE_SIZE; +import static org.apache.iceberg.FieldStatistic.EXACT_BOUNDS; +import static org.apache.iceberg.FieldStatistic.LOWER_BOUND; +import static org.apache.iceberg.FieldStatistic.MAX_VALUE_SIZE; +import static org.apache.iceberg.FieldStatistic.NAN_VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.NULL_VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.UPPER_BOUND; +import static org.apache.iceberg.FieldStatistic.VALUE_COUNT; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; import static org.assertj.core.api.Assertions.assertThat; import static 
org.assertj.core.api.Assertions.assertThatThrownBy; -import org.apache.iceberg.Schema; import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.types.Types; @@ -154,7 +153,7 @@ public void retrievalByPosition() { assertThatThrownBy(() -> stats.get(0, Long.class)) .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining( - "Wrong class, expected java.lang.Long but was org.apache.iceberg.stats.BaseFieldStats for object:"); + "Wrong class, expected java.lang.Long but was org.apache.iceberg.BaseFieldStats for object:"); } @Test diff --git a/core/src/test/java/org/apache/iceberg/TestDeleteFiles.java b/core/src/test/java/org/apache/iceberg/TestDeleteFiles.java index 68e5fa8b560e..d7cdd5c5d884 100644 --- a/core/src/test/java/org/apache/iceberg/TestDeleteFiles.java +++ b/core/src/test/java/org/apache/iceberg/TestDeleteFiles.java @@ -615,6 +615,10 @@ public void removingDataFileByExpressionAlsoRemovesDV() { .containsEntry(SnapshotSummary.REPLACED_MANIFESTS_COUNT, "2"); assertThat(deleteSnap.deleteManifests(table.io())).hasSize(1); + assertThat(deleteSnap.summary()) + .containsEntry(SnapshotSummary.REMOVED_DVS_PROP, "1") + .containsEntry(SnapshotSummary.REMOVED_DELETE_FILES_PROP, "1"); + validateDeleteManifest( deleteSnap.deleteManifests(table.io()).get(0), dataSeqs(1L, 1L), @@ -658,6 +662,10 @@ public void removingDataFileByPathAlsoRemovesDV() { .containsEntry(SnapshotSummary.REPLACED_MANIFESTS_COUNT, "2"); assertThat(deleteSnap.deleteManifests(table.io())).hasSize(1); + assertThat(deleteSnap.summary()) + .containsEntry(SnapshotSummary.REMOVED_DVS_PROP, "1") + .containsEntry(SnapshotSummary.REMOVED_DELETE_FILES_PROP, "1"); + validateDeleteManifest( deleteSnap.deleteManifests(table.io()).get(0), dataSeqs(1L, 1L), @@ -667,6 +675,69 @@ public void removingDataFileByPathAlsoRemovesDV() { statuses(ManifestEntry.Status.DELETED, ManifestEntry.Status.EXISTING)); } + @TestTemplate + 
public void removingDataFilesWhenTruncatingAlsoRemovesDVs() { + assumeThat(formatVersion).isGreaterThanOrEqualTo(3); + DeleteFile dv1 = + FileMetadata.deleteFileBuilder(SPEC) + .ofPositionDeletes() + .withPath("/path/to/data-1-deletes.puffin") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0") + .withRecordCount(5) + .withReferencedDataFile(DATA_FILE_BUCKET_0_IDS_0_2.location()) + .withContentOffset(4) + .withContentSizeInBytes(6) + .build(); + + DeleteFile dv2 = + FileMetadata.deleteFileBuilder(SPEC) + .ofPositionDeletes() + .withPath("/path/to/data-2-deletes.puffin") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0") + .withRecordCount(5) + .withReferencedDataFile(DATA_FILE_BUCKET_0_IDS_8_10.location()) + .withContentOffset(4) + .withContentSizeInBytes(6) + .build(); + + commit( + table, + table + .newRowDelta() + .addRows(DATA_FILE_BUCKET_0_IDS_0_2) + .addRows(DATA_FILE_BUCKET_0_IDS_8_10) + .addDeletes(dv1) + .addDeletes(dv2), + branch); + + Snapshot snapshot = latestSnapshot(table, branch); + assertThat(snapshot.sequenceNumber()).isEqualTo(1); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(1); + + // deleting by row filter should also remove the orphaned DVs from delete manifests. 
When a + // table is truncated via TRUNCATE, the row filter is sent as Expressions.alwaysTrue() + commit(table, table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()), branch); + + Snapshot deleteSnap = latestSnapshot(table, branch); + assertThat(deleteSnap.sequenceNumber()).isEqualTo(2); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(2); + + assertThat(deleteSnap.deleteManifests(table.io())).hasSize(1); + assertThat(deleteSnap.summary()) + .containsEntry(SnapshotSummary.REMOVED_DVS_PROP, "2") + .containsEntry(SnapshotSummary.REMOVED_DELETE_FILES_PROP, "2"); + + validateDeleteManifest( + deleteSnap.deleteManifests(table.io()).get(0), + dataSeqs(1L, 1L), + fileSeqs(1L, 1L), + ids(deleteSnap.snapshotId(), deleteSnap.snapshotId()), + files(dv1, dv2), + statuses(Status.DELETED, Status.DELETED)); + } + private static ByteBuffer longToBuffer(long value) { return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value); } diff --git a/core/src/test/java/org/apache/iceberg/TestDeletionVectorStruct.java b/core/src/test/java/org/apache/iceberg/TestDeletionVectorStruct.java new file mode 100644 index 000000000000..325f9afd9ca9 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/TestDeletionVectorStruct.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +class TestDeletionVectorStruct { + + @Test + void testFieldAccess() { + DeletionVectorStruct dv = + DeletionVectorStruct.builder() + .location("s3://bucket/data/dv.puffin") + .offset(256L) + .sizeInBytes(128L) + .cardinality(42L) + .build(); + + assertThat(dv.location()).isEqualTo("s3://bucket/data/dv.puffin"); + assertThat(dv.offset()).isEqualTo(256L); + assertThat(dv.sizeInBytes()).isEqualTo(128L); + assertThat(dv.cardinality()).isEqualTo(42L); + } + + @Test + void testCopy() { + DeletionVectorStruct dv = + DeletionVectorStruct.builder() + .location("s3://bucket/data/dv.puffin") + .offset(256L) + .sizeInBytes(128L) + .cardinality(42L) + .build(); + + DeletionVectorStruct copy = dv.copy(); + + assertThat(copy.location()).isEqualTo("s3://bucket/data/dv.puffin"); + assertThat(copy.offset()).isEqualTo(256L); + assertThat(copy.sizeInBytes()).isEqualTo(128L); + assertThat(copy.cardinality()).isEqualTo(42L); + } + + @Test + void testSize() { + DeletionVectorStruct dv = new DeletionVectorStruct(DeletionVector.schema()); + assertThat(dv.size()).isEqualTo(4); + } + + @Test + void testProjectedStructLike() { + // project only location (field ID 155) and cardinality (field ID 156) + Types.StructType projection = + Types.StructType.of(DeletionVector.LOCATION, DeletionVector.CARDINALITY); + + DeletionVectorStruct dv = new DeletionVectorStruct(projection); + assertThat(dv.size()).isEqualTo(2); + + // projected position 0 maps to internal position 0 (location) + // projected position 1 maps to internal position 3 (cardinality) + dv.set(0, "s3://bucket/data/dv.puffin"); + dv.set(1, 42L); + + 
assertThat(dv.location()).isEqualTo("s3://bucket/data/dv.puffin"); + assertThat(dv.cardinality()).isEqualTo(42L); + assertThat(dv.get(0, String.class)).isEqualTo("s3://bucket/data/dv.puffin"); + assertThat(dv.get(1, Long.class)).isEqualTo(42L); + } + + @Test + void testJavaSerializationRoundTrip() throws IOException, ClassNotFoundException { + DeletionVectorStruct dv = + DeletionVectorStruct.builder() + .location("s3://bucket/data/dv.puffin") + .offset(256L) + .sizeInBytes(128L) + .cardinality(42L) + .build(); + + DeletionVectorStruct deserialized = TestHelpers.roundTripSerialize(dv); + + assertThat(deserialized.location()).isEqualTo("s3://bucket/data/dv.puffin"); + assertThat(deserialized.offset()).isEqualTo(256L); + assertThat(deserialized.sizeInBytes()).isEqualTo(128L); + assertThat(deserialized.cardinality()).isEqualTo(42L); + } + + @Test + void testBuilderValidation() { + assertThatThrownBy( + () -> DeletionVectorStruct.builder().offset(0).sizeInBytes(1).cardinality(1).build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid location: null"); + + assertThatThrownBy( + () -> + DeletionVectorStruct.builder() + .location("s3://bucket/dv.puffin") + .sizeInBytes(1) + .cardinality(1) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid offset: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + DeletionVectorStruct.builder() + .location("s3://bucket/dv.puffin") + .offset(0) + .cardinality(1) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid size in bytes: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + DeletionVectorStruct.builder() + .location("s3://bucket/dv.puffin") + .offset(0) + .sizeInBytes(1) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid cardinality: -1 (must be >= 0)"); + } + + @Test + void testKryoSerializationRoundTrip() throws IOException { + DeletionVectorStruct dv = + DeletionVectorStruct.builder() + 
.location("s3://bucket/data/dv.puffin") + .offset(256L) + .sizeInBytes(128L) + .cardinality(42L) + .build(); + + DeletionVectorStruct deserialized = TestHelpers.KryoHelpers.roundTripSerialize(dv); + + assertThat(deserialized.location()).isEqualTo("s3://bucket/data/dv.puffin"); + assertThat(deserialized.offset()).isEqualTo(256L); + assertThat(deserialized.sizeInBytes()).isEqualTo(128L); + assertThat(deserialized.cardinality()).isEqualTo(42L); + } +} diff --git a/core/src/test/java/org/apache/iceberg/TestEntryStatus.java b/core/src/test/java/org/apache/iceberg/TestEntryStatus.java new file mode 100644 index 000000000000..c395cdcece7c --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/TestEntryStatus.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.stream.IntStream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.MethodSource; + +class TestEntryStatus { + + @ParameterizedTest + @EnumSource(EntryStatus.class) + void fromId(EntryStatus status) { + assertThat(EntryStatus.fromId(status.id())).isEqualTo(status); + } + + static IntStream invalidIds() { + return IntStream.of(-1, EntryStatus.values().length); + } + + @ParameterizedTest + @MethodSource("invalidIds") + void fromIdInvalid(int id) { + assertThatThrownBy(() -> EntryStatus.fromId(id)) + .isInstanceOf(ArrayIndexOutOfBoundsException.class) + .hasMessageContaining(String.valueOf(id)); + } +} diff --git a/core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java b/core/src/test/java/org/apache/iceberg/TestFieldStats.java similarity index 94% rename from core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java rename to core/src/test/java/org/apache/iceberg/TestFieldStats.java index be5f3166940d..c703a3044fc0 100644 --- a/core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java +++ b/core/src/test/java/org/apache/iceberg/TestFieldStats.java @@ -16,16 +16,16 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.iceberg.stats; - -import static org.apache.iceberg.stats.FieldStatistic.AVG_VALUE_SIZE; -import static org.apache.iceberg.stats.FieldStatistic.EXACT_BOUNDS; -import static org.apache.iceberg.stats.FieldStatistic.LOWER_BOUND; -import static org.apache.iceberg.stats.FieldStatistic.MAX_VALUE_SIZE; -import static org.apache.iceberg.stats.FieldStatistic.NAN_VALUE_COUNT; -import static org.apache.iceberg.stats.FieldStatistic.NULL_VALUE_COUNT; -import static org.apache.iceberg.stats.FieldStatistic.UPPER_BOUND; -import static org.apache.iceberg.stats.FieldStatistic.VALUE_COUNT; +package org.apache.iceberg; + +import static org.apache.iceberg.FieldStatistic.AVG_VALUE_SIZE; +import static org.apache.iceberg.FieldStatistic.EXACT_BOUNDS; +import static org.apache.iceberg.FieldStatistic.LOWER_BOUND; +import static org.apache.iceberg.FieldStatistic.MAX_VALUE_SIZE; +import static org.apache.iceberg.FieldStatistic.NAN_VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.NULL_VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.UPPER_BOUND; +import static org.apache.iceberg.FieldStatistic.VALUE_COUNT; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; @@ -33,7 +33,6 @@ import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.stream.Stream; -import org.apache.iceberg.TestHelpers; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; diff --git a/core/src/test/java/org/apache/iceberg/TestLocationProvider.java b/core/src/test/java/org/apache/iceberg/TestLocationProvider.java index 146f2c8da5e7..d665d84cad82 100644 --- a/core/src/test/java/org/apache/iceberg/TestLocationProvider.java +++ b/core/src/test/java/org/apache/iceberg/TestLocationProvider.java @@ -295,7 +295,7 @@ public void testExcludePartitionInPath() { String fileLocation = 
table.locationProvider().newDataLocation(table.spec(), partitionData, "test.parquet"); - // no partition values included in the path and last part of entropy is seperated with "-" + // no partition values included in the path and last part of entropy is separated with "-" assertThat(fileLocation).endsWith("/data/0110/1010/0011/11101000-test.parquet"); } diff --git a/core/src/test/java/org/apache/iceberg/TestManifestEntryStatus.java b/core/src/test/java/org/apache/iceberg/TestManifestEntryStatus.java new file mode 100644 index 000000000000..39867bbf7c02 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/TestManifestEntryStatus.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.stream.IntStream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.MethodSource; + +class TestManifestEntryStatus { + + @ParameterizedTest + @EnumSource(ManifestEntry.Status.class) + void fromId(ManifestEntry.Status status) { + assertThat(ManifestEntry.Status.fromId(status.id())).isEqualTo(status); + } + + static IntStream invalidIds() { + return IntStream.of(-1, ManifestEntry.Status.values().length); + } + + @ParameterizedTest + @MethodSource("invalidIds") + void fromIdInvalid(int id) { + assertThatThrownBy(() -> ManifestEntry.Status.fromId(id)) + .isInstanceOf(ArrayIndexOutOfBoundsException.class) + .hasMessageContaining(String.valueOf(id)); + } +} diff --git a/core/src/test/java/org/apache/iceberg/TestManifestInfoStruct.java b/core/src/test/java/org/apache/iceberg/TestManifestInfoStruct.java new file mode 100644 index 000000000000..3a694f1a38f2 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/TestManifestInfoStruct.java @@ -0,0 +1,389 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.nio.ByteBuffer; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +class TestManifestInfoStruct { + + @Test + void testFieldAccess() { + ManifestInfoStruct info = new ManifestInfoStruct(ManifestInfo.schema()); + + info.set(0, 10); + info.set(1, 20); + info.set(2, 3); + info.set(3, 2); + info.set(4, 1000L); + info.set(5, 2000L); + info.set(6, 300L); + info.set(7, 200L); + info.set(8, 5L); + info.set(9, ByteBuffer.wrap(new byte[] {0xF})); + info.set(10, 1L); + + assertThat(info.addedFilesCount()).isEqualTo(10); + assertThat(info.existingFilesCount()).isEqualTo(20); + assertThat(info.deletedFilesCount()).isEqualTo(3); + assertThat(info.replacedFilesCount()).isEqualTo(2); + assertThat(info.addedRowsCount()).isEqualTo(1000L); + assertThat(info.existingRowsCount()).isEqualTo(2000L); + assertThat(info.deletedRowsCount()).isEqualTo(300L); + assertThat(info.replacedRowsCount()).isEqualTo(200L); + assertThat(info.minSequenceNumber()).isEqualTo(5L); + assertThat(info.dv()).isNotNull(); + assertThat(info.dvCardinality()).isEqualTo(1L); + } + + @Test + void testCopy() { + ManifestInfoStruct info = + ManifestInfoStruct.builder() + .addedFilesCount(10) + .existingFilesCount(20) + .deletedFilesCount(3) + .replacedFilesCount(2) + .addedRowsCount(1000L) + .existingRowsCount(2000L) + .deletedRowsCount(300L) + .replacedRowsCount(200L) + .minSequenceNumber(5L) + .dv(new byte[] {0xF}) + .dvCardinality(1L) + .build(); + + ManifestInfoStruct copy = info.copy(); + + assertThat(copy.addedFilesCount()).isEqualTo(10); + assertThat(copy.existingFilesCount()).isEqualTo(20); + assertThat(copy.deletedFilesCount()).isEqualTo(3); + 
assertThat(copy.replacedFilesCount()).isEqualTo(2); + assertThat(copy.addedRowsCount()).isEqualTo(1000L); + assertThat(copy.existingRowsCount()).isEqualTo(2000L); + assertThat(copy.deletedRowsCount()).isEqualTo(300L); + assertThat(copy.replacedRowsCount()).isEqualTo(200L); + assertThat(copy.minSequenceNumber()).isEqualTo(5L); + assertThat(copy.dvCardinality()).isEqualTo(1L); + + // verify deep copy of dv byte array + assertThat(copy.dv().array()).isNotSameAs(info.dv().array()); + } + + @Test + void testNullableFields() { + ManifestInfoStruct info = + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .build(); + + assertThat(info.dv()).isNull(); + assertThat(info.dvCardinality()).isNull(); + } + + @Test + void testProjectedStructLike() { + // project only added_files_count (field ID 504) and min_sequence_number (field ID 516) + Types.StructType projection = + Types.StructType.of(ManifestInfo.ADDED_FILES_COUNT, ManifestInfo.MIN_SEQUENCE_NUMBER); + + ManifestInfoStruct info = new ManifestInfoStruct(projection); + assertThat(info.size()).isEqualTo(2); + + // projected position 0 maps to internal position 0 (added_files_count) + // projected position 1 maps to internal position 8 (min_sequence_number) + info.set(0, 10); + info.set(1, 5L); + + assertThat(info.addedFilesCount()).isEqualTo(10); + assertThat(info.minSequenceNumber()).isEqualTo(5L); + assertThat(info.get(0, Integer.class)).isEqualTo(10); + assertThat(info.get(1, Long.class)).isEqualTo(5L); + } + + @Test + void testJavaSerializationRoundTrip() throws IOException, ClassNotFoundException { + ManifestInfoStruct info = + ManifestInfoStruct.builder() + .addedFilesCount(10) + .existingFilesCount(20) + .deletedFilesCount(3) + .replacedFilesCount(2) + .addedRowsCount(1000L) + .existingRowsCount(2000L) + 
.deletedRowsCount(300L) + .replacedRowsCount(200L) + .minSequenceNumber(5L) + .dv(new byte[] {0xF}) + .dvCardinality(1L) + .build(); + + ManifestInfoStruct deserialized = TestHelpers.roundTripSerialize(info); + + assertThat(deserialized.addedFilesCount()).isEqualTo(10); + assertThat(deserialized.existingFilesCount()).isEqualTo(20); + assertThat(deserialized.deletedFilesCount()).isEqualTo(3); + assertThat(deserialized.replacedFilesCount()).isEqualTo(2); + assertThat(deserialized.addedRowsCount()).isEqualTo(1000L); + assertThat(deserialized.existingRowsCount()).isEqualTo(2000L); + assertThat(deserialized.deletedRowsCount()).isEqualTo(300L); + assertThat(deserialized.replacedRowsCount()).isEqualTo(200L); + assertThat(deserialized.minSequenceNumber()).isEqualTo(5L); + assertThat(deserialized.dv()).isEqualTo(ByteBuffer.wrap(new byte[] {0xF})); + assertThat(deserialized.dvCardinality()).isEqualTo(1L); + } + + @Test + void testBuilderValidation() { + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid added files count: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid existing files count: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .build()) + 
.isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid deleted files count: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid replaced files count: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid added rows count: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid existing rows count: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid deleted rows count: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .minSequenceNumber(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid replaced rows count: -1 (must 
be >= 0)"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid min sequence number: -1 (must be >= 0)"); + } + + @Test + void testBuilderDvPairingValidation() { + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .dv(new byte[] {0xF}) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid DV and cardinality: must both be null or non-null"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .dvCardinality(1L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid DV and cardinality: must both be null or non-null"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .dv(new byte[] {0xF}) + .dvCardinality(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid DV cardinality: 0 (must be positive)"); + } + + @Test + void testKryoSerializationRoundTrip() throws IOException { + ManifestInfoStruct info = + ManifestInfoStruct.builder() + .addedFilesCount(10) + .existingFilesCount(20) + .deletedFilesCount(3) + 
.replacedFilesCount(2) + .addedRowsCount(1000L) + .existingRowsCount(2000L) + .deletedRowsCount(300L) + .replacedRowsCount(200L) + .minSequenceNumber(5L) + .dv(new byte[] {0xF}) + .dvCardinality(1L) + .build(); + + ManifestInfoStruct deserialized = TestHelpers.KryoHelpers.roundTripSerialize(info); + + assertThat(deserialized.addedFilesCount()).isEqualTo(10); + assertThat(deserialized.existingFilesCount()).isEqualTo(20); + assertThat(deserialized.deletedFilesCount()).isEqualTo(3); + assertThat(deserialized.replacedFilesCount()).isEqualTo(2); + assertThat(deserialized.addedRowsCount()).isEqualTo(1000L); + assertThat(deserialized.existingRowsCount()).isEqualTo(2000L); + assertThat(deserialized.deletedRowsCount()).isEqualTo(300L); + assertThat(deserialized.replacedRowsCount()).isEqualTo(200L); + assertThat(deserialized.minSequenceNumber()).isEqualTo(5L); + assertThat(deserialized.dv()).isEqualTo(ByteBuffer.wrap(new byte[] {0xF})); + assertThat(deserialized.dvCardinality()).isEqualTo(1L); + } +} diff --git a/core/src/test/java/org/apache/iceberg/TestMetrics.java b/core/src/test/java/org/apache/iceberg/TestMetrics.java index d9048a5d5ed3..874bb6d74d44 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetrics.java +++ b/core/src/test/java/org/apache/iceberg/TestMetrics.java @@ -41,8 +41,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.stats.ContentStats; -import org.apache.iceberg.stats.FieldStats; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.BinaryType; diff --git a/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java b/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java index 384b7132ef76..09e9fdd1f722 100644 --- a/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java +++ 
b/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java @@ -2207,4 +2207,47 @@ private static PartitionStatisticsFile reusePartitionStatsFile( private static void commitPartitionStats(Table table, PartitionStatisticsFile statisticsFile) { table.updatePartitionStatistics().setPartitionStatistics(statisticsFile).commit(); } + + @TestTemplate + public void testAppendOnlyManifestsNotScannedDuringCleanup() { + assumeThat(incrementalCleanup).isTrue(); + + TestTables.LocalFileIO spyFileIO = Mockito.spy(new TestTables.LocalFileIO()); + String tableName = "testAppendOnlyManifests"; + Table testTable = + TestTables.create( + tableDir, + tableName, + SCHEMA, + SPEC, + SortOrder.unsorted(), + formatVersion, + new TestTables.TestTableOperations(tableName, tableDir, spyFileIO)); + + testTable.newAppend().appendFile(FILE_A).commit(); + Snapshot firstSnapshot = testTable.currentSnapshot(); + + Set appendOnlyManifestPaths = + firstSnapshot.allManifests(testTable.io()).stream() + .map(ManifestFile::path) + .collect(Collectors.toSet()); + + waitUntilAfter(firstSnapshot.timestampMillis()); + + testTable.newAppend().appendFile(FILE_B).commit(); + long tAfterCommits = waitUntilAfter(testTable.currentSnapshot().timestampMillis()); + + Mockito.clearInvocations(spyFileIO); + + Set deletedFiles = Sets.newHashSet(); + removeSnapshots(testTable) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .commit(); + + assertThat(deletedFiles).containsExactly(firstSnapshot.manifestListLocation()); + + appendOnlyManifestPaths.forEach( + path -> Mockito.verify(spyFileIO, Mockito.never()).newInputFile(path)); + } } diff --git a/core/src/test/java/org/apache/iceberg/TestRowDelta.java b/core/src/test/java/org/apache/iceberg/TestRowDelta.java index c442541289a9..aaccf4122481 100644 --- a/core/src/test/java/org/apache/iceberg/TestRowDelta.java +++ b/core/src/test/java/org/apache/iceberg/TestRowDelta.java @@ -2419,6 +2419,27 @@ public void testManifestMergingAfterUpgradeToV3() { 
assertThat(taskDV.contentSizeInBytes()).isEqualTo(dv.contentSizeInBytes()); } + @TestTemplate + public void testV2StagedPositionDeleteCannotCommitToV3() { + assumeThat(formatVersion).isEqualTo(2); + + Snapshot initial = commit(table, table.newAppend().appendFile(FILE_A), branch); + + // Stage RowDelta at v2: position delete for FILE_A + add new data FILE_B. + RowDelta rowDelta = table.newRowDelta().addDeletes(FILE_A_DELETES).addRows(FILE_B); + + // upgrade the table + table.updateProperties().set(TableProperties.FORMAT_VERSION, "3").commit(); + + assertThatThrownBy(rowDelta::commit) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Must use DVs for position deletes in V3"); + + table.refresh(); + assertThat(table.operations().current().formatVersion()).isEqualTo(3); + assertThat(table.snapshot(branch)).isEqualTo(initial); + } + @TestTemplate public void testInabilityToAddPositionDeleteFilesInTablesWithDVs() { assumeThat(formatVersion).isGreaterThanOrEqualTo(3); diff --git a/core/src/test/java/org/apache/iceberg/TestSchemaUpdate.java b/core/src/test/java/org/apache/iceberg/TestSchemaUpdate.java index fb942dde2aa2..5325e4013c68 100644 --- a/core/src/test/java/org/apache/iceberg/TestSchemaUpdate.java +++ b/core/src/test/java/org/apache/iceberg/TestSchemaUpdate.java @@ -1181,6 +1181,17 @@ public void testDeleteMapKey() { .hasMessageStartingWith("Cannot delete map keys"); } + @Test + public void testDeleteMapValue() { + assertThatThrownBy( + () -> + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .deleteColumn("locations.value") + .apply()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot delete value type from map"); + } + @Test public void testAddFieldToMapKey() { assertThatThrownBy( diff --git a/api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java b/core/src/test/java/org/apache/iceberg/TestStatsUtil.java similarity index 95% rename from api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java 
rename to core/src/test/java/org/apache/iceberg/TestStatsUtil.java index 62c7c0ea75fb..54db9e5d2095 100644 --- a/api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java +++ b/core/src/test/java/org/apache/iceberg/TestStatsUtil.java @@ -16,23 +16,22 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.iceberg.stats; +package org.apache.iceberg; -import static org.apache.iceberg.stats.FieldStatistic.AVG_VALUE_SIZE; -import static org.apache.iceberg.stats.FieldStatistic.EXACT_BOUNDS; -import static org.apache.iceberg.stats.FieldStatistic.LOWER_BOUND; -import static org.apache.iceberg.stats.FieldStatistic.MAX_VALUE_SIZE; -import static org.apache.iceberg.stats.FieldStatistic.NAN_VALUE_COUNT; -import static org.apache.iceberg.stats.FieldStatistic.NULL_VALUE_COUNT; -import static org.apache.iceberg.stats.FieldStatistic.UPPER_BOUND; -import static org.apache.iceberg.stats.FieldStatistic.VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.AVG_VALUE_SIZE; +import static org.apache.iceberg.FieldStatistic.EXACT_BOUNDS; +import static org.apache.iceberg.FieldStatistic.LOWER_BOUND; +import static org.apache.iceberg.FieldStatistic.MAX_VALUE_SIZE; +import static org.apache.iceberg.FieldStatistic.NAN_VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.NULL_VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.UPPER_BOUND; +import static org.apache.iceberg.FieldStatistic.VALUE_COUNT; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; import static org.assertj.core.api.Assertions.assertThat; import java.util.List; import java.util.concurrent.ThreadLocalRandom; -import org.apache.iceberg.Schema; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; diff --git a/core/src/test/java/org/apache/iceberg/TestTrackedFile.java b/core/src/test/java/org/apache/iceberg/TestTrackedFile.java new file mode 
100644 index 000000000000..6d84fd542345 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/TestTrackedFile.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestTrackedFile { + + private static final Schema TABLE_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); + + private static final Types.StructType CONTENT_STATS_TYPE = + StatsUtil.contentStatsFor(TABLE_SCHEMA).type().asStructType(); + + @Test + public void schemaWithContentStatsFieldOrder() { + Types.StructType type = TrackedFile.schemaWithContentStats(CONTENT_STATS_TYPE); + List fields = type.fields(); + + assertThat(fields) + .extracting(Types.NestedField::name) + .containsExactly( + "tracking", + "content_type", + "location", + "file_format", + "record_count", + "file_size_in_bytes", + "spec_id", + "content_stats", + "sort_order_id", + "deletion_vector", + "manifest_info", + 
"key_metadata", + "split_offsets", + "equality_ids"); + } + + @Test + public void schemaWithContentStatsFieldIds() { + Types.StructType type = TrackedFile.schemaWithContentStats(CONTENT_STATS_TYPE); + List fields = type.fields(); + + assertThat(fields) + .extracting(Types.NestedField::fieldId) + .containsExactly(147, 134, 100, 101, 103, 104, 141, 146, 140, 148, 150, 131, 132, 135); + } + + @Test + public void schemaWithContentStatsUsesProvidedType() { + Types.StructType type = TrackedFile.schemaWithContentStats(CONTENT_STATS_TYPE); + Types.NestedField contentStatsField = type.field(TrackedFile.CONTENT_STATS_ID); + + assertThat(contentStatsField.type().asStructType()).isEqualTo(CONTENT_STATS_TYPE); + } + + @Test + public void schemaWithContentStatsReflectsInput() { + Schema smallSchema = new Schema(optional(1, "id", Types.IntegerType.get())); + Schema largeSchema = + new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(2, "data", Types.StringType.get()), + optional(3, "ts", Types.TimestampType.withoutZone())); + + Types.StructType smallStats = StatsUtil.contentStatsFor(smallSchema).type().asStructType(); + Types.StructType largeStats = StatsUtil.contentStatsFor(largeSchema).type().asStructType(); + + Types.StructType smallType = TrackedFile.schemaWithContentStats(smallStats); + Types.StructType largeType = TrackedFile.schemaWithContentStats(largeStats); + + Types.StructType smallResult = + smallType.field(TrackedFile.CONTENT_STATS_ID).type().asStructType(); + Types.StructType largeResult = + largeType.field(TrackedFile.CONTENT_STATS_ID).type().asStructType(); + + assertThat(smallResult.fields()).hasSize(1); + assertThat(largeResult.fields()).hasSize(3); + } +} diff --git a/core/src/test/java/org/apache/iceberg/TestTrackedFileStruct.java b/core/src/test/java/org/apache/iceberg/TestTrackedFileStruct.java new file mode 100644 index 000000000000..62324e5607ef --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/TestTrackedFileStruct.java @@ -0,0 
+1,379 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Set; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +class TestTrackedFileStruct { + @Test + void testFieldAccess() { + TrackedFileStruct file = new TrackedFileStruct(); + TrackingStruct tracking = + TrackingStruct.builder().status(EntryStatus.ADDED).snapshotId(42L).build(); + DeletionVectorStruct dv = + DeletionVectorStruct.builder() + .location("s3://bucket/dv.puffin") + .offset(100L) + .sizeInBytes(50L) + .cardinality(5L) + .build(); + ManifestInfoStruct info = + ManifestInfoStruct.builder() + .addedFilesCount(10) + .existingFilesCount(20) + .deletedFilesCount(3) + .replacedFilesCount(2) + .addedRowsCount(1000L) + .existingRowsCount(2000L) + .deletedRowsCount(300L) + .replacedRowsCount(200L) + .minSequenceNumber(5L) + .build(); + + file.set(0, tracking); + file.set(1, 
FileContent.EQUALITY_DELETES.id()); + file.set(2, "s3://bucket/data/eq-delete.avro"); + file.set(3, "avro"); + file.set(4, 50L); + file.set(5, 512L); + file.set(6, 1); + file.set(8, 5); + file.set(9, dv); + file.set(10, info); + file.set(11, ByteBuffer.wrap(new byte[] {1, 2, 3})); + file.set(12, ImmutableList.of(100L, 200L)); + file.set(13, ImmutableList.of(1, 2, 3)); + + assertThat(file.tracking()).isNotNull(); + assertThat(file.tracking().status()).isEqualTo(EntryStatus.ADDED); + assertThat(file.tracking().snapshotId()).isEqualTo(42L); + assertThat(file.contentType()).isEqualTo(FileContent.EQUALITY_DELETES); + assertThat(file.location()).isEqualTo("s3://bucket/data/eq-delete.avro"); + assertThat(file.fileFormat()).isEqualTo(FileFormat.AVRO); + assertThat(file.recordCount()).isEqualTo(50L); + assertThat(file.fileSizeInBytes()).isEqualTo(512L); + assertThat(file.specId()).isEqualTo(1); + assertThat(file.sortOrderId()).isEqualTo(5); + assertThat(file.deletionVector()).isSameAs(dv); + assertThat(file.manifestInfo()).isSameAs(info); + assertThat(file.keyMetadata()).isEqualTo(ByteBuffer.wrap(new byte[] {1, 2, 3})); + assertThat(file.splitOffsets()).containsExactly(100L, 200L); + assertThat(file.equalityIds()).containsExactly(1, 2, 3); + } + + @Test + void testReaderSideFields() { + TrackedFileStruct file = new TrackedFileStruct(); + + TrackingStruct tracking = TrackingStruct.builder().status(EntryStatus.ADDED).build(); + tracking.setManifestLocation("s3://bucket/metadata/manifest.avro"); + tracking.set(8, 7L); + + file.set(0, tracking); + file.set(1, FileContent.DATA.id()); + file.set(2, "test"); + file.set(3, "parquet"); + file.set(4, 0L); + file.set(5, 0L); + + assertThat(file.tracking().manifestLocation()).isEqualTo("s3://bucket/metadata/manifest.avro"); + assertThat(file.tracking().manifestPos()).isEqualTo(7L); + } + + @Test + void testCopy() { + TrackedFileStruct file = createFullTrackedFile(); + + TrackedFile copy = file.copy(); + 
assertThat(copy).isInstanceOf(TrackedFileStruct.class); + + assertThat(copy.contentType()).isEqualTo(FileContent.DATA); + assertThat(copy.location()).isEqualTo("s3://bucket/data/file.parquet"); + assertThat(copy.fileFormat()).isEqualTo(FileFormat.PARQUET); + assertThat(copy.tracking().status()).isEqualTo(EntryStatus.ADDED); + assertThat(copy.tracking().snapshotId()).isEqualTo(42L); + assertThat(copy.deletionVector().location()).isEqualTo("s3://bucket/dv.puffin"); + assertThat(copy.specId()).isEqualTo(0); + assertThat(copy.sortOrderId()).isEqualTo(1); + assertThat(copy.recordCount()).isEqualTo(100L); + assertThat(copy.fileSizeInBytes()).isEqualTo(1024L); + assertThat(copy.keyMetadata()).isNotNull(); + assertThat(copy.splitOffsets()).containsExactly(50L); + assertThat(copy.equalityIds()).isNull(); + assertThat(copy.tracking().manifestLocation()).isEqualTo("s3://bucket/manifest.avro"); + assertThat(copy.tracking().manifestPos()).isEqualTo(3L); + } + + @Test + void testCopyWithoutStats() { + TrackedFileStruct file = createTrackedFileWithStats(); + assertThat(file.contentStats()).isNotNull(); + + TrackedFile copy = file.copyWithoutStats(); + + assertThat(copy.contentType()).isEqualTo(FileContent.DATA); + assertThat(copy.location()).isEqualTo("s3://bucket/data/file.parquet"); + assertThat(copy.contentStats()).isNull(); + } + + @Test + void testCopyWithStatsFilters() { + TrackedFileStruct file = createTrackedFileWithStats(); + Set keepFieldIds = ImmutableSet.of(1); + + TrackedFile copy = file.copyWithStats(keepFieldIds); + + assertThat(copy.contentStats()).isNotNull(); + ContentStats stats = copy.contentStats(); + assertThat(stats.fieldStats()).hasSize(1); + assertThat(stats.fieldStats().get(0).fieldId()).isEqualTo(1); + } + + @Test + void testCopyIsDeep() { + TrackedFileStruct file = createFullTrackedFile(); + + TrackedFile copy = file.copy(); + + // keyMetadata should be a deep copy + assertThat(copy.keyMetadata()).isNotSameAs(file.keyMetadata()); + } + + @Test + void 
testStructLikeSize() { + TrackedFileStruct file = new TrackedFileStruct(); + assertThat(file.size()).isEqualTo(14); + } + + @Test + void testStructLikeGetSet() { + TrackedFileStruct file = new TrackedFileStruct(); + + file.set(1, FileContent.DATA.id()); + assertThat(file.get(1, Integer.class)).isEqualTo(FileContent.DATA.id()); + + file.set(2, "test-location"); + assertThat(file.get(2, String.class)).isEqualTo("test-location"); + + file.set(4, 999L); + assertThat(file.get(4, Long.class)).isEqualTo(999L); + } + + @Test + void testProjectedStructLike() { + // project only location (field ID 100) and file_size_in_bytes (field ID 104) + Types.StructType projection = + Types.StructType.of(TrackedFile.LOCATION, TrackedFile.FILE_SIZE_IN_BYTES); + + TrackedFileStruct file = new TrackedFileStruct(projection); + assertThat(file.size()).isEqualTo(2); + + // projected position 0 maps to internal position 2 (location) + // projected position 1 maps to internal position 5 (file_size_in_bytes) + file.set(0, "s3://bucket/file.parquet"); + file.set(1, 1024L); + + assertThat(file.location()).isEqualTo("s3://bucket/file.parquet"); + assertThat(file.fileSizeInBytes()).isEqualTo(1024L); + assertThat(file.get(0, String.class)).isEqualTo("s3://bucket/file.parquet"); + assertThat(file.get(1, Long.class)).isEqualTo(1024L); + } + + @Test + void testContentStatsReturnedWhenPresent() { + TrackedFileStruct file = createTrackedFileWithStats(); + assertThat(file.contentStats()).isNotNull(); + assertThat(file.contentStats().fieldStats()).hasSize(2); + } + + @Test + void testContentStatsNullWhenNotSet() { + TrackedFileStruct file = new TrackedFileStruct(); + file.set(1, FileContent.DATA.id()); + file.set(2, "test"); + file.set(3, "parquet"); + file.set(4, 0L); + file.set(5, 0L); + file.set(6, 0); + + assertThat(file.contentStats()).isNull(); + } + + @Test + void testAllFileContentTypesSupported() { + for (FileContent content : FileContent.values()) { + TrackedFileStruct file = new 
TrackedFileStruct(); + file.set(1, content.id()); + assertThat(file.contentType()).isEqualTo(content); + } + } + + @Test + void testJavaSerializationRoundTrip() throws IOException, ClassNotFoundException { + TrackedFileStruct file = createFullTrackedFile(); + + TrackedFileStruct deserialized = TestHelpers.roundTripSerialize(file); + + assertThat(deserialized.contentType()).isEqualTo(FileContent.DATA); + assertThat(deserialized.location()).isEqualTo("s3://bucket/data/file.parquet"); + assertThat(deserialized.fileFormat()).isEqualTo(FileFormat.PARQUET); + assertThat(deserialized.recordCount()).isEqualTo(100L); + assertThat(deserialized.fileSizeInBytes()).isEqualTo(1024L); + assertThat(deserialized.specId()).isEqualTo(0); + assertThat(deserialized.sortOrderId()).isEqualTo(1); + assertThat(deserialized.tracking().status()).isEqualTo(EntryStatus.ADDED); + assertThat(deserialized.tracking().snapshotId()).isEqualTo(42L); + assertThat(deserialized.deletionVector().location()).isEqualTo("s3://bucket/dv.puffin"); + assertThat(deserialized.keyMetadata()).isEqualTo(ByteBuffer.wrap(new byte[] {1, 2, 3})); + assertThat(deserialized.splitOffsets()).containsExactly(50L); + assertThat(deserialized.tracking().manifestPos()).isEqualTo(3L); + assertThat(deserialized.tracking().manifestLocation()).isEqualTo("s3://bucket/manifest.avro"); + } + + @Test + void testKryoSerializationRoundTrip() throws IOException { + TrackedFileStruct file = createFullTrackedFile(); + + TrackedFileStruct deserialized = TestHelpers.KryoHelpers.roundTripSerialize(file); + + assertThat(deserialized.contentType()).isEqualTo(FileContent.DATA); + assertThat(deserialized.location()).isEqualTo("s3://bucket/data/file.parquet"); + assertThat(deserialized.fileFormat()).isEqualTo(FileFormat.PARQUET); + assertThat(deserialized.recordCount()).isEqualTo(100L); + assertThat(deserialized.fileSizeInBytes()).isEqualTo(1024L); + assertThat(deserialized.specId()).isEqualTo(0); + 
assertThat(deserialized.sortOrderId()).isEqualTo(1); + assertThat(deserialized.tracking().status()).isEqualTo(EntryStatus.ADDED); + assertThat(deserialized.tracking().snapshotId()).isEqualTo(42L); + assertThat(deserialized.deletionVector().location()).isEqualTo("s3://bucket/dv.puffin"); + assertThat(deserialized.keyMetadata()).isEqualTo(ByteBuffer.wrap(new byte[] {1, 2, 3})); + assertThat(deserialized.splitOffsets()).containsExactly(50L); + assertThat(deserialized.tracking().manifestPos()).isEqualTo(3L); + assertThat(deserialized.tracking().manifestLocation()).isEqualTo("s3://bucket/manifest.avro"); + } + + static TrackedFileStruct createFullTrackedFile() { + TrackingStruct tracking = + TrackingStruct.builder() + .status(EntryStatus.ADDED) + .snapshotId(42L) + .dataSequenceNumber(10L) + .build(); + tracking.setManifestLocation("s3://bucket/manifest.avro"); + tracking.set(8, 3L); + + DeletionVectorStruct dv = + DeletionVectorStruct.builder() + .location("s3://bucket/dv.puffin") + .offset(100L) + .sizeInBytes(50L) + .cardinality(5L) + .build(); + + TrackedFileStruct file = + new TrackedFileStruct( + tracking, + FileContent.DATA, + "s3://bucket/data/file.parquet", + FileFormat.PARQUET, + 100L, + 1024L); + file.set(6, 0); + file.set(8, 1); + file.set(9, dv); + file.set(11, ByteBuffer.wrap(new byte[] {1, 2, 3})); + file.set(12, ImmutableList.of(50L)); + + return file; + } + + @SuppressWarnings("unchecked") + static TrackedFileStruct createTrackedFileWithStats() { + Types.StructType statsStruct = + Types.StructType.of( + Types.NestedField.optional( + 10000, + "1", + Types.StructType.of( + Types.NestedField.optional(10001, "value_count", Types.LongType.get()), + Types.NestedField.optional(10002, "null_value_count", Types.LongType.get()), + Types.NestedField.optional(10003, "nan_value_count", Types.LongType.get()), + Types.NestedField.optional(10006, "lower_bound", Types.IntegerType.get()), + Types.NestedField.optional(10007, "upper_bound", Types.IntegerType.get()))), + 
Types.NestedField.optional( + 20000, + "2", + Types.StructType.of( + Types.NestedField.optional(20001, "value_count", Types.LongType.get()), + Types.NestedField.optional(20002, "null_value_count", Types.LongType.get()), + Types.NestedField.optional(20003, "nan_value_count", Types.LongType.get()), + Types.NestedField.optional(20006, "lower_bound", Types.FloatType.get()), + Types.NestedField.optional(20007, "upper_bound", Types.FloatType.get())))); + + List> fieldStatsList = + ImmutableList.of( + (FieldStats) + BaseFieldStats.builder() + .fieldId(1) + .type(Types.IntegerType.get()) + .valueCount(100L) + .nullValueCount(5L) + .lowerBound(1) + .upperBound(1000) + .build(), + (FieldStats) + BaseFieldStats.builder() + .fieldId(2) + .type(Types.FloatType.get()) + .valueCount(200L) + .nullValueCount(10L) + .nanValueCount(3L) + .lowerBound(1.0f) + .upperBound(100.0f) + .build()); + + BaseContentStats stats = + BaseContentStats.builder() + .withStatsStruct(statsStruct) + .withFieldStats(fieldStatsList) + .build(); + + TrackedFileStruct file = + new TrackedFileStruct( + null, + FileContent.DATA, + "s3://bucket/data/file.parquet", + FileFormat.PARQUET, + 100L, + 1024L); + file.set(6, 0); + file.set(7, stats); + + return file; + } +} diff --git a/core/src/test/java/org/apache/iceberg/TestTrackingStruct.java b/core/src/test/java/org/apache/iceberg/TestTrackingStruct.java new file mode 100644 index 000000000000..98a7eff2af45 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/TestTrackingStruct.java @@ -0,0 +1,247 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.nio.ByteBuffer; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +class TestTrackingStruct { + + @Test + void testFieldAccess() { + TrackingStruct tracking = new TrackingStruct(Tracking.schema()); + + tracking.set(0, EntryStatus.ADDED.id()); + tracking.set(1, 42L); + tracking.set(2, 10L); + tracking.set(3, 11L); + tracking.set(4, 43L); + tracking.set(5, 1000L); + + assertThat(tracking.status()).isEqualTo(EntryStatus.ADDED); + assertThat(tracking.snapshotId()).isEqualTo(42L); + assertThat(tracking.dataSequenceNumber()).isEqualTo(10L); + assertThat(tracking.fileSequenceNumber()).isEqualTo(11L); + assertThat(tracking.dvSnapshotId()).isEqualTo(43L); + assertThat(tracking.firstRowId()).isEqualTo(1000L); + assertThat(tracking.deletedPositions()).isNull(); + assertThat(tracking.replacedPositions()).isNull(); + } + + @Test + void testCopy() { + TrackingStruct tracking = + TrackingStruct.builder() + .status(EntryStatus.ADDED) + .snapshotId(42L) + .dataSequenceNumber(10L) + .deletedPositions(new byte[] {1, 2}) + .build(); + + TrackingStruct copy = tracking.copy(); + + assertThat(copy.status()).isEqualTo(EntryStatus.ADDED); + assertThat(copy.snapshotId()).isEqualTo(42L); + 
assertThat(copy.dataSequenceNumber()).isEqualTo(10L); + assertThat(copy.deletedPositions()).isNotNull(); + + // verify deep copy of ByteBuffer + assertThat(copy.deletedPositions()).isNotSameAs(tracking.deletedPositions()); + } + + @ParameterizedTest + @EnumSource(EntryStatus.class) + void testAllStatuses(EntryStatus status) { + TrackingStruct tracking = new TrackingStruct(Tracking.schema()); + tracking.set(0, status.id()); + assertThat(tracking.status()).isEqualTo(status); + } + + @Test + void testIsLive() { + TrackingStruct tracking = new TrackingStruct(Tracking.schema()); + + tracking.set(0, EntryStatus.ADDED.id()); + assertThat(tracking.isLive()).isTrue(); + + tracking.set(0, EntryStatus.EXISTING.id()); + assertThat(tracking.isLive()).isTrue(); + + tracking.set(0, EntryStatus.DELETED.id()); + assertThat(tracking.isLive()).isFalse(); + + tracking.set(0, EntryStatus.REPLACED.id()); + assertThat(tracking.isLive()).isFalse(); + } + + @Test + void testInheritSnapshotId() { + TrackingStruct tracking = TrackingStruct.builder().status(EntryStatus.ADDED).build(); + tracking.inheritFrom(createManifestTracking(100L, 60L)); + + // snapshotId is null, should inherit from manifest + assertThat(tracking.snapshotId()).isEqualTo(100L); + } + + @Test + void testInheritSequenceNumberForAddedEntries() { + TrackingStruct tracking = TrackingStruct.builder().status(EntryStatus.ADDED).build(); + tracking.inheritFrom(createManifestTracking(100L, 60L)); + + // sequence numbers are null and status is ADDED, should inherit + assertThat(tracking.dataSequenceNumber()).isEqualTo(60L); + assertThat(tracking.fileSequenceNumber()).isEqualTo(60L); + } + + @Test + void testDoNotInheritSequenceNumberForExistingEntries() { + TrackingStruct tracking = + TrackingStruct.builder() + .status(EntryStatus.EXISTING) + .dataSequenceNumber(5L) + .fileSequenceNumber(6L) + .build(); + tracking.inheritFrom(createManifestTracking(100L, 60L)); + + // sequence numbers are not inherited for EXISTING entries + 
assertThat(tracking.dataSequenceNumber()).isEqualTo(5L); + assertThat(tracking.fileSequenceNumber()).isEqualTo(6L); + } + + @Test + void testExplicitValuesOverrideInheritance() { + TrackingStruct tracking = + TrackingStruct.builder() + .status(EntryStatus.ADDED) + .snapshotId(200L) + .dataSequenceNumber(75L) + .fileSequenceNumber(76L) + .build(); + tracking.inheritFrom(createManifestTracking(100L, 60L)); + + // explicit values should take precedence + assertThat(tracking.snapshotId()).isEqualTo(200L); + assertThat(tracking.dataSequenceNumber()).isEqualTo(75L); + assertThat(tracking.fileSequenceNumber()).isEqualTo(76L); + } + + @Test + void testInheritFromRejectsUnequalSequenceNumbers() { + TrackingStruct tracking = new TrackingStruct(Tracking.schema()); + tracking.set(0, EntryStatus.ADDED.id()); + + TrackingStruct manifestTracking = new TrackingStruct(Tracking.schema()); + manifestTracking.set(0, EntryStatus.ADDED.id()); + manifestTracking.set(1, 100L); + manifestTracking.set(2, 50L); + manifestTracking.set(3, 60L); + + assertThatThrownBy(() -> tracking.inheritFrom(manifestTracking)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Manifest data and file sequence numbers must be equal, got 50 and 60"); + } + + @Test + void testNoDefaultingWithoutInheritance() { + TrackingStruct tracking = TrackingStruct.builder().status(EntryStatus.ADDED).build(); + + // no inheritance, nulls stay null + assertThat(tracking.snapshotId()).isNull(); + assertThat(tracking.dataSequenceNumber()).isNull(); + assertThat(tracking.fileSequenceNumber()).isNull(); + } + + private static Tracking createManifestTracking(long snapshotId, long sequenceNumber) { + return TrackingStruct.builder() + .status(EntryStatus.ADDED) + .snapshotId(snapshotId) + .dataSequenceNumber(sequenceNumber) + .fileSequenceNumber(sequenceNumber) + .build(); + } + + @Test + void testBuilderValidation() { + assertThatThrownBy(() -> TrackingStruct.builder().build()) + 
.isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid status: null"); + } + + @Test + void testProjectedStructLike() { + // project only snapshot_id (field ID 1) and first_row_id (field ID 142) + Types.StructType projection = Types.StructType.of(Tracking.SNAPSHOT_ID, Tracking.FIRST_ROW_ID); + + TrackingStruct tracking = new TrackingStruct(projection); + assertThat(tracking.size()).isEqualTo(2); + + // projected position 0 maps to internal position 1 (snapshot_id) + // projected position 1 maps to internal position 5 (first_row_id) + tracking.set(0, 42L); + tracking.set(1, 1000L); + + assertThat(tracking.snapshotId()).isEqualTo(42L); + assertThat(tracking.firstRowId()).isEqualTo(1000L); + assertThat(tracking.get(0, Long.class)).isEqualTo(42L); + assertThat(tracking.get(1, Long.class)).isEqualTo(1000L); + } + + @Test + void testJavaSerializationRoundTrip() throws IOException, ClassNotFoundException { + TrackingStruct tracking = + TrackingStruct.builder() + .status(EntryStatus.ADDED) + .snapshotId(42L) + .dataSequenceNumber(10L) + .deletedPositions(new byte[] {1, 2}) + .build(); + + TrackingStruct deserialized = TestHelpers.roundTripSerialize(tracking); + + assertThat(deserialized.status()).isEqualTo(EntryStatus.ADDED); + assertThat(deserialized.snapshotId()).isEqualTo(42L); + assertThat(deserialized.dataSequenceNumber()).isEqualTo(10L); + assertThat(deserialized.deletedPositions()).isEqualTo(ByteBuffer.wrap(new byte[] {1, 2})); + } + + @Test + void testKryoSerializationRoundTrip() throws IOException { + TrackingStruct tracking = + TrackingStruct.builder() + .status(EntryStatus.ADDED) + .snapshotId(42L) + .dataSequenceNumber(10L) + .deletedPositions(new byte[] {1, 2}) + .build(); + + TrackingStruct deserialized = TestHelpers.KryoHelpers.roundTripSerialize(tracking); + + assertThat(deserialized.status()).isEqualTo(EntryStatus.ADDED); + assertThat(deserialized.snapshotId()).isEqualTo(42L); + assertThat(deserialized.dataSequenceNumber()).isEqualTo(10L); + 
assertThat(deserialized.deletedPositions()).isEqualTo(ByteBuffer.wrap(new byte[] {1, 2})); + } +} diff --git a/core/src/test/java/org/apache/iceberg/avro/AvroTestHelpers.java b/core/src/test/java/org/apache/iceberg/avro/AvroTestHelpers.java index 0a1cf43f4fb5..fd73706ce082 100644 --- a/core/src/test/java/org/apache/iceberg/avro/AvroTestHelpers.java +++ b/core/src/test/java/org/apache/iceberg/avro/AvroTestHelpers.java @@ -177,4 +177,12 @@ public static String readAvroCodec(File file) throws IOException { return reader.getMetaString("avro.codec"); } } + + public static boolean hasIds(Schema schema) { + return AvroSchemaUtil.hasIds(schema); + } + + public static Schema removeIds(org.apache.iceberg.Schema schema) { + return RemoveIds.removeIds(schema); + } } diff --git a/core/src/test/java/org/apache/iceberg/catalog/CatalogTests.java b/core/src/test/java/org/apache/iceberg/catalog/CatalogTests.java index 833b2fb0b46f..8997cf15a08c 100644 --- a/core/src/test/java/org/apache/iceberg/catalog/CatalogTests.java +++ b/core/src/test/java/org/apache/iceberg/catalog/CatalogTests.java @@ -72,6 +72,8 @@ import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.expressions.Literal; import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.InputFile; import org.apache.iceberg.metrics.CommitReport; import org.apache.iceberg.metrics.MetricsReport; import org.apache.iceberg.metrics.MetricsReporter; @@ -429,6 +431,44 @@ public void testDropNonEmptyNamespace() { assertThat(catalog.namespaceExists(NS)).as("Namespace should not exist").isFalse(); } + @Test + public void testDropNamespaceWithNestedNamespace() { + assumeThat(supportsNestedNamespaces()) + .as("Only valid when the catalog supports nested namespaces") + .isTrue(); + + C catalog = catalog(); + + Namespace parent = Namespace.of("parent"); + Namespace nested = Namespace.of("parent", "child"); + + assertThat(catalog.namespaceExists(parent)).as("Parent 
namespace should not exist").isFalse(); + assertThat(catalog.namespaceExists(nested)).as("Nested namespace should not exist").isFalse(); + + catalog.createNamespace(parent); + catalog.createNamespace(nested); + + assertThat(catalog.namespaceExists(parent)).as("Parent namespace should exist").isTrue(); + assertThat(catalog.namespaceExists(nested)).as("Nested namespace should exist").isTrue(); + + assertThatThrownBy(() -> catalog.dropNamespace(parent)) + .isInstanceOf(NamespaceNotEmptyException.class) + .hasMessageContaining("is not empty"); + + assertThat(catalog.namespaceExists(parent)).as("Parent namespace should still exist").isTrue(); + assertThat(catalog.namespaceExists(nested)).as("Nested namespace should still exist").isTrue(); + + assertThat(catalog.dropNamespace(nested)) + .as("Dropping an existing nested namespace should return true") + .isTrue(); + assertThat(catalog.namespaceExists(nested)).as("Nested namespace should not exist").isFalse(); + + assertThat(catalog.dropNamespace(parent)) + .as("Dropping an existing namespace should return true") + .isTrue(); + assertThat(catalog.namespaceExists(parent)).as("Parent namespace should not exist").isFalse(); + } + @Test public void testListNamespaces() { C catalog = catalog(); @@ -1025,6 +1065,86 @@ public void testRenameTable() { assertEmpty("Should not contain table after drop", catalog, NS); } + @Test + public void createTableInUniqueLocation() { + Map additionalProperties = + ImmutableMap.of(CatalogProperties.UNIQUE_TABLE_LOCATION, "true"); + C catalog = initCatalog("uniq_path_catalog", additionalProperties); + + if (requiresNamespaceCreate()) { + catalog.createNamespace(NS); + } + + catalog.createTable(TABLE, SCHEMA, PartitionSpec.unpartitioned()); + catalog.renameTable(TABLE, RENAMED_TABLE); + catalog.createTable(TABLE, SCHEMA, PartitionSpec.unpartitioned()); + + Table table = catalog.loadTable(TABLE); + Table renamedTable = catalog.loadTable(RENAMED_TABLE); + + assertThat(table.location()) + .as("Tables 
%s and %s have different location", TABLE, RENAMED_TABLE) + .isNotEqualTo(renamedTable.location()); + } + + @Test + public void dropAfterRenameDoesntCorruptTable() throws IOException { + C catalog = catalog(); + + if (requiresNamespaceCreate()) { + catalog.createNamespace(TABLE.namespace()); + } + + PartitionSpec spec = PartitionSpec.unpartitioned(); + + Table initialTable = catalog.createTable(TABLE, SCHEMA, spec); + String initialFilePath = initialTable.locationProvider().newDataLocation("data-a.parquet"); + DataFile dataFile = + DataFiles.builder(spec) + .withPath(initialFilePath) + .withFileSizeInBytes(10) + .withRecordCount(2) + .build(); + initialTable.io().newOutputFile(initialFilePath).create().close(); + initialTable.newAppend().appendFile(dataFile).commit(); + + catalog.renameTable(TABLE, RENAMED_TABLE); + + Table newTable = catalog.createTable(TABLE, SCHEMA, spec); + String newFilePath = newTable.locationProvider().newDataLocation("data-b.parquet"); + DataFile anotherFile = + DataFiles.builder(spec) + .withPath(newFilePath) + .withFileSizeInBytes(10) + .withRecordCount(2) + .build(); + newTable.io().newOutputFile(newFilePath).create().close(); + newTable.newAppend().appendFile(anotherFile).commit(); + + catalog.dropTable(RENAMED_TABLE, true); + + assertThat(catalog.tableExists(RENAMED_TABLE)) + .as("After PURGE, %s must not exist", RENAMED_TABLE) + .isFalse(); + assertThat(catalog.tableExists(TABLE)) + .as( + "After dropping the renamed table with PURGE, the recreated table with the original name (%s) must exist", + TABLE) + .isTrue(); + + Table table = catalog.loadTable(TABLE); + FileIO io = table.io(); + try (CloseableIterable tasks = table.newScan().planFiles()) { + tasks.forEach( + task -> { + InputFile file = io.newInputFile(task.file().location()); + assertThat(file.exists()) + .as("Table %s should remain unaffected by dropping %s", TABLE, RENAMED_TABLE) + .isTrue(); + }); + } + } + @Test public void testRenameTableMissingSourceTable() { C catalog 
= catalog(); diff --git a/data/src/test/java/org/apache/iceberg/data/DataTestHelpers.java b/core/src/test/java/org/apache/iceberg/data/DataTestHelpers.java similarity index 100% rename from data/src/test/java/org/apache/iceberg/data/DataTestHelpers.java rename to core/src/test/java/org/apache/iceberg/data/DataTestHelpers.java diff --git a/core/src/test/java/org/apache/iceberg/deletes/TestRoaringPositionBitmap.java b/core/src/test/java/org/apache/iceberg/deletes/TestRoaringPositionBitmap.java index 2daf0382973b..68b73ed0a218 100644 --- a/core/src/test/java/org/apache/iceberg/deletes/TestRoaringPositionBitmap.java +++ b/core/src/test/java/org/apache/iceberg/deletes/TestRoaringPositionBitmap.java @@ -148,9 +148,116 @@ public void testAddRangeAcrossKeys() { @TestTemplate public void testAddEmptyRange() { + RoaringPositionBitmap equalRange = new RoaringPositionBitmap(); + equalRange.setRange(10, 10); + assertThat(equalRange.isEmpty()).isTrue(); + assertThat(equalRange.cardinality()).isEqualTo(0); + assertThat(equalRange.contains(10)).isFalse(); + } + + @TestTemplate + public void testSetRangeReversedThrows() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + assertThatThrownBy(() -> bitmap.setRange(100, 50)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Start position must not exceed end position"); + } + + @TestTemplate + public void testAddRangeLargeContiguous() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + + long start = 500L; + long end = 200_500L; + bitmap.setRange(start, end); + + assertThat(bitmap.cardinality()).isEqualTo(200_000L); + assertThat(bitmap.contains(start)).isTrue(); + assertThat(bitmap.contains(end - 1)).isTrue(); + assertThat(bitmap.contains(start - 1)).isFalse(); + assertThat(bitmap.contains(end)).isFalse(); + } + + @TestTemplate + public void testAddRangeSpanningThreeKeys() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + + long start = ((long) 0 << 32) | 0xFFFFFFF0L; + 
long end = ((long) 2 << 32) | 0x10L; + bitmap.setRange(start, end); + + assertThat(bitmap.contains(start)).isTrue(); + assertThat(bitmap.contains(end - 1)).isTrue(); + assertThat(bitmap.contains(start - 1)).isFalse(); + assertThat(bitmap.contains(end)).isFalse(); + + // key 1 should be fully covered + assertThat(bitmap.contains((long) 1 << 32)).isTrue(); + assertThat(bitmap.contains(((long) 1 << 32) | 0xFFFFFFFFL)).isTrue(); + + long expectedCardinality = end - start; + assertThat(bitmap.cardinality()).isEqualTo(expectedCardinality); + } + + @TestTemplate + public void testAddRangeSinglePosition() { + RoaringPositionBitmap rangeBitmap = new RoaringPositionBitmap(); + rangeBitmap.setRange(42, 43); + + RoaringPositionBitmap setBitmap = new RoaringPositionBitmap(); + setBitmap.set(42); + + assertThat(rangeBitmap.cardinality()).isEqualTo(setBitmap.cardinality()); + assertThat(rangeBitmap.contains(42)).isTrue(); + assertThat(rangeBitmap.contains(41)).isFalse(); + assertThat(rangeBitmap.contains(43)).isFalse(); + } + + @TestTemplate + public void testAddRangeAtKeyBoundary() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + + bitmap.setRange(0L, 1L << 32); + + assertThat(bitmap.cardinality()).isEqualTo(1L << 32); + assertThat(bitmap.contains(0L)).isTrue(); + assertThat(bitmap.contains((1L << 32) - 1)).isTrue(); + assertThat(bitmap.contains(1L << 32)).isFalse(); + assertThat(bitmap.allocatedBitmapCount()).isEqualTo(1); + } + + @TestTemplate + public void testAddRangeSameKeyForEachExact() { RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); - bitmap.setRange(10, 10); - assertThat(bitmap.isEmpty()).isTrue(); + + long start = 1000L; + long end = 1200L; + bitmap.setRange(start, end); + + assertThat(bitmap.cardinality()).isEqualTo(end - start); + assertThat(bitmap.contains(start - 1)).isFalse(); + assertThat(bitmap.contains(end)).isFalse(); + + for (long pos = start; pos < end; pos++) { + assertThat(bitmap.contains(pos)).isTrue(); + } + } + + 
@TestTemplate + public void testAddRangeCrossKeyForEachExact() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + + long start = ((long) 1 << 32) - 100L; + long end = ((long) 1 << 32) + 100L; + bitmap.setRange(start, end); + + assertThat(bitmap.cardinality()).isEqualTo(end - start); + assertThat(bitmap.contains(start - 1)).isFalse(); + assertThat(bitmap.contains(end)).isFalse(); + + for (long pos = start; pos < end; pos++) { + assertThat(bitmap.contains(pos)).isTrue(); + } } @TestTemplate @@ -357,6 +464,18 @@ public void testUnsupportedPositions() { .hasMessageContaining( "Bitmap supports positions that are >= 0 and <= %s", RoaringPositionBitmap.MAX_POSITION); + + assertThatThrownBy(() -> bitmap.setRange(-1L, 1L)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + "Bitmap supports positions that are >= 0 and <= %s", + RoaringPositionBitmap.MAX_POSITION); + + assertThatThrownBy(() -> bitmap.setRange(0L, RoaringPositionBitmap.MAX_POSITION + 2L)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + "Bitmap supports positions that are >= 0 and <= %s", + RoaringPositionBitmap.MAX_POSITION); } @TestTemplate diff --git a/core/src/test/java/org/apache/iceberg/inmemory/TestInMemoryCatalog.java b/core/src/test/java/org/apache/iceberg/inmemory/TestInMemoryCatalog.java index c2c683e7d882..827450d4a398 100644 --- a/core/src/test/java/org/apache/iceberg/inmemory/TestInMemoryCatalog.java +++ b/core/src/test/java/org/apache/iceberg/inmemory/TestInMemoryCatalog.java @@ -82,6 +82,11 @@ protected boolean supportsEmptyNamespace() { return true; } + @Override + protected boolean supportsNestedNamespaces() { + return true; + } + @Test @Override public void testLoadTableWithMissingMetadataFile(@TempDir Path tempDir) throws IOException { diff --git a/core/src/test/java/org/apache/iceberg/io/TestBufferedFileAppender.java b/core/src/test/java/org/apache/iceberg/io/TestBufferedFileAppender.java new file mode 100644 index 
000000000000..9bbc0f9f8c71 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/io/TestBufferedFileAppender.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.io; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.util.List; +import java.util.function.Function; +import org.apache.iceberg.Schema; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.avro.AvroIterable; +import org.apache.iceberg.data.DataTestHelpers; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.avro.DataWriter; +import org.apache.iceberg.data.avro.PlannedDataReader; +import org.apache.iceberg.exceptions.RuntimeIOException; +import org.apache.iceberg.inmemory.InMemoryOutputFile; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class TestBufferedFileAppender { + + 
private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); + + private InMemoryOutputFile outputFile; + private GenericRecord record; + + @BeforeEach + public void before() { + this.outputFile = new InMemoryOutputFile(); + this.record = GenericRecord.create(SCHEMA); + } + + private Function, FileAppender> avroFactory(OutputFile out) { + return bufferedRows -> { + try { + return Avro.write(out) + .createWriterFunc(DataWriter::create) + .schema(SCHEMA) + .overwrite() + .build(); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + }; + } + + private BufferedFileAppender createAppender(int bufferSize) { + return new BufferedFileAppender<>(bufferSize, avroFactory(outputFile), Record::copy); + } + + private Record createRecord(long id, String data) { + return record.copy(ImmutableMap.of("id", id, "data", data)); + } + + private List readBack() throws IOException { + try (AvroIterable reader = + Avro.read(outputFile.toInputFile()) + .project(SCHEMA) + .createResolvingReader(PlannedDataReader::create) + .build()) { + return Lists.newArrayList(reader); + } + } + + @Test + public void testBufferFlushesOnThreshold() throws IOException { + BufferedFileAppender appender = createAppender(3); + + appender.add(createRecord(1L, "a")); + appender.add(createRecord(2L, "b")); + + // delegate not yet created, length should be 0 + assertThat(appender.length()).isEqualTo(0L); + + appender.add(createRecord(3L, "c")); + + // delegate created after 3rd row, length should be > 0 + assertThat(appender.length()).isGreaterThan(0L); + + appender.add(createRecord(4L, "d")); + appender.add(createRecord(5L, "e")); + appender.close(); + + List expected = + Lists.newArrayList( + createRecord(1L, "a"), + createRecord(2L, "b"), + createRecord(3L, "c"), + createRecord(4L, "d"), + createRecord(5L, "e")); + DataTestHelpers.assertEquals(SCHEMA.asStruct(), expected, 
readBack()); + } + + @Test + public void testCloseWithPartialBuffer() throws IOException { + BufferedFileAppender appender = createAppender(10); + + appender.add(createRecord(1L, "a")); + appender.add(createRecord(2L, "b")); + appender.add(createRecord(3L, "c")); + + // buffer not full yet + assertThat(appender.length()).isEqualTo(0L); + + // close flushes partial buffer through factory + appender.close(); + + List expected = + Lists.newArrayList(createRecord(1L, "a"), createRecord(2L, "b"), createRecord(3L, "c")); + DataTestHelpers.assertEquals(SCHEMA.asStruct(), expected, readBack()); + } + + @Test + public void testCopyFuncIsApplied() throws IOException { + BufferedFileAppender appender = createAppender(3); + + // use a single mutable record, relying on copyFunc to snapshot it + record.set(0, 1L); + record.set(1, "first"); + appender.add(record); + + record.set(0, 2L); + record.set(1, "second"); + appender.add(record); + + record.set(0, 3L); + record.set(1, "third"); + appender.add(record); + + appender.close(); + + List expected = + Lists.newArrayList( + createRecord(1L, "first"), createRecord(2L, "second"), createRecord(3L, "third")); + DataTestHelpers.assertEquals(SCHEMA.asStruct(), expected, readBack()); + } + + @Test + public void testMetricsAfterClose() throws IOException { + BufferedFileAppender appender = createAppender(2); + + appender.add(createRecord(1L, "a")); + appender.add(createRecord(2L, "b")); + appender.add(createRecord(3L, "c")); + appender.close(); + + assertThat(appender.metrics()).isNotNull(); + assertThat(appender.metrics().recordCount()).isEqualTo(3L); + assertThat(appender.length()).isGreaterThan(0L); + } + + @Test + public void testMetricsBeforeCloseThrows() throws IOException { + try (BufferedFileAppender appender = createAppender(10)) { + assertThatThrownBy(appender::metrics) + .isInstanceOf(IllegalStateException.class) + .hasMessage("Cannot return metrics for unclosed appender"); + } + } + + @Test + public void 
testAddAfterCloseThrows() throws IOException { + try (BufferedFileAppender appender = createAppender(10)) { + appender.add(createRecord(1L, "a")); + appender.close(); + + assertThatThrownBy(() -> appender.add(createRecord(2L, "b"))) + .isInstanceOf(IllegalStateException.class) + .hasMessage("Cannot add to a closed appender"); + } + } + + @Test + public void testAddAllSpanningBuffer() throws IOException { + BufferedFileAppender appender = createAppender(2); + + List records = + Lists.newArrayList( + createRecord(1L, "a"), + createRecord(2L, "b"), + createRecord(3L, "c"), + createRecord(4L, "d")); + + appender.addAll(records); + appender.close(); + + DataTestHelpers.assertEquals(SCHEMA.asStruct(), records, readBack()); + } + + @Test + public void testCloseWithNoData() throws IOException { + BufferedFileAppender appender = createAppender(10); + // close immediately with no data written + appender.close(); + // delegate was never created + assertThat(appender.length()).isEqualTo(0L); + assertThat(appender.metrics()).isNotNull(); + assertThat(appender.metrics().recordCount()).isEqualTo(0L); + assertThat(appender.splitOffsets()).isNull(); + } +} diff --git a/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcCatalog.java b/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcCatalog.java index 310d918849f3..ff0af5c56306 100644 --- a/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcCatalog.java +++ b/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcCatalog.java @@ -852,11 +852,11 @@ public void testDropNamespace() { assertThatThrownBy(() -> catalog.dropNamespace(tbl2.namespace())) .isInstanceOf(NamespaceNotEmptyException.class) - .hasMessage("Namespace db.ns1 is not empty. Contains 1 table(s)."); + .hasMessage("Namespace db.ns1 is not empty. Contains 1 child namespace(s)."); assertThatThrownBy(() -> catalog.dropNamespace(tbl4.namespace())) .isInstanceOf(NamespaceNotEmptyException.class) - .hasMessage("Namespace db is not empty. 
Contains 1 table(s)."); + .hasMessage("Namespace db is not empty. Contains 2 child namespace(s)."); } @Test diff --git a/core/src/test/java/org/apache/iceberg/rest/RESTCatalogAdapter.java b/core/src/test/java/org/apache/iceberg/rest/RESTCatalogAdapter.java index 8ba5daef3f9b..8c6dc52b1575 100644 --- a/core/src/test/java/org/apache/iceberg/rest/RESTCatalogAdapter.java +++ b/core/src/test/java/org/apache/iceberg/rest/RESTCatalogAdapter.java @@ -746,7 +746,7 @@ private static SnapshotMode snapshotModeFromQueryParams(Map quer queryParams .getOrDefault( RESTCatalogProperties.SNAPSHOTS_QUERY_PARAMETER, - RESTCatalogProperties.SNAPSHOT_LOADING_MODE_DEFAULT) + RESTCatalogProperties.SNAPSHOT_LOADING_MODE_DEFAULT.name()) .toUpperCase(Locale.US)); } } diff --git a/core/src/test/java/org/apache/iceberg/rest/RemoteSignerServlet.java b/core/src/test/java/org/apache/iceberg/rest/RemoteSignerServlet.java new file mode 100644 index 000000000000..c55224c00b2f --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/rest/RemoteSignerServlet.java @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.rest; + +import static java.lang.String.format; +import static org.apache.iceberg.rest.RESTCatalogAdapter.castRequest; +import static org.apache.iceberg.rest.RESTCatalogAdapter.castResponse; + +import jakarta.servlet.http.HttpServlet; +import jakarta.servlet.http.HttpServletRequest; +import jakarta.servlet.http.HttpServletResponse; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import org.apache.hc.core5.http.ContentType; +import org.apache.hc.core5.http.HttpHeaders; +import org.apache.iceberg.exceptions.RESTException; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.io.CharStreams; +import org.apache.iceberg.rest.requests.RemoteSignRequest; +import org.apache.iceberg.rest.responses.OAuthTokenResponse; +import org.apache.iceberg.rest.responses.RemoteSignResponse; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Base servlet for remote signing tests. This servlet handles OAuth token requests and delegates + * signing to subclasses. It does not handle any other requests. + * + *

Subclasses must implement {@link #signRequest(RemoteSignRequest)} to provide the actual + * signing logic. + */ +public abstract class RemoteSignerServlet extends HttpServlet { + + private static final Logger LOG = LoggerFactory.getLogger(RemoteSignerServlet.class); + private static final String POST = "POST"; + + private static final String CACHE_CONTROL = "Cache-Control"; + private static final String CACHE_CONTROL_PRIVATE = "private"; + private static final String CACHE_CONTROL_NO_CACHE = "no-cache"; + + private static final Set CACHEABLE_METHODS = Set.of("GET", "HEAD"); + + private static final Map RESPONSE_HEADERS = + ImmutableMap.of(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); + + private final String signEndpoint; + + protected RemoteSignerServlet(String signEndpoint) { + this.signEndpoint = signEndpoint; + } + + @Override + protected void doGet(HttpServletRequest request, HttpServletResponse response) { + execute(request, response); + } + + @Override + protected void doHead(HttpServletRequest request, HttpServletResponse response) { + execute(request, response); + } + + @Override + protected void doPost(HttpServletRequest request, HttpServletResponse response) { + execute(request, response); + } + + @Override + protected void doDelete(HttpServletRequest request, HttpServletResponse response) { + execute(request, response); + } + + /** + * Sign the given request and return the signed response. + * + * @param request the remote sign request + * @return the signed response + */ + protected abstract RemoteSignResponse signRequest(RemoteSignRequest request); + + /** + * Called after a sign request is parsed but before signing. Subclasses can override to add + * additional validation. + * + * @param request the remote sign request + */ + protected void validateSignRequest(RemoteSignRequest request) { + // no-op by default + } + + /** + * Called after signing to allow subclasses to add response headers (e.g., cache control). 
By + * default, this method adds cache control headers based on the request method. + * + * @param request the original sign request + * @param response the HTTP response to add headers to + */ + protected void addSignResponseHeaders(RemoteSignRequest request, HttpServletResponse response) { + if (CACHEABLE_METHODS.contains(request.method().toUpperCase(Locale.ROOT))) { + // tell the client this can be cached + response.setHeader(CACHE_CONTROL, CACHE_CONTROL_PRIVATE); + } else { + response.setHeader(CACHE_CONTROL, CACHE_CONTROL_NO_CACHE); + } + } + + private OAuthTokenResponse handleOAuth(Map requestMap) { + String grantType = requestMap.get("grant_type"); + switch (grantType) { + case "client_credentials": + return castResponse( + OAuthTokenResponse.class, + OAuthTokenResponse.builder() + .withToken("client-credentials-token:sub=" + requestMap.get("client_id")) + .withIssuedTokenType("urn:ietf:params:oauth:token-type:access_token") + .withTokenType("Bearer") + .setExpirationInSeconds(10000) + .build()); + + case "urn:ietf:params:oauth:grant-type:token-exchange": + String actor = requestMap.get("actor_token"); + String token = + String.format( + "token-exchange-token:sub=%s%s", + requestMap.get("subject_token"), actor != null ? 
",act=" + actor : ""); + return castResponse( + OAuthTokenResponse.class, + OAuthTokenResponse.builder() + .withToken(token) + .withIssuedTokenType("urn:ietf:params:oauth:token-type:access_token") + .withTokenType("Bearer") + .setExpirationInSeconds(10000) + .build()); + + default: + throw new UnsupportedOperationException("Unsupported grant_type: " + grantType); + } + } + + protected void execute(HttpServletRequest request, HttpServletResponse response) { + response.setStatus(HttpServletResponse.SC_OK); + RESPONSE_HEADERS.forEach(response::setHeader); + + String path = request.getRequestURI().substring(1); + Object requestBody; + try { + if (POST.equals(request.getMethod()) && signEndpoint.equals(path)) { + RemoteSignRequest signRequest = + castRequest( + RemoteSignRequest.class, + RESTObjectMapper.mapper().readValue(request.getReader(), RemoteSignRequest.class)); + validateSignRequest(signRequest); + RemoteSignResponse signResponse = signRequest(signRequest); + addSignResponseHeaders(signRequest, response); + RESTObjectMapper.mapper().writeValue(response.getWriter(), signResponse); + } else if (POST.equals(request.getMethod()) && ResourcePaths.tokens().equals(path)) { + try (Reader reader = new InputStreamReader(request.getInputStream())) { + requestBody = RESTUtil.decodeFormData(CharStreams.toString(reader)); + } + + @SuppressWarnings("unchecked") + OAuthTokenResponse oAuthTokenResponse = + handleOAuth((Map) castRequest(Map.class, requestBody)); + RESTObjectMapper.mapper().writeValue(response.getWriter(), oAuthTokenResponse); + } else { + response.setStatus(HttpServletResponse.SC_BAD_REQUEST); + RESTObjectMapper.mapper() + .writeValue( + response.getWriter(), + org.apache.iceberg.rest.responses.ErrorResponse.builder() + .responseCode(400) + .withType("BadRequestException") + .withMessage(format("No route for request: %s %s", request.getMethod(), path)) + .build()); + } + } catch (RESTException e) { + LOG.error("Error processing REST request", e); + 
response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR); + } catch (Exception e) { + LOG.error("Unexpected exception when processing REST request", e); + response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR); + } + } +} diff --git a/core/src/test/java/org/apache/iceberg/rest/TestBaseWithRESTServer.java b/core/src/test/java/org/apache/iceberg/rest/TestBaseWithRESTServer.java index a79977c2464e..9cab8b1f240e 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestBaseWithRESTServer.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestBaseWithRESTServer.java @@ -34,10 +34,11 @@ import org.apache.iceberg.inmemory.InMemoryCatalog; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.rest.responses.ErrorResponse; +import org.eclipse.jetty.compression.gzip.GzipCompression; +import org.eclipse.jetty.compression.server.CompressionHandler; +import org.eclipse.jetty.ee10.servlet.ServletContextHandler; +import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Server; -import org.eclipse.jetty.server.handler.gzip.GzipHandler; -import org.eclipse.jetty.servlet.ServletContextHandler; -import org.eclipse.jetty.servlet.ServletHolder; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.io.TempDir; @@ -61,6 +62,15 @@ public abstract class TestBaseWithRESTServer { @TempDir private Path temp; + /** + * GZIP responses interfere with freshness-aware loading tests that assert on {@code ETag} and + * conditional requests. Subclasses may disable HTTP compression while keeping the default for + * other REST catalog tests. 
+ */ + protected boolean useHttpCompression() { + return true; + } + @BeforeEach public void before() throws Exception { File warehouse = temp.toFile(); @@ -75,7 +85,11 @@ public void before() throws Exception { new ServletContextHandler(ServletContextHandler.NO_SESSIONS); servletContext.addServlet( new ServletHolder(new RESTCatalogServlet(adapterForRESTServer)), "/*"); - servletContext.setHandler(new GzipHandler()); + if (useHttpCompression()) { + CompressionHandler compressionHandler = new CompressionHandler(); + compressionHandler.putCompression(new GzipCompression()); + servletContext.insertHandler(compressionHandler); + } this.httpServer = new Server(new InetSocketAddress(InetAddress.getLoopbackAddress(), 0)); httpServer.setHandler(servletContext); diff --git a/core/src/test/java/org/apache/iceberg/rest/TestErrorHandlers.java b/core/src/test/java/org/apache/iceberg/rest/TestErrorHandlers.java index 8bf62c3c6cf5..b7bbe337cd27 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestErrorHandlers.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestErrorHandlers.java @@ -20,7 +20,9 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; +import org.apache.iceberg.exceptions.NoSuchWarehouseException; import org.apache.iceberg.exceptions.RESTException; +import org.apache.iceberg.exceptions.ServiceFailureException; import org.apache.iceberg.rest.responses.ErrorResponse; import org.junit.jupiter.api.Test; @@ -68,4 +70,38 @@ public void errorHandlerWithCodeAndTypeOnly() { .isInstanceOf(RESTException.class) .hasMessage("Unable to process (code: 422, type: ValidationException): null"); } + + @Test + public void testConfigErrorHandler404ThrowsNoSuchWarehouseException() { + ErrorResponse error = + ErrorResponse.builder() + .responseCode(404) + .withType("NotFoundException") + .withMessage("Warehouse not found") + .build(); + + assertThatThrownBy(() -> ErrorHandlers.configErrorHandler().accept(error)) + .isInstanceOf(NoSuchWarehouseException.class) + 
.hasMessage("Warehouse not found"); + } + + @Test + public void testConfigErrorHandler404ForMisconfiguredUri() { + ErrorResponse error = + ErrorResponse.builder().responseCode(404).withMessage("Not Found").build(); + + assertThatThrownBy(() -> ErrorHandlers.configErrorHandler().accept(error)) + .isInstanceOf(RESTException.class) + .hasMessageContaining("Not Found"); + } + + @Test + public void testConfigErrorHandlerDelegatesToDefaultForNon404() { + ErrorResponse error = + ErrorResponse.builder().responseCode(500).withMessage("Internal server error").build(); + + assertThatThrownBy(() -> ErrorHandlers.configErrorHandler().accept(error)) + .isInstanceOf(ServiceFailureException.class) + .hasMessageContaining("Internal server error"); + } } diff --git a/core/src/test/java/org/apache/iceberg/rest/TestFreshnessAwareLoading.java b/core/src/test/java/org/apache/iceberg/rest/TestFreshnessAwareLoading.java index 80981df1fcb3..a4bb170d1411 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestFreshnessAwareLoading.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestFreshnessAwareLoading.java @@ -67,6 +67,11 @@ import org.mockito.stubbing.Answer; public class TestFreshnessAwareLoading extends TestBaseWithRESTServer { + @Override + protected boolean useHttpCompression() { + return false; + } + private static final ResourcePaths RESOURCE_PATHS = ResourcePaths.forCatalogProperties( ImmutableMap.of( diff --git a/core/src/test/java/org/apache/iceberg/rest/TestHTTPClient.java b/core/src/test/java/org/apache/iceberg/rest/TestHTTPClient.java index 8cf97bca32ef..701ae699f136 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestHTTPClient.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestHTTPClient.java @@ -35,22 +35,30 @@ import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.net.SocketTimeoutException; +import java.nio.file.Path; +import java.security.KeyStore; +import java.security.cert.CertificateException; import 
java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; +import javax.net.ssl.HostnameVerifier; +import javax.net.ssl.SSLContext; +import javax.net.ssl.TrustManagerFactory; import org.apache.hc.client5.http.auth.AuthScope; import org.apache.hc.client5.http.auth.UsernamePasswordCredentials; import org.apache.hc.client5.http.config.ConnectionConfig; import org.apache.hc.client5.http.impl.auth.BasicCredentialsProvider; import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager; import org.apache.hc.client5.http.io.HttpClientConnectionManager; +import org.apache.hc.client5.http.ssl.NoopHostnameVerifier; import org.apache.hc.core5.http.HttpHost; import org.apache.hc.core5.http.HttpStatus; import org.apache.iceberg.IcebergBuild; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.rest.auth.AuthSession; import org.apache.iceberg.rest.auth.TLSConfigurer; import org.apache.iceberg.rest.responses.ErrorResponse; @@ -58,14 +66,17 @@ import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.EnumSource; import org.junit.jupiter.params.provider.ValueSource; import org.mockserver.configuration.Configuration; import org.mockserver.integration.ClientAndServer; +import org.mockserver.logging.MockServerLogger; import org.mockserver.matchers.Times; import org.mockserver.model.HttpRequest; import org.mockserver.model.HttpResponse; +import org.mockserver.socket.tls.KeyStoreFactory; import org.mockserver.verify.VerificationTimes; /** @@ -87,6 +98,7 @@ public class TestHTTPClient { private static RESTClient restClient; public static 
class DefaultTLSConfigurer implements TLSConfigurer { + public static int count = 0; public DefaultTLSConfigurer() { @@ -95,6 +107,7 @@ public DefaultTLSConfigurer() { } public static class TLSConfigurerMissingNoArgCtor implements TLSConfigurer { + TLSConfigurerMissingNoArgCtor(String str) {} } @@ -395,6 +408,101 @@ public void testLoadTLSConfigurerNotImplementTLSConfigurer() { .hasMessageContaining("does not implement TLSConfigurer"); } + /** A TLSConfigurer that relies on the default (built-in) JSSE verifier. */ + public static class BuiltInHostnameVerifierTLSConfigurer implements TLSConfigurer { + + @Override + public SSLContext sslContext() { + return mockServerSSLContext(); + } + } + + /** A TLSConfigurer that overrides hostnameVerifier() to return a custom verifier. */ + public static class CustomHostnameVerifierTLSConfigurer implements TLSConfigurer { + + @Override + public SSLContext sslContext() { + return mockServerSSLContext(); + } + + @Override + public HostnameVerifier hostnameVerifier() { + return NoopHostnameVerifier.INSTANCE; + } + } + + private static SSLContext mockServerSSLContext() { + try { + KeyStore keyStore = + new KeyStoreFactory(Configuration.configuration(), new MockServerLogger()) + .loadOrCreateKeyStore(); + TrustManagerFactory tmf = + TrustManagerFactory.getInstance(TrustManagerFactory.getDefaultAlgorithm()); + tmf.init(keyStore); + SSLContext sslContext = SSLContext.getInstance("TLSv1.2"); + sslContext.init(null, tmf.getTrustManagers(), null); + return sslContext; + } catch (Exception e) { + throw new RuntimeException("Failed to create SSLContext", e); + } + } + + @Test + public void testTLSConfigurerHostnameVerifier(@TempDir Path temp) throws IOException { + + // Start a dedicated MockServer with a certificate that does NOT include + // 127.0.0.1 or localhost in its SANs. 
+ Configuration tlsConfig = Configuration.configuration(); + tlsConfig.proactivelyInitialiseTLS(true); + tlsConfig.preventCertificateDynamicUpdate(true); + tlsConfig.sslCertificateDomainName("example.com"); + tlsConfig.sslSubjectAlternativeNameIps(Sets.newHashSet("1.2.3.4")); + tlsConfig.sslSubjectAlternativeNameDomains(Sets.newHashSet("example.com")); + tlsConfig.directoryToSaveDynamicSSLCertificate(temp.toFile().getAbsolutePath()); + + int tlsPort = PORT + 1; + try (ClientAndServer server = startClientAndServer(tlsConfig, tlsPort)) { + + String path = "tls/hostname-verifier/path"; + HttpRequest mockRequest = + request() + .withPath("/" + path) + .withMethod(HttpMethod.HEAD.name().toUpperCase(Locale.ROOT)); + HttpResponse mockResponse = response().withStatusCode(200).withBody("TLS response"); + server.when(mockRequest).respond(mockResponse); + + // With no custom hostnameVerifier (null), the BUILTIN policy is used automatically, + // so the JSSE built-in verifier rejects the connection because the SANs don't match + try (HTTPClient builtInVerifierClient = + HTTPClient.builder( + Map.of( + HTTPClient.REST_TLS_CONFIGURER, + BuiltInHostnameVerifierTLSConfigurer.class.getName())) + .uri(String.format("https://127.0.0.1:%d", tlsPort)) + .withAuthSession(AuthSession.EMPTY) + .build()) { + assertThatThrownBy(() -> builtInVerifierClient.head(path, Map.of(), (unused) -> {})) + .rootCause() + .isInstanceOf(CertificateException.class) + .hasMessage("No subject alternative names matching IP address 127.0.0.1 found"); + } + + // With a custom hostnameVerifier (NoopHostnameVerifier), the CLIENT policy is used + // automatically, so hostname verification is bypassed and the request succeeds + try (HTTPClient customVerifierClient = + HTTPClient.builder( + Map.of( + HTTPClient.REST_TLS_CONFIGURER, + CustomHostnameVerifierTLSConfigurer.class.getName())) + .uri(String.format("https://127.0.0.1:%d", tlsPort)) + .withAuthSession(AuthSession.EMPTY) + .build()) { + assertThatCode(() -> 
customVerifierClient.head(path, Map.of(), (unused) -> {})) + .doesNotThrowAnyException(); + } + } + } + @Test public void testSocketTimeout() throws IOException { long socketTimeoutMs = 2000L; @@ -613,6 +721,7 @@ private static Item doExecuteRequest( } public static class Item implements RESTRequest, RESTResponse { + private Long id; private String data; diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java index 571b8002389f..017f400f860f 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java @@ -62,6 +62,7 @@ import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; import org.apache.iceberg.HasTableOperations; +import org.apache.iceberg.HistoryEntry; import org.apache.iceberg.MetadataUpdate; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; @@ -120,10 +121,11 @@ import org.apache.iceberg.util.Pair; import org.assertj.core.api.InstanceOfAssertFactories; import org.awaitility.Awaitility; +import org.eclipse.jetty.compression.gzip.GzipCompression; +import org.eclipse.jetty.compression.server.CompressionHandler; +import org.eclipse.jetty.ee10.servlet.ServletContextHandler; +import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Server; -import org.eclipse.jetty.server.handler.gzip.GzipHandler; -import org.eclipse.jetty.servlet.ServletContextHandler; -import org.eclipse.jetty.servlet.ServletHolder; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -273,12 +275,7 @@ protected T execute( @BeforeEach public void createCatalog() throws Exception { - File warehouse = temp.toFile(); - this.backendCatalog = new InMemoryCatalog(); - this.backendCatalog.initialize( - "in-memory", - ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, 
warehouse.getAbsolutePath())); HTTPHeaders catalogHeaders = HTTPHeaders.of( @@ -302,7 +299,9 @@ public void createCatalog() throws Exception { new ServletContextHandler(ServletContextHandler.NO_SESSIONS); servletContext.addServlet( new ServletHolder(new RESTCatalogServlet(adapterForRESTServer)), "/*"); - servletContext.setHandler(new GzipHandler()); + CompressionHandler compressionHandler = new CompressionHandler(); + compressionHandler.putCompression(new GzipCompression()); + servletContext.insertHandler(compressionHandler); this.httpServer = new Server(new InetSocketAddress(InetAddress.getLoopbackAddress(), 0)); httpServer.setHandler(servletContext); @@ -314,6 +313,14 @@ public void createCatalog() throws Exception { @Override protected RESTCatalog initCatalog(String catalogName, Map additionalProperties) { Configuration conf = new Configuration(); + File warehouse = temp.toFile(); + + backendCatalog.initialize( + "in-memory", + ImmutableMap.builder() + .put(CatalogProperties.WAREHOUSE_LOCATION, warehouse.getAbsolutePath()) + .putAll(additionalProperties) + .build()); RESTCatalog catalog = new RESTCatalog( @@ -416,6 +423,15 @@ protected boolean requiresNamespaceCreate() { return true; } + @Override + protected boolean supportsNamesWithSlashes() { + // names with slashes are rejected and considered as suspicious characters after upgrading Jetty + // and the Servlet API. 
See also + // https://jakarta.ee/specifications/servlet/6.0/jakarta-servlet-spec-6.0.html#uri-path-canonicalization + // for additional details + return false; + } + /* RESTCatalog specific tests */ @Test @@ -1071,6 +1087,14 @@ public void testTableSnapshotLoading() { .asInstanceOf(InstanceOfAssertFactories.list(Snapshot.class)) .hasSize(1); + // snapshot log is complete regardless REFS mode + assertThat(((BaseTable) refsTable).operations().current()) + .extracting("snapshotLog") + .asInstanceOf(InstanceOfAssertFactories.list(HistoryEntry.class)) + .hasSize(2) + .containsExactlyInAnyOrderElementsOf( + ((BaseTable) table).operations().current().snapshotLog()); + assertThat(refsTable.currentSnapshot()).isEqualTo(table.currentSnapshot()); // verify that the table was loaded with the refs argument @@ -1165,6 +1189,14 @@ public void testTableSnapshotLoadingWithDivergedBranches(String formatVersion) { .asInstanceOf(InstanceOfAssertFactories.list(Snapshot.class)) .hasSize(2); + // snapshot log is complete regardless REFS mode + assertThat(((BaseTable) refsTable).operations().current()) + .extracting("snapshotLog") + .asInstanceOf(InstanceOfAssertFactories.list(HistoryEntry.class)) + .hasSize(1) // main branch has a single snapshot + .containsExactlyInAnyOrderElementsOf( + ((BaseTable) table).operations().current().snapshotLog()); + assertThat(refsTable.currentSnapshot()).isEqualTo(table.currentSnapshot()); // verify that the table was loaded with the refs argument @@ -1250,6 +1282,14 @@ public void lazySnapshotLoadingWithDivergedHistory() { .asInstanceOf(InstanceOfAssertFactories.list(Snapshot.class)) .hasSize(1); + // snapshot log is complete regardless REFS mode + assertThat(((BaseTable) refsTable).operations().current()) + .extracting("snapshotLog") + .asInstanceOf(InstanceOfAssertFactories.list(HistoryEntry.class)) + .hasSize(numSnapshots) + .containsExactlyInAnyOrderElementsOf( + ((BaseTable) table).operations().current().snapshotLog()); + 
assertThat(refsTable.currentSnapshot()).isEqualTo(table.currentSnapshot()); assertThat(refsTable.snapshots()).hasSize(numSnapshots); assertThat(refsTable.history()).hasSize(numSnapshots); diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java index a7fbe43463ac..9b42d445f585 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java @@ -1153,6 +1153,255 @@ public void serverSupportsPlanningButNotCancellation() throws IOException { assertThat(cancelled).isFalse(); } + @Test + public void asyncPlanningRespectsConfigurablePollTimeout() { + // Create an adapter that always returns SUBMITTED (never completes) + List endpoints = + endpointsWithPlanning( + Endpoint.V1_SUBMIT_TABLE_SCAN_PLAN, + Endpoint.V1_FETCH_TABLE_SCAN_PLAN, + Endpoint.V1_CANCEL_TABLE_SCAN_PLAN, + Endpoint.V1_FETCH_TABLE_SCAN_PLAN_TASKS); + + RESTCatalogAdapter adapter = + Mockito.spy( + new RESTCatalogAdapter(backendCatalog) { + @Override + public T execute( + HTTPRequest request, + Class responseType, + Consumer errorHandler, + Consumer> responseHeaders, + ParserContext parserContext) { + if (ResourcePaths.config().equals(request.path())) { + return castResponse( + responseType, ConfigResponse.builder().withEndpoints(endpoints).build()); + } + T response = + super.execute( + request, responseType, errorHandler, responseHeaders, parserContext); + if (response instanceof LoadTableResponse) { + return castResponse( + responseType, + withPlanningMode( + (LoadTableResponse) response, + RESTCatalogProperties.ScanPlanningMode.SERVER.modeName())); + } + + // Override fetch responses to always return SUBMITTED so the poll never completes + if (response instanceof FetchPlanningResultResponse) { + return castResponse( + responseType, + FetchPlanningResultResponse.builder() + .withPlanStatus(PlanStatus.SUBMITTED) + 
.build()); + } + + return response; + } + }); + + adapter.setPlanningBehavior(TestPlanningBehavior.builder().asynchronous().build()); + + RESTCatalog catalog = + new RESTCatalog(SessionCatalog.SessionContext.createEmpty(), (config) -> adapter); + catalog.initialize( + "test-poll-timeout", + ImmutableMap.of( + CatalogProperties.FILE_IO_IMPL, + "org.apache.iceberg.inmemory.InMemoryFileIO", + RESTCatalogProperties.SCAN_PLANNING_MODE, + RESTCatalogProperties.ScanPlanningMode.SERVER.modeName(), + RESTCatalogProperties.REST_SCAN_PLANNING_POLL_TIMEOUT_MS, + "1")); + + RESTTable table = restTableFor(catalog, "poll_timeout_test"); + setParserContext(table); + RESTTableScan scan = restTableScanFor(table); + + // With a 1ms timeout and a server that never completes, planFiles should fail + assertThatThrownBy(scan::planFiles) + .isInstanceOf(RemotePlanTimeoutException.class) + .hasMessageContaining("did not complete within configured limits"); + } + + @Test + public void asyncPlanningSucceedsWithCustomTimeout() { + List endpoints = + endpointsWithPlanning( + Endpoint.V1_SUBMIT_TABLE_SCAN_PLAN, + Endpoint.V1_FETCH_TABLE_SCAN_PLAN, + Endpoint.V1_CANCEL_TABLE_SCAN_PLAN, + Endpoint.V1_FETCH_TABLE_SCAN_PLAN_TASKS); + + CatalogWithAdapter catalogWithAdapter = + catalogWithEndpoints(endpoints, TestPlanningBehavior.builder().asynchronous().build()); + + catalogWithAdapter.catalog.initialize( + "test-custom-timeout", + ImmutableMap.of( + CatalogProperties.FILE_IO_IMPL, + "org.apache.iceberg.inmemory.InMemoryFileIO", + RESTCatalogProperties.SCAN_PLANNING_MODE, + RESTCatalogProperties.ScanPlanningMode.SERVER.modeName(), + RESTCatalogProperties.REST_SCAN_PLANNING_POLL_TIMEOUT_MS, + "30000")); + + RESTTable table = restTableFor(catalogWithAdapter.catalog, "custom_timeout_success"); + setParserContext(table); + assertThat(table.newScan().planFiles()).hasSize(1); + } + + @Test + public void asyncPlanningRejectsInvalidTimeout() { + List endpoints = + endpointsWithPlanning( + 
Endpoint.V1_SUBMIT_TABLE_SCAN_PLAN, + Endpoint.V1_FETCH_TABLE_SCAN_PLAN, + Endpoint.V1_CANCEL_TABLE_SCAN_PLAN, + Endpoint.V1_FETCH_TABLE_SCAN_PLAN_TASKS); + + CatalogWithAdapter catalogWithAdapter = + catalogWithEndpoints(endpoints, TestPlanningBehavior.builder().asynchronous().build()); + + // re-initialize with an invalid timeout + catalogWithAdapter.catalog.initialize( + "test-invalid-timeout", + ImmutableMap.of( + CatalogProperties.FILE_IO_IMPL, + "org.apache.iceberg.inmemory.InMemoryFileIO", + RESTCatalogProperties.SCAN_PLANNING_MODE, + RESTCatalogProperties.ScanPlanningMode.SERVER.modeName(), + RESTCatalogProperties.REST_SCAN_PLANNING_POLL_TIMEOUT_MS, + "-1")); + + RESTTable table = restTableFor(catalogWithAdapter.catalog, "invalid_timeout_test"); + setParserContext(table); + RESTTableScan scan = restTableScanFor(table); + + assertThatThrownBy(scan::planFiles) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("must be positive"); + } + + @ParameterizedTest + @EnumSource(PlanningMode.class) + public void planningFailsWithServerError( + Function planMode) { + ErrorResponse serverError = + ErrorResponse.builder() + .withMessage("table too large to plan") + .withType("IllegalStateException") + .responseCode(500) + .build(); + + TestPlanningBehavior behavior = planMode.apply(TestPlanningBehavior.builder()).build(); + CatalogWithAdapter catalogWithAdapter = + catalogThatFailsPlanning(serverError, behavior, "test-planning-failed"); + + RESTTable table = restTableFor(catalogWithAdapter.catalog, "planning_failed_test"); + setParserContext(table); + RESTTableScan scan = restTableScanFor(table); + + assertThatThrownBy(scan::planFiles) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("Remote scan planning failed") + .hasMessageContaining(serverError.type()) + .hasMessageContaining("code=" + serverError.code()) + .hasMessageContaining(serverError.message()); + } + + @ParameterizedTest + @EnumSource(PlanningMode.class) + public 
void planningFailsWithoutServerErrorIsStillSurfaced( + Function planMode) { + // Spec requires an error payload with a FAILED status; if a server violates that, + // the client must still surface a meaningful failure rather than throw on top of it. + TestPlanningBehavior behavior = planMode.apply(TestPlanningBehavior.builder()).build(); + CatalogWithAdapter catalogWithAdapter = + catalogThatFailsPlanning(null, behavior, "test-planning-failed-no-error"); + + RESTTable table = restTableFor(catalogWithAdapter.catalog, "planning_failed_no_error_test"); + setParserContext(table); + RESTTableScan scan = restTableScanFor(table); + + assertThatThrownBy(scan::planFiles) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("Remote scan planning failed") + .hasMessageContaining("unknown") + .hasMessageContaining("code=0"); + } + + private CatalogWithAdapter catalogThatFailsPlanning( + ErrorResponse serverError, TestPlanningBehavior behavior, String catalogName) { + List endpoints = + endpointsWithPlanning( + Endpoint.V1_SUBMIT_TABLE_SCAN_PLAN, + Endpoint.V1_FETCH_TABLE_SCAN_PLAN, + Endpoint.V1_CANCEL_TABLE_SCAN_PLAN, + Endpoint.V1_FETCH_TABLE_SCAN_PLAN_TASKS); + + RESTCatalogAdapter adapter = + Mockito.spy( + new RESTCatalogAdapter(backendCatalog) { + @Override + public T execute( + HTTPRequest request, + Class responseType, + Consumer errorHandler, + Consumer> responseHeaders, + ParserContext parserContext) { + if (ResourcePaths.config().equals(request.path())) { + return castResponse( + responseType, ConfigResponse.builder().withEndpoints(endpoints).build()); + } + T response = + super.execute( + request, responseType, errorHandler, responseHeaders, parserContext); + if (response instanceof LoadTableResponse) { + return castResponse( + responseType, + withPlanningMode( + (LoadTableResponse) response, + RESTCatalogProperties.ScanPlanningMode.SERVER.modeName())); + } + // Leave SUBMITTED untouched so async mode polls and hits the fetch below. 
+ if (response instanceof PlanTableScanResponse planResp + && planResp.planStatus() == PlanStatus.COMPLETED) { + return castResponse( + responseType, + PlanTableScanResponse.builder() + .withPlanStatus(PlanStatus.FAILED) + .withErrorResponse(serverError) + .withSpecsById(planResp.specsById()) + .build()); + } + if (response instanceof FetchPlanningResultResponse) { + return castResponse( + responseType, + FetchPlanningResultResponse.builder() + .withPlanStatus(PlanStatus.FAILED) + .withErrorResponse(serverError) + .build()); + } + return response; + } + }); + + adapter.setPlanningBehavior(behavior); + + RESTCatalog catalog = + new RESTCatalog(SessionCatalog.SessionContext.createEmpty(), (config) -> adapter); + catalog.initialize( + catalogName, + ImmutableMap.of( + CatalogProperties.FILE_IO_IMPL, + "org.apache.iceberg.inmemory.InMemoryFileIO", + RESTCatalogProperties.SCAN_PLANNING_MODE, + RESTCatalogProperties.ScanPlanningMode.SERVER.modeName())); + return new CatalogWithAdapter(catalog, adapter); + } + @ParameterizedTest @EnumSource(PlanningMode.class) void fileIOForRemotePlanningIsPropagated( diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalog.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalog.java index fd2faf55087c..24450949df5f 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalog.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalog.java @@ -57,10 +57,11 @@ import org.apache.iceberg.rest.responses.LoadViewResponse; import org.apache.iceberg.view.ViewCatalogTests; import org.apache.iceberg.view.ViewMetadata; +import org.eclipse.jetty.compression.gzip.GzipCompression; +import org.eclipse.jetty.compression.server.CompressionHandler; +import org.eclipse.jetty.ee10.servlet.ServletContextHandler; +import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Server; -import org.eclipse.jetty.server.handler.gzip.GzipHandler; -import 
org.eclipse.jetty.servlet.ServletContextHandler; -import org.eclipse.jetty.servlet.ServletHolder; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -114,7 +115,9 @@ public T execute( new ServletContextHandler(ServletContextHandler.NO_SESSIONS); servletContext.setContextPath("/"); servletContext.addServlet(new ServletHolder(new RESTCatalogServlet(adaptor)), "/*"); - servletContext.setHandler(new GzipHandler()); + CompressionHandler compressionHandler = new CompressionHandler(); + compressionHandler.putCompression(new GzipCompression()); + servletContext.insertHandler(compressionHandler); this.httpServer = new Server(new InetSocketAddress(InetAddress.getLoopbackAddress(), 0)); httpServer.setHandler(servletContext); diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalogWithAssumedViewSupport.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalogWithAssumedViewSupport.java index 1ba340cc56c2..3f3d7ba77493 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalogWithAssumedViewSupport.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalogWithAssumedViewSupport.java @@ -31,10 +31,11 @@ import org.apache.iceberg.inmemory.InMemoryCatalog; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.rest.responses.ConfigResponse; +import org.eclipse.jetty.compression.gzip.GzipCompression; +import org.eclipse.jetty.compression.server.CompressionHandler; +import org.eclipse.jetty.ee10.servlet.ServletContextHandler; +import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Server; -import org.eclipse.jetty.server.handler.gzip.GzipHandler; -import org.eclipse.jetty.servlet.ServletContextHandler; -import org.eclipse.jetty.servlet.ServletHolder; import org.junit.jupiter.api.BeforeEach; public class TestRESTViewCatalogWithAssumedViewSupport extends TestRESTViewCatalog { @@ 
-71,7 +72,9 @@ public T handleRequest( new ServletContextHandler(ServletContextHandler.NO_SESSIONS); servletContext.setContextPath("/"); servletContext.addServlet(new ServletHolder(new RESTCatalogServlet(adaptor)), "/*"); - servletContext.setHandler(new GzipHandler()); + CompressionHandler compressionHandler = new CompressionHandler(); + compressionHandler.putCompression(new GzipCompression()); + servletContext.insertHandler(compressionHandler); this.httpServer = new Server(new InetSocketAddress(InetAddress.getLoopbackAddress(), 0)); httpServer.setHandler(servletContext); diff --git a/core/src/test/java/org/apache/iceberg/rest/TestResourcePaths.java b/core/src/test/java/org/apache/iceberg/rest/TestResourcePaths.java index f40b1302f90e..a742b89a7627 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestResourcePaths.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestResourcePaths.java @@ -336,4 +336,20 @@ public void cancelPlanEndpointPath() { assertThat(withoutPrefix.plan(complexId, "plan-xyz-789")) .isEqualTo("v1/namespaces/db%1Fschema/tables/my_table/plan/plan-xyz-789"); } + + @Test + public void testRemoteSign() { + TableIdentifier tableId = TableIdentifier.of("test_namespace", "test_table"); + assertThat(withPrefix.remoteSign(tableId)) + .isEqualTo("v1/ws/catalog/namespaces/test_namespace/tables/test_table/sign"); + assertThat(withoutPrefix.remoteSign(tableId)) + .isEqualTo("v1/namespaces/test_namespace/tables/test_table/sign"); + + // Test with different identifiers + TableIdentifier complexId = TableIdentifier.of(Namespace.of("db", "schema"), "my_table"); + assertThat(withPrefix.remoteSign(complexId)) + .isEqualTo("v1/ws/catalog/namespaces/db%1Fschema/tables/my_table/sign"); + assertThat(withoutPrefix.remoteSign(complexId)) + .isEqualTo("v1/namespaces/db%1Fschema/tables/my_table/sign"); + } } diff --git a/core/src/test/java/org/apache/iceberg/rest/auth/TestOAuth2Util.java b/core/src/test/java/org/apache/iceberg/rest/auth/TestOAuth2Util.java index 
fbcb87fb06e2..4a6fbf7a1cdc 100644 --- a/core/src/test/java/org/apache/iceberg/rest/auth/TestOAuth2Util.java +++ b/core/src/test/java/org/apache/iceberg/rest/auth/TestOAuth2Util.java @@ -26,9 +26,13 @@ import static org.mockito.ArgumentMatchers.anyMap; import static org.mockito.ArgumentMatchers.argThat; +import com.nimbusds.jwt.JWTClaimsSet; +import com.nimbusds.jwt.PlainJWT; import java.io.IOException; import java.util.Map; +import java.util.concurrent.TimeUnit; import org.apache.iceberg.rest.RESTClient; +import org.apache.iceberg.rest.auth.OAuth2Util.AuthSession; import org.apache.iceberg.rest.responses.OAuthTokenResponse; import org.junit.jupiter.api.Test; import org.mockito.Mockito; @@ -135,4 +139,97 @@ public void testCredentialFlowForSessionRefresh() throws IOException { any()); } } + + @Test + void fromTokenResponseUsesChildTokenExpiry() { + AuthSession parent = parentSession(7200); + OAuthTokenResponse response = childTokenResponse(tokenWithExp(300), 300); + + AuthSession child = + AuthSession.fromTokenResponse(null, null, response, System.currentTimeMillis(), parent); + assertThat(child.expiresAtMillis()) + .as("Child session should use the child token's exp, not the parent's") + .isEqualTo(TimeUnit.SECONDS.toMillis(300)); + } + + @Test + void fromTokenResponseOpaqueTokenDoesNotInheritParentExpiry() { + AuthSession parent = parentSession(7200); + OAuthTokenResponse response = childTokenResponse("opaque-access-token", 600); + + AuthSession child = + AuthSession.fromTokenResponse(null, null, response, System.currentTimeMillis(), parent); + + assertThat(child.expiresAtMillis()) + .as("Child session with opaque token should not inherit parent's expiresAtMillis") + .isNull(); + } + + @Test + void fromAccessTokenUsesChildTokenExpiry() { + AuthSession parent = parentSession(7200); + String childToken = tokenWithExp(300); + + AuthSession child = AuthSession.fromAccessToken(null, null, childToken, null, parent); + assertThat(child.expiresAtMillis()) + .as("Child 
session should use the child token's exp, not the parent's") + .isEqualTo(TimeUnit.SECONDS.toMillis(300)); + } + + @Test + void refreshUsesRefreshedTokenExpiry() throws IOException { + String parentToken = tokenWithExp(7200); + String refreshedToken = tokenWithExp(500); + + AuthConfig authConfig = + AuthConfig.builder() + .token(parentToken) + .tokenType(OAuth2Properties.ACCESS_TOKEN_TYPE) + .keepRefreshed(true) + .credential("testClientId:testClientSecret") + .oauth2ServerUri("/v1/token") + .expiresAtMillis(OAuth2Util.expiresAtMillis(parentToken)) + .build(); + + OAuthTokenResponse response = childTokenResponse(refreshedToken, 500); + + try (RESTClient client = Mockito.mock(RESTClient.class); + AuthSession session = new AuthSession(Map.of(), authConfig)) { + Mockito.when(client.postForm(any(), anyMap(), any(), anyMap(), any())).thenReturn(response); + + session.refresh(client); + + assertThat(session.expiresAtMillis()) + .as("After refresh, session should use the refreshed token's exp") + .isEqualTo(TimeUnit.SECONDS.toMillis(500)); + } + } + + private static AuthSession parentSession(long expSeconds) { + String parentToken = tokenWithExp(expSeconds); + AuthConfig parentConfig = + AuthConfig.builder() + .token(parentToken) + .tokenType(OAuth2Properties.ACCESS_TOKEN_TYPE) + .keepRefreshed(false) + .build(); + AuthSession parent = new AuthSession(Map.of(), parentConfig); + assertThat(parent.expiresAtMillis()).isEqualTo(TimeUnit.SECONDS.toMillis(expSeconds)); + return parent; + } + + private static OAuthTokenResponse childTokenResponse(String token, int expiresInSeconds) { + return OAuthTokenResponse.builder() + .withToken(token) + .withTokenType(BEARER) + .withIssuedTokenType(OAuth2Properties.ACCESS_TOKEN_TYPE) + .setExpirationInSeconds(expiresInSeconds) + .build(); + } + + private static String tokenWithExp(long expSeconds) { + JWTClaimsSet claimsSet = + new JWTClaimsSet.Builder().subject("test").claim("exp", expSeconds).build(); + return new 
PlainJWT(claimsSet).serialize(); + } } diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3SignRequestParser.java b/core/src/test/java/org/apache/iceberg/rest/requests/TestRemoteSignRequestParser.java similarity index 70% rename from aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3SignRequestParser.java rename to core/src/test/java/org/apache/iceberg/rest/requests/TestRemoteSignRequestParser.java index 75ae2d88cccf..3515588e444d 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3SignRequestParser.java +++ b/core/src/test/java/org/apache/iceberg/rest/requests/TestRemoteSignRequestParser.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.iceberg.aws.s3.signer; +package org.apache.iceberg.rest.requests; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; @@ -28,37 +28,39 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.junit.jupiter.api.Test; -public class TestS3SignRequestParser { +public class TestRemoteSignRequestParser { @Test public void nullRequest() { - assertThatThrownBy(() -> S3SignRequestParser.fromJson((JsonNode) null)) + assertThatThrownBy(() -> RemoteSignRequestParser.fromJson((JsonNode) null)) .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot parse s3 sign request from null object"); + .hasMessage("Cannot parse remote sign request from null object"); - assertThatThrownBy(() -> S3SignRequestParser.toJson(null)) + assertThatThrownBy(() -> RemoteSignRequestParser.toJson(null)) .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Invalid s3 sign request: null"); + .hasMessage("Invalid remote sign request: null"); } @Test public void missingFields() { - assertThatThrownBy(() -> S3SignRequestParser.fromJson("{}")) + assertThatThrownBy(() -> RemoteSignRequestParser.fromJson("{}")) 
.isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot parse missing string: region"); - assertThatThrownBy(() -> S3SignRequestParser.fromJson("{\"region\":\"us-west-2\"}")) + assertThatThrownBy(() -> RemoteSignRequestParser.fromJson("{\"region\":\"us-west-2\"}")) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot parse missing string: method"); assertThatThrownBy( - () -> S3SignRequestParser.fromJson("{\"region\":\"us-west-2\", \"method\" : \"PUT\"}")) + () -> + RemoteSignRequestParser.fromJson( + "{\"region\":\"us-west-2\", \"method\" : \"PUT\"}")) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot parse missing string: uri"); assertThatThrownBy( () -> - S3SignRequestParser.fromJson( + RemoteSignRequestParser.fromJson( "{\n" + " \"region\" : \"us-west-2\",\n" + " \"method\" : \"PUT\",\n" @@ -72,7 +74,7 @@ public void missingFields() { public void invalidMethod() { assertThatThrownBy( () -> - S3SignRequestParser.fromJson( + RemoteSignRequestParser.fromJson( "{\n" + " \"region\" : \"us-west-2\",\n" + " \"method\" : 23,\n" @@ -87,7 +89,7 @@ public void invalidMethod() { public void invalidUri() { assertThatThrownBy( () -> - S3SignRequestParser.fromJson( + RemoteSignRequestParser.fromJson( "{\n" + " \"region\" : \"us-west-2\",\n" + " \"method\" : \"PUT\",\n" @@ -102,7 +104,7 @@ public void invalidUri() { public void invalidRegion() { assertThatThrownBy( () -> - S3SignRequestParser.fromJson( + RemoteSignRequestParser.fromJson( "{\n" + " \"region\" : 23,\n" + " \"method\" : \"PUT\",\n" @@ -115,8 +117,8 @@ public void invalidRegion() { @Test public void roundTripSerde() { - ImmutableS3SignRequest s3SignRequest = - ImmutableS3SignRequest.builder() + RemoteSignRequest request = + ImmutableRemoteSignRequest.builder() .uri(URI.create("http://localhost:49208/iceberg-signer-test")) .method("PUT") .region("us-west-2") @@ -132,8 +134,8 @@ public void roundTripSerde() { Arrays.asList("aws-sdk-java/2.20.18", "Linux/5.4.0-126"))) .build(); 
- String json = S3SignRequestParser.toJson(s3SignRequest, true); - assertThat(S3SignRequestParser.fromJson(json)).isEqualTo(s3SignRequest); + String json = RemoteSignRequestParser.toJson(request, true); + assertThat(RemoteSignRequestParser.fromJson(json)).isEqualTo(request); assertThat(json) .isEqualTo( "{\n" @@ -151,8 +153,8 @@ public void roundTripSerde() { @Test public void roundTripSerdeWithProperties() { - ImmutableS3SignRequest s3SignRequest = - ImmutableS3SignRequest.builder() + RemoteSignRequest request = + ImmutableRemoteSignRequest.builder() .uri(URI.create("http://localhost:49208/iceberg-signer-test")) .method("PUT") .region("us-west-2") @@ -169,8 +171,8 @@ public void roundTripSerdeWithProperties() { .properties(ImmutableMap.of("k1", "v1")) .build(); - String json = S3SignRequestParser.toJson(s3SignRequest, true); - assertThat(S3SignRequestParser.fromJson(json)).isEqualTo(s3SignRequest); + String json = RemoteSignRequestParser.toJson(request, true); + assertThat(RemoteSignRequestParser.fromJson(json)).isEqualTo(request); assertThat(json) .isEqualTo( "{\n" @@ -191,8 +193,8 @@ public void roundTripSerdeWithProperties() { @Test public void roundTripWithBody() { - ImmutableS3SignRequest s3SignRequest = - ImmutableS3SignRequest.builder() + RemoteSignRequest request = + ImmutableRemoteSignRequest.builder() .uri(URI.create("http://localhost:49208/iceberg-signer-test")) .method("PUT") .region("us-west-2") @@ -210,8 +212,8 @@ public void roundTripWithBody() { .body("some-body") .build(); - String json = S3SignRequestParser.toJson(s3SignRequest, true); - assertThat(S3SignRequestParser.fromJson(json)).isEqualTo(s3SignRequest); + String json = RemoteSignRequestParser.toJson(request, true); + assertThat(RemoteSignRequestParser.fromJson(json)).isEqualTo(request); assertThat(json) .isEqualTo( "{\n" @@ -230,4 +232,46 @@ public void roundTripWithBody() { + " \"body\" : \"some-body\"\n" + "}"); } + + @Test + public void roundTripWithProvider() { + RemoteSignRequest 
request = + ImmutableRemoteSignRequest.builder() + .uri(URI.create("http://localhost:49208/iceberg-signer-test")) + .method("PUT") + .region("us-west-2") + .headers( + ImmutableMap.of( + "amz-sdk-request", + Arrays.asList("attempt=1", "max=4"), + "Content-Length", + Collections.singletonList("191"), + "Content-Type", + Collections.singletonList("application/json"), + "User-Agent", + Arrays.asList("aws-sdk-java/2.20.18", "Linux/5.4.0-126"))) + .properties(ImmutableMap.of("k1", "v1")) + .provider("s3") + .build(); + + String json = RemoteSignRequestParser.toJson(request, true); + assertThat(RemoteSignRequestParser.fromJson(json)).isEqualTo(request); + assertThat(json) + .isEqualTo( + "{\n" + + " \"region\" : \"us-west-2\",\n" + + " \"method\" : \"PUT\",\n" + + " \"uri\" : \"http://localhost:49208/iceberg-signer-test\",\n" + + " \"headers\" : {\n" + + " \"amz-sdk-request\" : [ \"attempt=1\", \"max=4\" ],\n" + + " \"Content-Length\" : [ \"191\" ],\n" + + " \"Content-Type\" : [ \"application/json\" ],\n" + + " \"User-Agent\" : [ \"aws-sdk-java/2.20.18\", \"Linux/5.4.0-126\" ]\n" + + " },\n" + + " \"properties\" : {\n" + + " \"k1\" : \"v1\"\n" + + " },\n" + + " \"provider\" : \"s3\"\n" + + "}"); + } } diff --git a/core/src/test/java/org/apache/iceberg/rest/responses/TestFetchPlanningResultResponseParser.java b/core/src/test/java/org/apache/iceberg/rest/responses/TestFetchPlanningResultResponseParser.java index 5fdfdc281f4f..841083f88baf 100644 --- a/core/src/test/java/org/apache/iceberg/rest/responses/TestFetchPlanningResultResponseParser.java +++ b/core/src/test/java/org/apache/iceberg/rest/responses/TestFetchPlanningResultResponseParser.java @@ -330,4 +330,70 @@ public void roundTripSerdeWithCredentials() { assertThat(FetchPlanningResultResponseParser.toJson(copyResponse, true)) .isEqualTo(expectedJson); } + + @Test + public void roundTripSerdeWithFailedStatusAndErrorResponse() { + ErrorResponse errorResponse = + ErrorResponse.builder() + .withMessage("Scan planning 
failed: table too large to plan") + .withType("IllegalStateException") + .responseCode(500) + .build(); + + FetchPlanningResultResponse response = + FetchPlanningResultResponse.builder() + .withPlanStatus(PlanStatus.FAILED) + .withErrorResponse(errorResponse) + .build(); + + String expectedJson = + "{\"status\":\"failed\"," + + "\"error\":{\"message\":\"Scan planning failed: table too large to plan\"," + + "\"type\":\"IllegalStateException\",\"code\":500}}"; + String json = FetchPlanningResultResponseParser.toJson(response); + assertThat(json).isEqualTo(expectedJson); + + FetchPlanningResultResponse fromResponse = + FetchPlanningResultResponseParser.fromJson(json, PARTITION_SPECS_BY_ID, false); + assertThat(fromResponse.planStatus()).isEqualTo(PlanStatus.FAILED); + assertThat(fromResponse.errorResponse()).isNotNull(); + assertThat(fromResponse.errorResponse().message()) + .isEqualTo("Scan planning failed: table too large to plan"); + assertThat(fromResponse.errorResponse().type()).isEqualTo("IllegalStateException"); + assertThat(fromResponse.errorResponse().code()).isEqualTo(500); + } + + @Test + public void parseFailedStatusWithoutErrorObject() { + // Spec requires an `error` object on failed responses, but parse leniently so + // a non-compliant server still surfaces the failure to the client. 
+ String json = "{\"status\":\"failed\"}"; + FetchPlanningResultResponse response = + FetchPlanningResultResponseParser.fromJson(json, PARTITION_SPECS_BY_ID, false); + assertThat(response.planStatus()).isEqualTo(PlanStatus.FAILED); + assertThat(response.errorResponse()).isNull(); + } + + @Test + public void parseFailedStatusWithPrimitiveErrorField() { + String json = "{\"status\":\"failed\",\"error\":\"oops\"}"; + FetchPlanningResultResponse response = + FetchPlanningResultResponseParser.fromJson(json, PARTITION_SPECS_BY_ID, false); + assertThat(response.planStatus()).isEqualTo(PlanStatus.FAILED); + assertThat(response.errorResponse()).isNull(); + } + + @Test + public void cannotBuildWithErrorResponseWhenStatusIsNotFailed() { + ErrorResponse errorResponse = + ErrorResponse.builder().withMessage("boom").withType("X").responseCode(500).build(); + assertThatThrownBy( + () -> + FetchPlanningResultResponse.builder() + .withPlanStatus(PlanStatus.COMPLETED) + .withErrorResponse(errorResponse) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid response: error can only be returned in a 'failed' status"); + } } diff --git a/core/src/test/java/org/apache/iceberg/rest/responses/TestPlanTableScanResponseParser.java b/core/src/test/java/org/apache/iceberg/rest/responses/TestPlanTableScanResponseParser.java index 454e838bcca2..6354e7bf246f 100644 --- a/core/src/test/java/org/apache/iceberg/rest/responses/TestPlanTableScanResponseParser.java +++ b/core/src/test/java/org/apache/iceberg/rest/responses/TestPlanTableScanResponseParser.java @@ -648,4 +648,72 @@ public void roundTripSerdeWithValidStatusAndFileScanTasksAndCredentials() { assertThat(PlanTableScanResponseParser.toJson(copyResponse, true)).isEqualTo(expectedJson); } + + @Test + public void roundTripSerdeWithFailedStatusAndErrorResponse() { + ErrorResponse errorResponse = + ErrorResponse.builder() + .withMessage("Scan planning failed: table too large to plan") + 
.withType("IllegalStateException") + .responseCode(500) + .build(); + + PlanTableScanResponse response = + PlanTableScanResponse.builder() + .withPlanStatus(PlanStatus.FAILED) + .withErrorResponse(errorResponse) + .withSpecsById(PARTITION_SPECS_BY_ID) + .build(); + + String expectedJson = + "{\"status\":\"failed\"," + + "\"error\":{\"message\":\"Scan planning failed: table too large to plan\"," + + "\"type\":\"IllegalStateException\",\"code\":500}}"; + String json = PlanTableScanResponseParser.toJson(response); + assertThat(json).isEqualTo(expectedJson); + + PlanTableScanResponse fromResponse = + PlanTableScanResponseParser.fromJson(json, PARTITION_SPECS_BY_ID, false); + assertThat(fromResponse.planStatus()).isEqualTo(PlanStatus.FAILED); + assertThat(fromResponse.errorResponse()).isNotNull(); + assertThat(fromResponse.errorResponse().message()) + .isEqualTo("Scan planning failed: table too large to plan"); + assertThat(fromResponse.errorResponse().type()).isEqualTo("IllegalStateException"); + assertThat(fromResponse.errorResponse().code()).isEqualTo(500); + } + + @Test + public void parseFailedStatusWithoutErrorObject() { + // Spec requires an `error` object on failed responses, but parse leniently so + // a non-compliant server still surfaces the failure to the client. 
+ String json = "{\"status\":\"failed\"}"; + PlanTableScanResponse response = + PlanTableScanResponseParser.fromJson(json, PARTITION_SPECS_BY_ID, false); + assertThat(response.planStatus()).isEqualTo(PlanStatus.FAILED); + assertThat(response.errorResponse()).isNull(); + } + + @Test + public void parseFailedStatusWithPrimitiveErrorField() { + String json = "{\"status\":\"failed\",\"error\":\"oops\"}"; + PlanTableScanResponse response = + PlanTableScanResponseParser.fromJson(json, PARTITION_SPECS_BY_ID, false); + assertThat(response.planStatus()).isEqualTo(PlanStatus.FAILED); + assertThat(response.errorResponse()).isNull(); + } + + @Test + public void cannotBuildWithErrorResponseWhenStatusIsNotFailed() { + ErrorResponse errorResponse = + ErrorResponse.builder().withMessage("boom").withType("X").responseCode(500).build(); + assertThatThrownBy( + () -> + PlanTableScanResponse.builder() + .withPlanStatus(PlanStatus.COMPLETED) + .withErrorResponse(errorResponse) + .withSpecsById(PARTITION_SPECS_BY_ID) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid response: error can only be defined when status is 'failed'"); + } } diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3SignResponseParser.java b/core/src/test/java/org/apache/iceberg/rest/responses/TestRemoteSignResponseParser.java similarity index 78% rename from aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3SignResponseParser.java rename to core/src/test/java/org/apache/iceberg/rest/responses/TestRemoteSignResponseParser.java index 19f2f540d765..b6d1178c3fa1 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3SignResponseParser.java +++ b/core/src/test/java/org/apache/iceberg/rest/responses/TestRemoteSignResponseParser.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.iceberg.aws.s3.signer; +package org.apache.iceberg.rest.responses; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; @@ -28,28 +28,28 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.junit.jupiter.api.Test; -public class TestS3SignResponseParser { +public class TestRemoteSignResponseParser { @Test public void nullResponse() { - assertThatThrownBy(() -> S3SignResponseParser.fromJson((JsonNode) null)) + assertThatThrownBy(() -> RemoteSignResponseParser.fromJson((JsonNode) null)) .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot parse s3 sign response from null object"); + .hasMessage("Cannot parse remote sign response from null object"); - assertThatThrownBy(() -> S3SignResponseParser.toJson(null)) + assertThatThrownBy(() -> RemoteSignResponseParser.toJson(null)) .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Invalid s3 sign response: null"); + .hasMessage("Invalid remote sign response: null"); } @Test public void missingFields() { - assertThatThrownBy(() -> S3SignResponseParser.fromJson("{}")) + assertThatThrownBy(() -> RemoteSignResponseParser.fromJson("{}")) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot parse missing string: uri"); assertThatThrownBy( () -> - S3SignResponseParser.fromJson( + RemoteSignResponseParser.fromJson( "{\"uri\" : \"http://localhost:49208/iceberg-signer-test\"}")) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot parse missing field: headers"); @@ -57,15 +57,15 @@ public void missingFields() { @Test public void invalidUri() { - assertThatThrownBy(() -> S3SignResponseParser.fromJson("{\"uri\" : 45, \"headers\" : {}}}")) + assertThatThrownBy(() -> RemoteSignResponseParser.fromJson("{\"uri\" : 45, \"headers\" : {}}}")) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot parse to a string value: uri: 45"); } @Test public void 
roundTripSerde() { - S3SignResponse s3SignResponse = - ImmutableS3SignResponse.builder() + RemoteSignResponse response = + ImmutableRemoteSignResponse.builder() .uri(URI.create("http://localhost:49208/iceberg-signer-test")) .headers( ImmutableMap.of( @@ -79,8 +79,8 @@ public void roundTripSerde() { Arrays.asList("aws-sdk-java/2.20.18", "Linux/5.4.0-126"))) .build(); - String json = S3SignResponseParser.toJson(s3SignResponse, true); - assertThat(S3SignResponseParser.fromJson(json)).isEqualTo(s3SignResponse); + String json = RemoteSignResponseParser.toJson(response, true); + assertThat(RemoteSignResponseParser.fromJson(json)).isEqualTo(response); assertThat(json) .isEqualTo( "{\n" diff --git a/core/src/test/java/org/apache/iceberg/util/TestStructLikeWrapper.java b/core/src/test/java/org/apache/iceberg/util/TestStructLikeWrapper.java new file mode 100644 index 000000000000..9eaa45c85a48 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/util/TestStructLikeWrapper.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.util; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.iceberg.PartitionData; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestStructLikeWrapper { + @Test + public void equalsTypeAndDataMismatch() { + Types.StructType intType = + Types.StructType.of(Types.NestedField.required(1, "a", Types.IntegerType.get())); + Types.StructType stringType = + Types.StructType.of(Types.NestedField.required(1, "a", Types.StringType.get())); + + PartitionData intData = new PartitionData(intType); + intData.set(0, 1); + + PartitionData stringData = new PartitionData(stringType); + stringData.set(0, "test"); + + StructLikeWrapper integerStruct = StructLikeWrapper.forType(intType).set(intData); + StructLikeWrapper stringStruct = StructLikeWrapper.forType(stringType).set(stringData); + + // StructLikeWrapper.equals previously threw an exception when the type and data mismatch + assertThat(integerStruct).isNotEqualTo(stringStruct); + } +} diff --git a/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java b/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java index e295b5fbc1bb..a38b025e0f05 100644 --- a/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java +++ b/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java @@ -26,18 +26,43 @@ import static org.assertj.core.api.Assumptions.assumeThat; import static org.junit.jupiter.api.Assumptions.assumeFalse; +import java.io.File; import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; import java.util.Arrays; +import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.UUID; +import java.util.function.BiConsumer; +import java.util.function.BiFunction; +import java.util.function.Function; import java.util.stream.IntStream; +import org.apache.avro.file.DataFileStream; +import org.apache.avro.file.DataFileWriter; 
+import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.io.DatumWriter; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.ContentFile; import org.apache.iceberg.DataFile; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.MetadataColumns; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.MetricsModes; +import org.apache.iceberg.MetricsModes.MetricsMode; +import org.apache.iceberg.PartitionData; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TestTables; +import org.apache.iceberg.avro.AvroTestHelpers; +import org.apache.iceberg.data.orc.GenericOrcWriter; import org.apache.iceberg.deletes.EqualityDeleteWriter; import org.apache.iceberg.deletes.PositionDelete; import org.apache.iceberg.deletes.PositionDeleteWriter; @@ -48,6 +73,7 @@ import org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.expressions.Literal; import org.apache.iceberg.formats.FileWriterBuilder; import org.apache.iceberg.formats.FormatModelRegistry; import org.apache.iceberg.inmemory.InMemoryFileIO; @@ -55,10 +81,33 @@ import org.apache.iceberg.io.DataWriter; import org.apache.iceberg.io.DeleteSchemaUtil; import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.mapping.MappingUtil; +import org.apache.iceberg.mapping.NameMapping; +import org.apache.iceberg.orc.ORCSchemaUtil; +import org.apache.iceberg.orc.OrcRowWriter; +import org.apache.iceberg.orc.OrcWritingTestUtils; +import org.apache.iceberg.orc.TestORCSchemaUtil; +import 
org.apache.iceberg.parquet.ParquetFileTestUtils; +import org.apache.iceberg.parquet.ParquetSchemaUtil; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Comparators; +import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.TypeDescription; +import org.apache.orc.Writer; +import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; +import org.apache.parquet.avro.AvroParquetWriter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.ParquetWriter; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.io.TempDir; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.FieldSource; @@ -73,10 +122,14 @@ public abstract class BaseFormatModelTests { protected abstract void assertEquals(Schema schema, List expected, List actual); + protected abstract Object convertConstantToEngine(Type type, Object value); + protected boolean supportsBatchReads() { return false; } + @TempDir private File tableDir; + private static final FileFormat[] FILE_FORMATS = new FileFormat[] {FileFormat.AVRO, FileFormat.PARQUET, FileFormat.ORC}; @@ -91,14 +144,29 @@ protected boolean supportsBatchReads() { static final String FEATURE_FILTER = "filter"; static final String FEATURE_CASE_SENSITIVE = "caseSensitive"; static final String FEATURE_SPLIT = "split"; + static final String FEATURE_READER_DEFAULT = "readerDefault"; static final String FEATURE_REUSE_CONTAINERS = "reuseContainers"; + static final String FEATURE_META_ROW_LINEAGE = "metaRowLineage"; + static final 
String FEATURE_COLUMN_LEVEL_METRICS = "columnLevelMetrics"; + static final String FEATURE_COLUMN_METRICS_TRUNCATE_BINARY = "columnMetricsTruncateBinary"; private static final Map MISSING_FEATURES = Map.of( FileFormat.AVRO, - new String[] {FEATURE_FILTER, FEATURE_CASE_SENSITIVE, FEATURE_SPLIT}, + new String[] { + FEATURE_FILTER, + FEATURE_CASE_SENSITIVE, + FEATURE_SPLIT, + FEATURE_COLUMN_LEVEL_METRICS, + FEATURE_COLUMN_METRICS_TRUNCATE_BINARY + }, FileFormat.ORC, - new String[] {FEATURE_REUSE_CONTAINERS}); + new String[] { + FEATURE_REUSE_CONTAINERS, + FEATURE_COLUMN_METRICS_TRUNCATE_BINARY, + FEATURE_META_ROW_LINEAGE, + FEATURE_READER_DEFAULT + }); private InMemoryFileIO fileIO; private EncryptedOutputFile encryptedFile; @@ -123,6 +191,8 @@ void after() { if (fileIO != null) { fileIO.close(); } + + TestTables.clearTables(); } /** Write with engine type T, read with Generic Record */ @@ -134,12 +204,7 @@ void testDataWriterEngineWriteGenericRead(FileFormat fileFormat, DataGenerator d FileWriterBuilder, Object> writerBuilder = FormatModelRegistry.dataWriteBuilder(fileFormat, engineType(), encryptedFile); - DataWriter writer = - writerBuilder - .schema(schema) - .engineSchema(engineSchema(schema)) - .spec(PartitionSpec.unpartitioned()) - .build(); + DataWriter writer = writerBuilder.schema(schema).spec(PartitionSpec.unpartitioned()).build(); List genericRecords = dataGenerator.generateRecords(); List engineRecords = convertToEngineRecords(genericRecords, schema); @@ -219,7 +284,6 @@ void testEqualityDeleteWriterEngineWriteGenericRead( EqualityDeleteWriter writer = writerBuilder .schema(schema) - .engineSchema(engineSchema(schema)) .spec(PartitionSpec.unpartitioned()) .equalityFieldIds(1) .build(); @@ -374,7 +438,10 @@ void testReaderBuilderProjection(FileFormat fileFormat) throws IOException { List genericRecords = dataGenerator.generateRecords(); writeGenericRecords(fileFormat, fullSchema, genericRecords); - List projectedGenericRecords = 
projectRecords(genericRecords, projectedSchema); + List projectedGenericRecords = + genericRecords.stream() + .map(record -> copy(record, projectedSchema, projectedSchema)) + .toList(); List expectedEngineRecords = convertToEngineRecords(projectedGenericRecords, projectedSchema); @@ -383,7 +450,6 @@ void testReaderBuilderProjection(FileFormat fileFormat) throws IOException { try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(projectedSchema) - .engineProjection(engineSchema(projectedSchema)) .build()) { readRecords = ImmutableList.copyOf(reader); } @@ -398,9 +464,6 @@ void testReaderBuilderFilter(FileFormat fileFormat) throws IOException { assumeSupports(fileFormat, FEATURE_FILTER); Schema schema = SCHEMA; - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get())); // Generate records with known id values [0, count) int count = 10000; @@ -419,7 +482,6 @@ void testReaderBuilderFilter(FileFormat fileFormat) throws IOException { try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .filter(lessThanFilter) .build()) { readRecords = ImmutableList.copyOf(reader); @@ -433,7 +495,6 @@ void testReaderBuilderFilter(FileFormat fileFormat) throws IOException { try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .filter(greaterThanFilter) .build()) { readRecords = ImmutableList.copyOf(reader); @@ -471,7 +532,6 @@ void testReaderBuilderCaseSensitive(FileFormat fileFormat) throws IOException { try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .filter(upperCaseFilter) .caseSensitive(false) .build()) { @@ -486,7 +546,6 @@ 
void testReaderBuilderCaseSensitive(FileFormat fileFormat) throws IOException { try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .filter(upperCaseFilter) .caseSensitive(true) .build()) { @@ -529,7 +588,6 @@ void testReaderBuilderSplit(FileFormat fileFormat) throws IOException { try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .split(firstSplitStart, firstSplitLength) .build()) { readRecords = ImmutableList.copyOf(reader); @@ -542,7 +600,6 @@ void testReaderBuilderSplit(FileFormat fileFormat) throws IOException { try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .split(fileLength, 0) .build()) { emptyReadRecords = ImmutableList.copyOf(reader); @@ -554,7 +611,6 @@ void testReaderBuilderSplit(FileFormat fileFormat) throws IOException { try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .split(0, fileLength) .build()) { readRecords = ImmutableList.copyOf(reader); @@ -584,7 +640,6 @@ void testReaderBuilderReuseContainers(FileFormat fileFormat) throws IOException try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .build()) { noReuseRecords = ImmutableList.copyOf(reader); } @@ -600,7 +655,6 @@ void testReaderBuilderReuseContainers(FileFormat fileFormat) throws IOException try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .reuseContainers() .build()) { reuseRecords = ImmutableList.copyOf(reader); @@ -609,6 +663,50 @@ void 
testReaderBuilderReuseContainers(FileFormat fileFormat) throws IOException reuseRecords.forEach(r -> assertThat(r).isSameAs(reuseRecords.get(0))); } + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReaderSchemaEvolutionNewColumnWithDefault(FileFormat fileFormat) throws IOException { + + assumeSupports(fileFormat, FEATURE_READER_DEFAULT); + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema writeSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + String defaultStringValue = "default_value"; + int defaultIntValue = 42; + + int maxFieldId = + writeSchema.columns().stream().mapToInt(Types.NestedField::fieldId).max().orElse(0); + + List evolvedColumns = Lists.newArrayList(writeSchema.columns()); + evolvedColumns.add( + Types.NestedField.required("col_f") + .withId(maxFieldId + 1) + .ofType(Types.StringType.get()) + .withInitialDefault(Literal.of(defaultStringValue)) + .build()); + evolvedColumns.add( + Types.NestedField.optional("col_g") + .withId(maxFieldId + 2) + .ofType(Types.IntegerType.get()) + .withInitialDefault(Literal.of(defaultIntValue)) + .build()); + + Schema evolvedSchema = new Schema(evolvedColumns); + readAndAssertGenericRecords( + fileFormat, + evolvedSchema, + genericRecords, + record -> { + Record expected = copy(record, writeSchema, evolvedSchema); + expected.setField("col_f", defaultStringValue); + expected.setField("col_g", defaultIntValue); + return expected; + }); + } + @ParameterizedTest @FieldSource("FILE_FORMATS") void testReaderBuilderRecordsPerBatchNotSupported(FileFormat fileFormat) throws IOException { @@ -628,89 +726,1014 @@ void testReaderBuilderRecordsPerBatchNotSupported(FileFormat fileFormat) throws .isInstanceOf(UnsupportedOperationException.class); } - private void readAndAssertGenericRecords( - FileFormat fileFormat, Schema schema, List expected) throws IOException { - InputFile inputFile 
= encryptedFile.encryptingOutputFile().toInputFile(); - List readRecords; - try (CloseableIterable reader = - FormatModelRegistry.readBuilder(fileFormat, Record.class, inputFile) - .project(schema) - .build()) { - readRecords = ImmutableList.copyOf(reader); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterMetricsCollection(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + DataFile dataFile = writeGenericRecords(fileFormat, schema, genericRecords); + + assertCounts(fileFormat, schema, genericRecords, dataFile); + assertBounds(fileFormat, schema, genericRecords, dataFile); + assertColumnSize(fileFormat, dataFile); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterMetricsWithNoneMode(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + + MetricsConfig noneConfig = config(schema, MetricsModes.None.get()); + List genericRecords = dataGenerator.generateRecords(); + DataFile dataFile = writeGenericRecords(fileFormat, schema, genericRecords, noneConfig); + + assertCountsNull(schema, dataFile); + assertBoundsNull(schema, dataFile); + assertColumnSizeEmpty(fileFormat, dataFile); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterMetricsWithCountsMode(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + MetricsConfig countsConfig = config(schema, MetricsModes.Counts.get()); + List genericRecords = dataGenerator.generateRecords(); + DataFile dataFile = writeGenericRecords(fileFormat, schema, genericRecords, countsConfig); + + // In the counts mode, valueCounts and nullValueCounts should be present, while lowerBounds and + // 
upperBounds should be null. + assertCounts(fileFormat, schema, genericRecords, dataFile); + assertBoundsNull(schema, dataFile); + assertColumnSize(fileFormat, dataFile); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterMetricsWithTruncateMode(FileFormat fileFormat) throws IOException { + int truncateLength = 5; + Schema schema = + new Schema( + Types.NestedField.required(1, "col_str", Types.StringType.get()), + Types.NestedField.required(2, "col_int", Types.IntegerType.get())); + + List records = Lists.newArrayList(); + records.add(GenericRecord.create(schema).copy("col_str", "abcdefghij", "col_int", 10)); + records.add(GenericRecord.create(schema).copy("col_str", "abcdezyxwv", "col_int", 20)); + records.add(GenericRecord.create(schema).copy("col_str", "abcdeAAAAA", "col_int", 5)); + + assertTruncateBoundsForFirstColumn( + fileFormat, + schema, + records, + truncateLength, + FEATURE_COLUMN_LEVEL_METRICS, + (lower, upper) -> { + // Lower bound: "abcdeAAAAA" truncated to "abcde" + CharSequence actualLower = Conversions.fromByteBuffer(Types.StringType.get(), lower); + assertThat(actualLower.toString()).hasSize(truncateLength); + assertThat(actualLower.toString()).isEqualTo("abcde"); + + // Upper bound: "abcdezyxwv" truncated and incremented to "abcdf" + CharSequence actualUpper = Conversions.fromByteBuffer(Types.StringType.get(), upper); + assertThat(actualUpper.toString()).hasSize(truncateLength); + assertThat(actualUpper.toString()).isEqualTo("abcdf"); + }); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterMetricsWithTruncateModeForBinary(FileFormat fileFormat) throws IOException { + int truncateLength = 5; + Schema schema = + new Schema( + Types.NestedField.required(1, "col_bin", Types.BinaryType.get()), + Types.NestedField.required(2, "col_int", Types.IntegerType.get())); + + List records = Lists.newArrayList(); + records.add( + GenericRecord.create(schema) + .copy( + "col_bin", + ByteBuffer.wrap( + new 
byte[] {0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x10, 0xA, 0xB}), + "col_int", + 10)); + + assertTruncateBoundsForFirstColumn( + fileFormat, + schema, + records, + truncateLength, + FEATURE_COLUMN_METRICS_TRUNCATE_BINARY, + (lower, upper) -> { + ByteBuffer actualLower = Conversions.fromByteBuffer(Types.BinaryType.get(), lower); + ByteBuffer actualUpper = Conversions.fromByteBuffer(Types.BinaryType.get(), upper); + + ByteBuffer expectedLower = ByteBuffer.wrap(new byte[] {0x1, 0x2, 0x3, 0x4, 0x5}); + ByteBuffer expectedUpper = ByteBuffer.wrap(new byte[] {0x1, 0x2, 0x3, 0x4, 0x6}); + + assertThat(actualLower).isEqualTo(expectedLower); + assertThat(actualUpper).isEqualTo(expectedUpper); + }); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testEqualityDeleteWriterMetricsCollection(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + FileWriterBuilder, Object> writerBuilder = + FormatModelRegistry.equalityDeleteWriteBuilder(fileFormat, Record.class, encryptedFile); + + EqualityDeleteWriter writer = + writerBuilder + .schema(schema) + .spec(PartitionSpec.unpartitioned()) + .equalityFieldIds(1) + .build(); + + List genericRecords = dataGenerator.generateRecords(); + + try (writer) { + genericRecords.forEach(writer::write); } - DataTestHelpers.assertEquals(schema.asStruct(), expected, readRecords); + + DeleteFile deleteFile = writer.toDeleteFile(); + + assertCounts(fileFormat, schema, genericRecords, deleteFile); + assertBounds(fileFormat, schema, genericRecords, deleteFile); + assertColumnSize(fileFormat, deleteFile); } - private void writeGenericRecords(FileFormat fileFormat, Schema schema, List records) - throws IOException { - FileWriterBuilder, Object> writerBuilder = - FormatModelRegistry.dataWriteBuilder(fileFormat, Record.class, encryptedFile); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void 
testPositionDeleteWriterMetricsSingleFile(FileFormat fileFormat) throws IOException { + // Single file reference: counts are removed but bounds are preserved. + List> deletes = + ImmutableList.of( + PositionDelete.create().set("d-file-1.file", 0L), + PositionDelete.create().set("d-file-1.file", 5L), + PositionDelete.create().set("d-file-1.file", 3L)); + + DeleteFile deleteFile = writePositionDeletes(fileFormat, deletes); + assertPositionDeleteMetrics(fileFormat, deletes, deleteFile, true /* checkBounds */); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testPositionDeleteWriterMetricsMultipleFiles(FileFormat fileFormat) throws IOException { + // Multiple file references: both counts and bounds are removed. + List> deletes = + ImmutableList.of( + PositionDelete.create().set("d-file-1.file", 0L), + PositionDelete.create().set("d-file-1.file", 5L), + PositionDelete.create().set("d-file-2.file", 3L)); + + DeleteFile deleteFile = writePositionDeletes(fileFormat, deletes); + assertPositionDeleteMetrics(fileFormat, deletes, deleteFile, false /* checkBounds */); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterMetricsWithPerColumnMode(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + + // Default mode is "counts", col_b is overridden to "full", col_a is overridden to "none" + MetricsConfig perColumnConfig = + config( + schema, + MetricsModes.Counts.get(), + ImmutableMap.of("col_b", MetricsModes.Full.get(), "col_a", MetricsModes.None.get())); + + List genericRecords = dataGenerator.generateRecords(); + DataFile dataFile = writeGenericRecords(fileFormat, schema, genericRecords, perColumnConfig); + + // col_a: mode=none -> no valueCounts, nullValueCounts, bounds + Schema noneSchema = new Schema(schema.findField("col_a")); + assertCountsNull(noneSchema, dataFile); + assertBoundsNull(noneSchema, dataFile); + + // col_b: 
mode=full -> valueCounts, nullValueCounts, and bounds all present + Schema fullSchema = new Schema(schema.findField("col_b")); + assertCounts(fileFormat, fullSchema, genericRecords, dataFile); + assertBounds(fileFormat, fullSchema, genericRecords, dataFile); + + // col_c, col_d, col_e: mode=counts (default) -> valueCounts and nullValueCounts present, + // but no bounds + Schema countsSchema = + new Schema(schema.findField("col_c"), schema.findField("col_d"), schema.findField("col_e")); + assertCounts(fileFormat, countsSchema, genericRecords, dataFile); + assertBoundsNull(countsSchema, dataFile); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterNanMetrics(FileFormat fileFormat) throws IOException { + Schema schema = new DataGenerators.FloatDoubleSchema().schema(); + + List records = Lists.newArrayList(); + records.add( + GenericRecord.create(schema).copy("col_float", Float.NaN, "col_double", Double.NaN)); + records.add( + GenericRecord.create(schema).copy("col_float", Float.NaN, "col_double", Double.NaN)); + records.add(GenericRecord.create(schema).copy("col_float", 1.0F, "col_double", 10.0D)); + records.add(GenericRecord.create(schema).copy("col_float", 5.0F, "col_double", 50.0D)); + records.add(GenericRecord.create(schema).copy("col_float", 3.0F, "col_double", 30.0D)); + + DataFile dataFile = writeGenericRecords(fileFormat, schema, records); + + assertCounts(fileFormat, schema, records, dataFile); + assertBounds(fileFormat, schema, records, dataFile); + assertNanCounts(fileFormat, schema, records, dataFile); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterNanSortingOrder(FileFormat fileFormat) throws IOException { + Schema schema = new DataGenerators.FloatDoubleSchema().schema(); + + List records = Lists.newArrayList(); + records.add( + GenericRecord.create(schema).copy("col_float", Float.NaN, "col_double", Double.NaN)); + records.add( + GenericRecord.create(schema) + .copy("col_float", 
Float.NEGATIVE_INFINITY, "col_double", Double.NEGATIVE_INFINITY)); + records.add(GenericRecord.create(schema).copy("col_float", -1.0F, "col_double", -1.0D)); + records.add(GenericRecord.create(schema).copy("col_float", -0.0F, "col_double", -0.0D)); + records.add(GenericRecord.create(schema).copy("col_float", 0.0F, "col_double", 0.0D)); + records.add(GenericRecord.create(schema).copy("col_float", 1.0F, "col_double", 1.0D)); + records.add( + GenericRecord.create(schema) + .copy("col_float", Float.POSITIVE_INFINITY, "col_double", Double.POSITIVE_INFINITY)); + records.add( + GenericRecord.create(schema).copy("col_float", Float.NaN, "col_double", Double.NaN)); + + DataFile dataFile = writeGenericRecords(fileFormat, schema, records); + + // Bounds should exclude NaN: float/double lower = -Infinity, upper = +Infinity + assertBounds(fileFormat, schema, records, dataFile); + assertNanCounts(fileFormat, schema, records, dataFile); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterNegativeZeroBounds(FileFormat fileFormat) throws IOException { + Schema schema = new DataGenerators.FloatDoubleSchema().schema(); + + List records = Lists.newArrayList(); + records.add(GenericRecord.create(schema).copy("col_float", -0.0F, "col_double", -0.0D)); + records.add(GenericRecord.create(schema).copy("col_float", 0.0F, "col_double", 0.0D)); + + DataFile dataFile = writeGenericRecords(fileFormat, schema, records); + assertBounds(fileFormat, schema, records, dataFile); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnFilePath(FileFormat fileFormat) throws IOException { + + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, schema, genericRecords); + + String filePath = "test-data-file.parquet"; + Schema projectionSchema = new Schema(MetadataColumns.FILE_PATH); + + Map idToConstant = 
+ ImmutableMap.of(MetadataColumns.FILE_PATH.fieldId(), filePath); + + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + idToConstant, + genericRecords, + ignored -> + GenericRecord.create(projectionSchema) + .copy(MetadataColumns.FILE_PATH.name(), filePath)); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnSpecId(FileFormat fileFormat) throws IOException { + + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, schema, genericRecords); + + int specId = 0; + Schema projectionSchema = new Schema(MetadataColumns.SPEC_ID); + + Map idToConstant = ImmutableMap.of(MetadataColumns.SPEC_ID.fieldId(), specId); + + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + idToConstant, + genericRecords, + ignored -> + GenericRecord.create(projectionSchema).copy(MetadataColumns.SPEC_ID.name(), specId)); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnRowPosition(FileFormat fileFormat) throws IOException { + + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, schema, genericRecords); + + Schema projectionSchema = new Schema(MetadataColumns.ROW_POSITION); + + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + null, + genericRecords, + (position, ignored) -> + GenericRecord.create(projectionSchema) + .copy(MetadataColumns.ROW_POSITION.name(), (long) position)); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnIsDeleted(FileFormat fileFormat) throws IOException { + + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + List genericRecords = dataGenerator.generateRecords(); + 
writeGenericRecords(fileFormat, schema, genericRecords); + + Schema projectionSchema = new Schema(MetadataColumns.IS_DELETED); + + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + null, + genericRecords, + ignored -> + GenericRecord.create(projectionSchema).copy(MetadataColumns.IS_DELETED.name(), false)); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnRowLinage(FileFormat fileFormat) throws IOException { + assumeSupports(fileFormat, FEATURE_META_ROW_LINEAGE); + + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, schema, genericRecords); + + long baseRowId = 100L; + long fileSeqNumber = 5L; + Schema projectionSchema = + new Schema(MetadataColumns.ROW_ID, MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER); + + Map idToConstant = + ImmutableMap.of( + MetadataColumns.ROW_ID.fieldId(), baseRowId, + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId(), fileSeqNumber); + + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + idToConstant, + genericRecords, + (position, ignored) -> + GenericRecord.create(projectionSchema) + .copy( + MetadataColumns.ROW_ID.name(), + baseRowId + position, + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.name(), + fileSeqNumber)); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnRowLinageExistValue(FileFormat fileFormat) throws IOException { + assumeSupports(fileFormat, FEATURE_META_ROW_LINEAGE); + + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema dataSchema = dataGenerator.schema(); + + Schema writeSchema = MetadataColumns.schemaWithRowLineage(dataSchema); + + List baseRecords = dataGenerator.generateRecords(); + List writeRecords = Lists.newArrayListWithExpectedSize(baseRecords.size()); + for (int i = 0; i < baseRecords.size(); i++) { + Record base = 
baseRecords.get(i); + Record rec = copy(base, dataSchema, writeSchema); + + if (i % 2 == 0) { + rec.setField(MetadataColumns.ROW_ID.name(), 555L + i); + rec.setField(MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.name(), 7L); + } else { + rec.setField(MetadataColumns.ROW_ID.name(), null); + rec.setField(MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.name(), null); + } + + writeRecords.add(rec); + } DataWriter writer = - writerBuilder.schema(schema).spec(PartitionSpec.unpartitioned()).build(); + FormatModelRegistry.dataWriteBuilder(fileFormat, Record.class, encryptedFile) + .schema(writeSchema) + .spec(PartitionSpec.unpartitioned()) + .build(); try (writer) { - records.forEach(writer::write); + writeRecords.forEach(writer::write); } - DataFile dataFile = writer.toDataFile(); - assertThat(dataFile).isNotNull(); - assertThat(dataFile.recordCount()).isEqualTo(records.size()); - assertThat(dataFile.format()).isEqualTo(fileFormat); + long baseRowId = 100L; + long fileSeqNumber = 5L; + Schema projectionSchema = + new Schema(MetadataColumns.ROW_ID, MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER); + + Map idToConstant = + ImmutableMap.of( + MetadataColumns.ROW_ID.fieldId(), baseRowId, + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId(), fileSeqNumber); + + // Expected results: + // - Even rows (explicit values): _row_id = 555+i, _last_updated_sequence_number = 7 + // - Odd rows (null values): _row_id = baseRowId+pos, _last_updated_sequence_number = + // fileSeqNumber + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + idToConstant, + baseRecords, + (position, ignored) -> { + if (position % 2 == 0) { + return GenericRecord.create(projectionSchema) + .copy( + MetadataColumns.ROW_ID.name(), + 555L + position, + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.name(), + 7L); + } else { + return GenericRecord.create(projectionSchema) + .copy( + MetadataColumns.ROW_ID.name(), + baseRowId + position, + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.name(), + 
fileSeqNumber); + } + }); } - private List projectRecords(List records, Schema projectedSchema) { - return records.stream() - .map( - record -> { - Record projected = GenericRecord.create(projectedSchema.asStruct()); - projectedSchema - .columns() - .forEach( - field -> projected.setField(field.name(), record.getField(field.name()))); - return projected; - }) - .toList(); - } + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnPartitionIdentity(FileFormat fileFormat) throws IOException { - private List convertToEngineRecords(List records, Schema schema) { - return records.stream().map(r -> convertToEngine(r, schema)).toList(); + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + PartitionSpec spec = PartitionSpec.builderFor(dataGenerator.schema()).identity("col_a").build(); + + Types.StructType partitionType = spec.partitionType(); + PartitionData partitionData = new PartitionData(partitionType); + partitionData.set(0, "test_col_a"); + + DataWriter writer = + FormatModelRegistry.dataWriteBuilder(fileFormat, Record.class, encryptedFile) + .schema(dataGenerator.schema()) + .spec(PartitionSpec.unpartitioned()) + .build(); + + List records = dataGenerator.generateRecords(); + try (writer) { + records.forEach(writer::write); + } + + Types.NestedField partitionField = + Types.NestedField.optional( + MetadataColumns.PARTITION_COLUMN_ID, + MetadataColumns.PARTITION_COLUMN_NAME, + partitionType, + MetadataColumns.PARTITION_COLUMN_DOC); + Schema projectionSchema = new Schema(partitionField); + + Map idToConstant = + ImmutableMap.of(MetadataColumns.PARTITION_COLUMN_ID, partitionData); + + Record partitionRecord = structLikeToRecord(partitionData, partitionType); + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + idToConstant, + records, + ignored -> + GenericRecord.create(projectionSchema) + .copy(MetadataColumns.PARTITION_COLUMN_NAME, partitionRecord)); } - private static void assumeSupports(FileFormat fileFormat, 
String feature) { - assumeThat(MISSING_FEATURES.getOrDefault(fileFormat, new String[] {})).doesNotContain(feature); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnPartitionEvolutionAddColumn(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema dataSchema = dataGenerator.schema(); + + // Old spec: partition by col_a only (spec id = 0) + PartitionSpec oldSpec = PartitionSpec.builderFor(dataSchema).identity("col_a").build(); + + // New spec: partition by col_a + col_b (spec id = 1, simulates partition evolution) + PartitionSpec newSpec = + PartitionSpec.builderFor(dataSchema) + .withSpecId(1) + .identity("col_a") + .identity("col_b") + .build(); + + // Partition data for the old file (only col_a is set, col_b is absent) + PartitionData oldPartitionData = new PartitionData(oldSpec.partitionType()); + oldPartitionData.set(0, "test_data"); + + // Write data using the old spec + DataWriter writer = + FormatModelRegistry.dataWriteBuilder(fileFormat, Record.class, encryptedFile) + .schema(dataSchema) + .spec(PartitionSpec.unpartitioned()) + .build(); + + List records = dataGenerator.generateRecords(); + + try (writer) { + records.forEach(writer::write); + } + + Types.StructType unifiedPartitionType = newSpec.partitionType(); + + // Build projection schema with PARTITION_COLUMN using the unified partition type + Types.NestedField partitionField = + Types.NestedField.optional( + MetadataColumns.PARTITION_COLUMN_ID, + MetadataColumns.PARTITION_COLUMN_NAME, + unifiedPartitionType, + MetadataColumns.PARTITION_COLUMN_DOC); + Schema projectionSchema = new Schema(partitionField); + + Map idToConstant = + ImmutableMap.of(MetadataColumns.PARTITION_COLUMN_ID, oldPartitionData); + + Record partitionRecord = structLikeToRecord(oldPartitionData, unifiedPartitionType); + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + idToConstant, + records, + ignored -> + 
GenericRecord.create(projectionSchema) + .copy(MetadataColumns.PARTITION_COLUMN_NAME, partitionRecord)); } - private DataFile writeRecordsForSplit(FileFormat fileFormat, Schema schema, List records) + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnPartitionEvolutionRemoveColumn(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema dataSchema = dataGenerator.schema(); + + PartitionSpec oldSpec = + PartitionSpec.builderFor(dataSchema).identity("col_a").identity("col_b").build(); + + PartitionSpec newSpec = + PartitionSpec.builderFor(dataSchema).withSpecId(1).identity("col_a").build(); + + // Partition data for the old file (both col_a and col_b are set) + PartitionData oldPartitionData = new PartitionData(oldSpec.partitionType()); + oldPartitionData.set(0, "test_col_a"); + oldPartitionData.set(1, 1); - String splitSizeProperty = splitSizeProperty(fileFormat); DataWriter writer = FormatModelRegistry.dataWriteBuilder(fileFormat, Record.class, encryptedFile) - .schema(schema) + .schema(dataSchema) .spec(PartitionSpec.unpartitioned()) - .set(splitSizeProperty, "1") .build(); + List records = dataGenerator.generateRecords(); + try (writer) { records.forEach(writer::write); } - DataFile dataFile = writer.toDataFile(); - List splitOffsets = dataFile.splitOffsets(); - assertThat(splitOffsets) - .as( - "Expected multiple split offsets. " - + "If this fails, the file did not produce multiple splits. 
" - + "Try reducing the split size property (see writeRecordsForSplit) " - + "or increasing the number of records written.") - .hasSizeGreaterThan(1); - - assertThat(dataFile.format()).isEqualTo(fileFormat); - return dataFile; + // Use the new spec's partition type for projection (only col_a remains after evolution) + // This simulates reading an old file from the perspective of the new spec + Types.StructType newPartitionType = newSpec.partitionType(); + Types.NestedField partitionField = + Types.NestedField.optional( + MetadataColumns.PARTITION_COLUMN_ID, + MetadataColumns.PARTITION_COLUMN_NAME, + newPartitionType, + MetadataColumns.PARTITION_COLUMN_DOC); + Schema projectionSchema = new Schema(partitionField); + + Map idToConstant = + ImmutableMap.of(MetadataColumns.PARTITION_COLUMN_ID, oldPartitionData); + + Record partitionRecord = structLikeToRecord(oldPartitionData, newPartitionType); + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + idToConstant, + records, + ignored -> + GenericRecord.create(projectionSchema) + .copy(MetadataColumns.PARTITION_COLUMN_NAME, partitionRecord)); } - private static String splitSizeProperty(FileFormat fileFormat) { + private void readAndAssertGenericRecords( + FileFormat fileFormat, + Schema schema, + List sourceRecords, + Function transform) + throws IOException { + readAndAssertGenericRecords(fileFormat, schema, sourceRecords.stream().map(transform).toList()); + } + + /** + * Schema evolution: Adding column (reading with wider schema). Write with DefaultSchema, read + * with additional optional columns. The new columns should be filled with null values. 
+ */ + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionAddColumn(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema writeSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + List evolvedColumns = Lists.newArrayList(writeSchema.columns()); + + int maxFieldId = + writeSchema.columns().stream().mapToInt(Types.NestedField::fieldId).max().orElse(0); + evolvedColumns.add( + Types.NestedField.optional("new_string_col") + .withId(maxFieldId + 1) + .ofType(Types.StringType.get()) + .build()); + evolvedColumns.add( + Types.NestedField.optional("new_int_col") + .withId(maxFieldId + 2) + .ofType(Types.IntegerType.get()) + .build()); + Schema readSchema = new Schema(evolvedColumns); + readAndAssertEngineRecords( + fileFormat, + readSchema, + genericRecords, + record -> { + Record expected = copy(record, writeSchema, readSchema); + + expected.setField("new_string_col", null); + expected.setField("new_int_col", null); + return expected; + }); + } + + /** + * Schema evolution: Projection / Removing column (reading with narrower schema). Write with + * DefaultSchema, read with only a subset of columns (skipping middle columns). 
+ */ + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionProjection(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema writeSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + List writeColumns = writeSchema.columns(); + assumeThat(writeColumns).hasSizeGreaterThanOrEqualTo(2); + Schema projectedSchema = + new Schema(writeColumns.get(0), writeColumns.get(writeColumns.size() - 1)); + + readAndAssertEngineRecords( + fileFormat, + projectedSchema, + genericRecords, + record -> copy(record, projectedSchema, projectedSchema)); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionDropAndReAddSameNameColumn(FileFormat fileFormat) throws IOException { + + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema writeSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + // Remove col_b and add a new col_b with a different field ID + Schema readSchema = + new Schema( + Types.NestedField.required(1, "col_a", Types.StringType.get()), + Types.NestedField.optional(6, "col_b", Types.IntegerType.get()), + Types.NestedField.required(3, "col_c", Types.LongType.get()), + Types.NestedField.required(4, "col_d", Types.FloatType.get()), + Types.NestedField.required(5, "col_e", Types.DoubleType.get())); + + readAndAssertEngineRecords( + fileFormat, + readSchema, + genericRecords, + record -> { + Record expected = GenericRecord.create(readSchema); + expected.setField("col_a", record.getField("col_a")); + expected.setField("col_b", null); + expected.setField("col_c", record.getField("col_c")); + expected.setField("col_d", record.getField("col_d")); + expected.setField("col_e", record.getField("col_e")); + return expected; + }); + } + + 
@ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionTypePromotionIntToLong(FileFormat fileFormat) throws IOException { + runTypePromotionCheck( + fileFormat, + Types.IntegerType.get(), + Types.LongType.get(), + value -> value == null ? null : ((Integer) value).longValue()); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionTypePromotionFloatToDouble(FileFormat fileFormat) throws IOException { + runTypePromotionCheck( + fileFormat, + Types.FloatType.get(), + Types.DoubleType.get(), + value -> value == null ? null : ((Float) value).doubleValue()); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionTypePromotionDecimalPrecision(FileFormat fileFormat) throws IOException { + runTypePromotionCheck( + fileFormat, Types.DecimalType.of(9, 2), Types.DecimalType.of(18, 2), Function.identity()); + } + + /** + * Schema evolution: Reorder columns. Write with DefaultSchema {col_a, col_b, col_c, col_d, + * col_e}, read with reordered schema {col_e, col_c, col_a, col_d, col_b}. 
+ */ + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionReorderColumns(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema writeSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + Schema reorderedSchema = + new Schema( + Types.NestedField.required(5, "col_e", Types.DoubleType.get()), + Types.NestedField.required(3, "col_c", Types.LongType.get()), + Types.NestedField.required(1, "col_a", Types.StringType.get()), + Types.NestedField.required(4, "col_d", Types.FloatType.get()), + Types.NestedField.required(2, "col_b", Types.IntegerType.get())); + + readAndAssertEngineRecords( + fileFormat, + reorderedSchema, + genericRecords, + record -> copy(record, reorderedSchema, reorderedSchema)); + } + + /** + * Schema evolution: Rename column. Write with DefaultSchema where col_b has field ID 2. Read with + * a schema where the same field ID 2 is renamed to "column_b". Since Iceberg binds by field ID, + * the renamed column should still read the original data correctly. 
+ */ + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionRenameColumn(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema writeSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + // rename col_b(id=2) -> column_b, col_d(id=4) -> column_d + Schema renamedSchema = + new Schema( + Types.NestedField.required(1, "col_a", Types.StringType.get()), + Types.NestedField.required(2, "column_b", Types.IntegerType.get()), + Types.NestedField.required(3, "col_c", Types.LongType.get()), + Types.NestedField.required(4, "column_d", Types.FloatType.get()), + Types.NestedField.required(5, "col_e", Types.DoubleType.get())); + + readAndAssertEngineRecords( + fileFormat, + renamedSchema, + genericRecords, + record -> { + Record expected = GenericRecord.create(renamedSchema); + expected.setField("col_a", record.getField("col_a")); + expected.setField("column_b", record.getField("col_b")); + expected.setField("col_c", record.getField("col_c")); + expected.setField("column_d", record.getField("col_d")); + expected.setField("col_e", record.getField("col_e")); + return expected; + }); + } + + /** + * Schema evolution: Required → Optional. Write with DefaultSchema where all columns are required. + * Read with a schema where some columns are changed to optional. Iceberg allows widening required + * to optional. The data should still be read correctly. 
+ */ + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionRequiredToOptional(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema writeSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + // change col_b and col_d to optional + Schema readSchema = + new Schema( + Types.NestedField.required(1, "col_a", Types.StringType.get()), + Types.NestedField.optional(2, "col_b", Types.IntegerType.get()), + Types.NestedField.required(3, "col_c", Types.LongType.get()), + Types.NestedField.optional(4, "col_d", Types.FloatType.get()), + Types.NestedField.required(5, "col_e", Types.DoubleType.get())); + + readAndAssertEngineRecords( + fileFormat, readSchema, genericRecords, record -> copy(record, readSchema, readSchema)); + } + + /** + * Schema evolution: Read with empty projection. Write with DefaultSchema, read with an empty + * schema (no columns). The reader should return the correct number of rows but with no data + * columns. 
+ */ + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionEmptyProjection(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema writeSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + Schema emptySchema = new Schema(); + + InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile(); + List readRecords; + try (CloseableIterable reader = + FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) + .project(emptySchema) + .build()) { + readRecords = ImmutableList.copyOf(reader); + } + + assertThat(readRecords).hasSameSizeAs(genericRecords); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadFileWithoutFieldIdsUsingNameMapping(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema icebergSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + + // Write the file WITHOUT Iceberg field IDs (as an external writer would). 
+ writeRecordsWithoutFieldIds(fileFormat, icebergSchema, genericRecords); + + NameMapping nameMapping = MappingUtil.create(icebergSchema); + + InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile(); + List readRecords; + try (CloseableIterable reader = + FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) + .project(icebergSchema) + .withNameMapping(nameMapping) + .build()) { + readRecords = ImmutableList.copyOf(reader); + } + + assertEquals(icebergSchema, convertToEngineRecords(genericRecords, icebergSchema), readRecords); + } + + private void readAndAssertGenericRecords( + FileFormat fileFormat, Schema schema, List expected) throws IOException { + InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile(); + List readRecords; + try (CloseableIterable reader = + FormatModelRegistry.readBuilder(fileFormat, Record.class, inputFile) + .project(schema) + .build()) { + readRecords = ImmutableList.copyOf(reader); + } + + DataTestHelpers.assertEquals(schema.asStruct(), expected, readRecords); + } + + private DataFile writeGenericRecords(FileFormat fileFormat, Schema schema, List records) + throws IOException { + return writeGenericRecords(fileFormat, schema, records, null); + } + + private DataFile writeGenericRecords( + FileFormat fileFormat, Schema schema, List records, MetricsConfig metricsConfig) + throws IOException { + FileWriterBuilder, Object> writerBuilder = + FormatModelRegistry.dataWriteBuilder(fileFormat, Record.class, encryptedFile); + + if (metricsConfig != null) { + writerBuilder.metricsConfig(metricsConfig); + } + + DataWriter writer = + writerBuilder.schema(schema).spec(PartitionSpec.unpartitioned()).build(); + + try (writer) { + records.forEach(writer::write); + } + + DataFile dataFile = writer.toDataFile(); + assertThat(dataFile).isNotNull(); + assertThat(dataFile.recordCount()).isEqualTo(records.size()); + assertThat(dataFile.format()).isEqualTo(fileFormat); + + return dataFile; + } + + private List 
convertToEngineRecords(List records, Schema schema) { + return records.stream().map(r -> convertToEngine(r, schema)).toList(); + } + + private static void assumeSupports(FileFormat fileFormat, String feature) { + assumeThat(MISSING_FEATURES.getOrDefault(fileFormat, new String[] {})).doesNotContain(feature); + } + + /** + * Returns whether the given file format supports the specified feature. + * + *

The check is based on {@link #MISSING_FEATURES}. Features not listed as missing for a format + * are treated as supported. + * + *

Prefer this method over {@link #assumeSupports(FileFormat, String)} when only part of a test + * should be skipped conditionally. Unlike {@code assumeSupports}, this method does not abort the + * entire test via an assumption failure; it returns {@code false} so callers can skip only + * feature-specific assertions while still validating shared behavior. + * + * @param fileFormat the file format under test + * @param feature the feature name + * @return {@code true} if the feature is supported by the format; {@code false} otherwise + */ + private static boolean supportsFeature(FileFormat fileFormat, String feature) { + String[] missing = MISSING_FEATURES.getOrDefault(fileFormat, new String[] {}); + return !Arrays.asList(missing).contains(feature); + } + + private DataFile writeRecordsForSplit(FileFormat fileFormat, Schema schema, List records) + throws IOException { + + String splitSizeProperty = splitSizeProperty(fileFormat); + DataWriter writer = + FormatModelRegistry.dataWriteBuilder(fileFormat, Record.class, encryptedFile) + .schema(schema) + .spec(PartitionSpec.unpartitioned()) + .set(splitSizeProperty, "1") + .build(); + + try (writer) { + records.forEach(writer::write); + } + + DataFile dataFile = writer.toDataFile(); + List splitOffsets = dataFile.splitOffsets(); + assertThat(splitOffsets) + .as( + "Expected multiple split offsets. " + + "If this fails, the file did not produce multiple splits. 
" + + "Try reducing the split size property (see writeRecordsForSplit) " + + "or increasing the number of records written.") + .hasSizeGreaterThan(1); + + assertThat(dataFile.format()).isEqualTo(fileFormat); + return dataFile; + } + + private static String splitSizeProperty(FileFormat fileFormat) { return switch (fileFormat) { case PARQUET -> TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; case ORC -> TableProperties.ORC_STRIPE_SIZE_BYTES; @@ -719,4 +1742,489 @@ private static String splitSizeProperty(FileFormat fileFormat) { "No split size property defined for format: " + fileFormat); }; } + + private static void assertCounts( + FileFormat fileFormat, Schema schema, List genericRecords, ContentFile file) { + if (!supportsFeature(fileFormat, FEATURE_COLUMN_LEVEL_METRICS)) { + return; + } + + Map valueCounts = file.valueCounts(); + Map nullValueCounts = file.nullValueCounts(); + for (Types.NestedField field : schema.columns()) { + if (field.type().isPrimitiveType()) { + assertThat(valueCounts).containsKey(field.fieldId()); + assertThat(nullValueCounts).containsKey(field.fieldId()); + + long nullCount = + genericRecords.stream().filter(r -> r.getField(field.name()) == null).count(); + + assertThat(valueCounts.get(field.fieldId())).isEqualTo(genericRecords.size()); + assertThat(nullValueCounts.get(field.fieldId())).isEqualTo(nullCount); + } + } + } + + private static void assertBounds( + FileFormat fileFormat, Schema schema, List genericRecords, ContentFile file) { + if (!supportsFeature(fileFormat, FEATURE_COLUMN_LEVEL_METRICS)) { + return; + } + + Map lowerBounds = file.lowerBounds(); + Map upperBounds = file.upperBounds(); + for (Types.NestedField field : schema.columns()) { + if (field.type().isPrimitiveType()) { + assertThat(lowerBounds).containsKey(field.fieldId()); + assertThat(upperBounds).containsKey(field.fieldId()); + + ByteBuffer lowerBuffer = lowerBounds.get(field.fieldId()); + ByteBuffer upperBuffer = upperBounds.get(field.fieldId()); + + Comparator cmp = 
Comparators.forType(field.type().asPrimitiveType()); + + Object[] minMax = computeMinMax(genericRecords, field, cmp); + Object expectedMin = minMax[0]; + Object expectedMax = minMax[1]; + + if (expectedMin != null) { + assertThat(lowerBuffer).isNotNull(); + Object actualLower = Conversions.fromByteBuffer(field.type(), lowerBuffer); + assertThat(cmp.compare(actualLower, expectedMin)).isEqualTo(0); + } + + if (expectedMax != null) { + assertThat(upperBuffer).isNotNull(); + Object actualUpper = Conversions.fromByteBuffer(field.type(), upperBuffer); + assertThat(cmp.compare(actualUpper, expectedMax)).isEqualTo(0); + } + } + } + } + + private static Object[] computeMinMax( + List records, Types.NestedField field, Comparator cmp) { + Object min = null; + Object max = null; + for (Record record : records) { + Object value = record.getField(field.name()); + if (value == null) { + continue; + } + + if (value instanceof Float && ((Float) value).isNaN()) { + continue; + } + + if (value instanceof Double && ((Double) value).isNaN()) { + continue; + } + + if (min == null || cmp.compare(value, min) < 0) { + min = value; + } + + if (max == null || cmp.compare(value, max) > 0) { + max = value; + } + } + + return new Object[] {min, max}; + } + + private static void assertBoundsNull(Schema schema, ContentFile file) { + Map lowerBounds = file.lowerBounds(); + Map upperBounds = file.upperBounds(); + for (Types.NestedField field : schema.columns()) { + if (field.type().isPrimitiveType()) { + assertThat(lowerBounds == null || lowerBounds.get(field.fieldId()) == null).isTrue(); + assertThat(upperBounds == null || upperBounds.get(field.fieldId()) == null).isTrue(); + } + } + } + + private static void assertColumnSize(FileFormat fileFormat, ContentFile file) { + if (!supportsFeature(fileFormat, FEATURE_COLUMN_LEVEL_METRICS)) { + return; + } + + assertThat(file.columnSizes()).isNotNull().isNotEmpty(); + } + + private static void assertColumnSizeEmpty(FileFormat fileFormat, ContentFile file) 
{ + if (!supportsFeature(fileFormat, FEATURE_COLUMN_LEVEL_METRICS)) { + return; + } + + assertThat(file.columnSizes()).isEmpty(); + } + + private static void assertCountsNull(Schema schema, ContentFile file) { + Map valueCounts = file.valueCounts(); + Map nullValueCounts = file.nullValueCounts(); + for (Types.NestedField field : schema.columns()) { + if (field.type().isPrimitiveType()) { + assertThat(valueCounts == null || valueCounts.get(field.fieldId()) == null).isTrue(); + assertThat(nullValueCounts == null || nullValueCounts.get(field.fieldId()) == null) + .isTrue(); + } + } + } + + private static void assertNanCounts( + FileFormat fileFormat, Schema schema, List records, ContentFile file) { + if (!supportsFeature(fileFormat, FEATURE_COLUMN_LEVEL_METRICS)) { + return; + } + + Map nanValueCounts = file.nanValueCounts(); + assertThat(nanValueCounts).isNotNull(); + + for (Types.NestedField field : schema.columns()) { + if (field.type().typeId() == Type.TypeID.FLOAT + || field.type().typeId() == Type.TypeID.DOUBLE) { + long expectedNanCount = + records.stream() + .map(r -> r.getField(field.name())) + .filter( + v -> + (v instanceof Float && ((Float) v).isNaN()) + || (v instanceof Double && ((Double) v).isNaN())) + .count(); + assertThat(nanValueCounts.get(field.fieldId())).isEqualTo(expectedNanCount); + } + } + } + + private DeleteFile writePositionDeletes(FileFormat fileFormat, List> deletes) + throws IOException { + FileWriterBuilder, ?> writerBuilder = + FormatModelRegistry.positionDeleteWriteBuilder(fileFormat, encryptedFile); + + PositionDeleteWriter writer = writerBuilder.spec(PartitionSpec.unpartitioned()).build(); + try (writer) { + deletes.forEach(writer::write); + } + + return writer.toDeleteFile(); + } + + private void assertPositionDeleteMetrics( + FileFormat fileFormat, + List> deletes, + DeleteFile deleteFile, + boolean checkBounds) { + Schema positionDeleteSchema = DeleteSchemaUtil.pathPosSchema(); + + assertThat(deleteFile).isNotNull(); + 
assertThat(deleteFile.recordCount()).isEqualTo(deletes.size()); + assertCountsNull(positionDeleteSchema, deleteFile); + + assumeSupports(fileFormat, FEATURE_COLUMN_LEVEL_METRICS); + + if (checkBounds) { + // Single file reference: bounds are preserved + List genericRecords = + deletes.stream() + .map( + d -> + GenericRecord.create(positionDeleteSchema) + .copy( + DELETE_FILE_PATH.name(), d.path(), + DELETE_FILE_POS.name(), d.pos())) + .toList(); + assertBounds(fileFormat, positionDeleteSchema, genericRecords, deleteFile); + } else { + // Multiple file references: bounds are also removed + assertBoundsNull(positionDeleteSchema, deleteFile); + } + } + + private MetricsConfig config(Schema schema, MetricsMode defaultMode) { + return config(schema, defaultMode, ImmutableMap.of()); + } + + private MetricsConfig config( + Schema schema, MetricsMode defaultMode, Map columnModes) { + ImmutableMap.Builder properties = ImmutableMap.builder(); + properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, defaultMode.toString()); + columnModes.forEach( + (column, mode) -> + properties.put( + TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + column, mode.toString())); + + TestTables.TestTable table = + TestTables.create( + tableDir, "test", schema, PartitionSpec.unpartitioned(), 3, properties.build()); + + return MetricsConfig.forTable(table); + } + + private void assertTruncateBoundsForFirstColumn( + FileFormat fileFormat, + Schema schema, + List records, + int truncateLength, + String requiredFeature, + BiConsumer boundsAssertion) + throws IOException { + MetricsConfig truncateConfig = config(schema, MetricsModes.Truncate.withLength(truncateLength)); + + DataFile dataFile = writeGenericRecords(fileFormat, schema, records, truncateConfig); + assertCounts(fileFormat, schema, records, dataFile); + + if (!supportsFeature(fileFormat, requiredFeature)) { + return; + } + + Map lowerBounds = dataFile.lowerBounds(); + Map upperBounds = dataFile.upperBounds(); + + 
assertThat(lowerBounds).containsKey(1); + assertThat(upperBounds).containsKey(1); + + boundsAssertion.accept(lowerBounds.get(1), upperBounds.get(1)); + + Schema intSchema = new Schema(schema.findField("col_int")); + assertBounds(fileFormat, intSchema, records, dataFile); + + assertThat(dataFile.columnSizes()).isNotNull().isNotEmpty(); + } + + private Map convertConstantsToEngine( + Schema projectionSchema, Map idToConstant) { + return idToConstant.entrySet().stream() + .collect( + ImmutableMap.toImmutableMap( + Map.Entry::getKey, + entry -> + convertConstantToEngine( + projectionSchema.findType(entry.getKey()), entry.getValue()))); + } + + private static Record structLikeToRecord(StructLike structLike, Types.StructType structType) { + Record record = GenericRecord.create(structType); + int sourceSize = structLike.size(); + for (int i = 0; i < structType.fields().size(); i++) { + if (i < sourceSize) { + record.set(i, structLike.get(i, Object.class)); + } else { + Types.NestedField field = structType.fields().get(i); + record.set(i, field.initialDefault()); + } + } + + return record; + } + + private void readAndAssertMetadataColumn( + FileFormat fileFormat, + Schema projectionSchema, + Map idToConstant, + List sourceRecords, + Function transform) + throws IOException { + readAndAssertMetadataColumn( + fileFormat, projectionSchema, idToConstant, sourceRecords.stream().map(transform).toList()); + } + + private void readAndAssertMetadataColumn( + FileFormat fileFormat, + Schema projectionSchema, + Map idToConstant, + List sourceRecords, + BiFunction transform) + throws IOException { + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + idToConstant, + IntStream.range(0, sourceRecords.size()) + .mapToObj(index -> transform.apply(index, sourceRecords.get(index))) + .toList()); + } + + private void readAndAssertMetadataColumn( + FileFormat fileFormat, + Schema projectionSchema, + Map idToConstant, + List expectedRecords) + throws IOException { + + InputFile 
inputFile = encryptedFile.encryptingOutputFile().toInputFile(); + List readRecords; + + var readerBuilder = + FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) + .project(projectionSchema); + + if (idToConstant != null) { + readerBuilder.idToConstant(convertConstantsToEngine(projectionSchema, idToConstant)); + } + + try (CloseableIterable reader = readerBuilder.build()) { + readRecords = ImmutableList.copyOf(reader); + } + + assertThat(readRecords).hasSize(expectedRecords.size()); + assertEquals( + projectionSchema, convertToEngineRecords(expectedRecords, projectionSchema), readRecords); + } + + private static Record copy(Record source, Schema sourceSchema, Schema targetSchema) { + Record result = GenericRecord.create(targetSchema); + for (Types.NestedField col : sourceSchema.columns()) { + result.setField(col.name(), source.getField(col.name())); + } + + return result; + } + + private void writeRecordsWithoutFieldIds( + FileFormat fileFormat, Schema schema, List records) throws IOException { + switch (fileFormat) { + case PARQUET -> writeParquetWithoutFieldIds(schema, records); + case AVRO -> writeAvroWithoutFieldIds(schema, records); + case ORC -> writeOrcWithoutFieldIds(schema, records); + default -> throw new UnsupportedOperationException("Unsupported file format: " + fileFormat); + } + } + + private void writeAvroWithoutFieldIds(Schema schema, List records) throws IOException { + org.apache.avro.Schema avroSchemaWithoutIds = AvroTestHelpers.removeIds(schema); + + OutputFile outputFile = encryptedFile.encryptingOutputFile(); + DatumWriter datumWriter = new GenericDatumWriter<>(avroSchemaWithoutIds); + try (OutputStream out = outputFile.create(); + DataFileWriter writer = new DataFileWriter<>(datumWriter)) { + writer.create(avroSchemaWithoutIds, out); + for (Record record : records) { + GenericData.Record avroRecord = new GenericData.Record(avroSchemaWithoutIds); + for (Types.NestedField field : schema.columns()) { + 
avroRecord.put(field.name(), record.getField(field.name())); + } + + writer.append(avroRecord); + } + } + + try (DataFileStream reader = + new DataFileStream<>(outputFile.toInputFile().newStream(), new GenericDatumReader<>())) { + assertThat(AvroTestHelpers.hasIds(reader.getSchema())).isFalse(); + } + } + + private void writeParquetWithoutFieldIds(Schema schema, List records) throws IOException { + org.apache.avro.Schema avroSchemaWithoutIds = AvroTestHelpers.removeIds(schema); + + OutputFile outputFile = encryptedFile.encryptingOutputFile(); + + try (ParquetWriter writer = + AvroParquetWriter.builder(ParquetFileTestUtils.file(outputFile)) + .withDataModel(GenericData.get()) + .withSchema(avroSchemaWithoutIds) + .withConf(new Configuration()) + .build()) { + for (Record record : records) { + GenericData.Record avroRecord = new GenericData.Record(avroSchemaWithoutIds); + for (Types.NestedField field : schema.columns()) { + avroRecord.put(field.name(), record.getField(field.name())); + } + + writer.write(avroRecord); + } + } + + try (ParquetFileReader reader = + ParquetFileReader.open(ParquetFileTestUtils.file(outputFile.toInputFile()))) { + assertThat(ParquetSchemaUtil.hasIds(reader.getFooter().getFileMetaData().getSchema())) + .isFalse(); + } + } + + private void writeOrcWithoutFieldIds(Schema schema, List records) throws IOException { + TypeDescription typeWithIds = ORCSchemaUtil.convert(schema); + TypeDescription typeWithoutIds = TestORCSchemaUtil.removeIds(typeWithIds); + + OutputFile outputFile = encryptedFile.encryptingOutputFile(); + Path hadoopPath = new Path(outputFile.location()); + + Configuration conf = new Configuration(); + OrcFile.WriterOptions options = + OrcFile.writerOptions(conf) + .useUTCTimestamp(true) + .setSchema(typeWithoutIds) + .fileSystem(OrcWritingTestUtils.outputFileSystem(outputFile)); + + OrcRowWriter rowWriter = GenericOrcWriter.buildWriter(schema, typeWithIds); + + try (Writer orcWriter = OrcFile.createWriter(hadoopPath, options)) { 
+ VectorizedRowBatch batch = typeWithoutIds.createRowBatch(); + for (Record record : records) { + rowWriter.write(record, batch); + if (batch.size == batch.getMaxSize()) { + orcWriter.addRowBatch(batch); + batch.reset(); + } + } + + if (batch.size > 0) { + orcWriter.addRowBatch(batch); + batch.reset(); + } + } + + InputFile inputFile = outputFile.toInputFile(); + OrcFile.ReaderOptions readerOptions = + OrcFile.readerOptions(conf) + .useUTCTimestamp(true) + .filesystem(OrcWritingTestUtils.inputFileSystem(inputFile)) + .maxLength(inputFile.getLength()); + + try (Reader reader = OrcFile.createReader(hadoopPath, readerOptions)) { + assertThat(TestORCSchemaUtil.hasIds(reader.getSchema())).isFalse(); + } + } + + private void runTypePromotionCheck( + FileFormat fileFormat, Type fromType, Type toType, Function promoteValue) + throws IOException { + String columnName = "col"; + Schema writeSchema = new Schema(Types.NestedField.required(1, columnName, fromType)); + Schema readSchema = new Schema(Types.NestedField.required(1, columnName, toType)); + + List genericRecords = RandomGenericData.generate(writeSchema, 10, 1L); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + readAndAssertEngineRecords( + fileFormat, + readSchema, + genericRecords, + record -> { + Record expected = GenericRecord.create(readSchema); + expected.setField(columnName, promoteValue.apply(record.getField(columnName))); + return expected; + }); + } + + private void readAndAssertEngineRecords( + FileFormat fileFormat, + Schema readSchema, + List sourceRecords, + Function converter) + throws IOException { + List expectedGenericRecords = sourceRecords.stream().map(converter).toList(); + InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile(); + List readRecords; + try (CloseableIterable reader = + FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) + .project(readSchema) + .build()) { + readRecords = ImmutableList.copyOf(reader); + } + + 
assertThat(readRecords).hasSize(expectedGenericRecords.size()); + assertEquals( + readSchema, convertToEngineRecords(expectedGenericRecords, readSchema), readRecords); + } } diff --git a/data/src/test/java/org/apache/iceberg/data/DataGenerators.java b/data/src/test/java/org/apache/iceberg/data/DataGenerators.java index 325a8b191b07..390c0949cb72 100644 --- a/data/src/test/java/org/apache/iceberg/data/DataGenerators.java +++ b/data/src/test/java/org/apache/iceberg/data/DataGenerators.java @@ -64,4 +64,16 @@ public Schema schema() { return schema; } } + + static class FloatDoubleSchema implements DataGenerator { + private final Schema schema = + new Schema( + Types.NestedField.required(1, "col_float", Types.FloatType.get()), + Types.NestedField.required(2, "col_double", Types.DoubleType.get())); + + @Override + public Schema schema() { + return schema; + } + } } diff --git a/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java b/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java index 3a7fec6962bb..cc318b2f53b1 100644 --- a/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java +++ b/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java @@ -68,6 +68,11 @@ protected boolean supportsUnknown() { return true; } + @Override + protected boolean supportsRowLineage() { + return true; + } + /** Orc writers don't have notion of non-null / required fields. 
*/ @Override protected boolean allowsWritingNullValuesForRequiredFields() { @@ -250,13 +255,15 @@ private void writeAndValidateRecords(Schema schema, List expected) throw try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) .project(schema) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(schema, fileSchema)) + .createReaderFunc( + fileSchema -> GenericOrcReader.buildReader(schema, fileSchema, ID_TO_CONSTANT)) .build()) { rows = Lists.newArrayList(reader); } for (int i = 0; i < expected.size(); i += 1) { - DataTestHelpers.assertEquals(schema.asStruct(), expected.get(i), rows.get(i)); + DataTestHelpers.assertEquals( + schema.asStruct(), expected.get(i), rows.get(i), ID_TO_CONSTANT, i); } } } diff --git a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsCatalog.java b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsCatalog.java index 07ad68365837..bb8150d16dca 100644 --- a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsCatalog.java +++ b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsCatalog.java @@ -59,6 +59,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.io.ByteStreams; import org.apache.iceberg.util.LocationUtil; +import org.apache.iceberg.util.PropertyUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -86,6 +87,7 @@ public class EcsCatalog extends BaseMetastoreCatalog private FileIO fileIO; private CloseableGroup closeableGroup; private Map catalogProperties; + private boolean uniqueTableLocation; /** * No-arg constructor to load the catalog dynamically. 
@@ -102,6 +104,12 @@ public void initialize(String name, Map properties) { !Strings.isNullOrEmpty(inputWarehouseLocation), "Cannot initialize EcsCatalog because warehousePath must not be null or empty"); + this.uniqueTableLocation = + PropertyUtil.propertyAsBoolean( + properties, + CatalogProperties.UNIQUE_TABLE_LOCATION, + CatalogProperties.UNIQUE_TABLE_LOCATION_DEFAULT); + this.catalogName = name; this.warehouseLocation = new EcsURI(LocationUtil.stripTrailingSlash(inputWarehouseLocation)); this.client = DellClientFactories.from(properties).ecsS3(); @@ -136,8 +144,8 @@ protected TableOperations newTableOps(TableIdentifier tableIdentifier) { @Override protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { - return String.format( - "%s/%s", namespacePrefix(tableIdentifier.namespace()), tableIdentifier.name()); + String tableLocation = LocationUtil.tableLocation(tableIdentifier, uniqueTableLocation); + return String.format("%s/%s", namespacePrefix(tableIdentifier.namespace()), tableLocation); } /** Iterate all table objects with the namespace prefix. 
*/ diff --git a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsCatalog.java b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsCatalog.java index 4714d37d72b9..82549f1eccd9 100644 --- a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsCatalog.java +++ b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsCatalog.java @@ -55,11 +55,17 @@ public class TestEcsCatalog { @BeforeEach public void before() { - ecsCatalog = new EcsCatalog(); + ecsCatalog = createCatalog("test", ImmutableMap.of()); + } + + private EcsCatalog createCatalog(String name, Map additionalProperties) { + EcsCatalog catalog = new EcsCatalog(); Map properties = Maps.newHashMap(); properties.put(CatalogProperties.WAREHOUSE_LOCATION, new EcsURI(rule.bucket(), "").location()); properties.putAll(rule.clientProperties()); - ecsCatalog.initialize("test", properties); + properties.putAll(additionalProperties); + catalog.initialize(name, properties); + return catalog; } @AfterEach @@ -172,6 +178,30 @@ public void testRenameTable() { .isTrue(); } + @Test + public void testCreateTableInUniqueLocation() throws Exception { + try (EcsCatalog catalog = + createCatalog( + "unique_location_catalog", + ImmutableMap.of(CatalogProperties.UNIQUE_TABLE_LOCATION, "true"))) { + + Namespace ns = Namespace.of("a"); + TableIdentifier tableIdent = TableIdentifier.of(ns, "t1"); + TableIdentifier renamedIdent = TableIdentifier.of(ns, "t2"); + + catalog.createNamespace(ns); + catalog.createTable(tableIdent, SCHEMA); + catalog.renameTable(tableIdent, renamedIdent); + + Table table = catalog.createTable(tableIdent, SCHEMA); + Table renamedTable = catalog.loadTable(renamedIdent); + + assertThat(table.location()) + .as("Should have a different table location") + .isNotEqualTo(renamedTable.location()); + } + } + @Test public void testRegisterTable() { TableIdentifier identifier = TableIdentifier.of("a", "t1"); diff --git a/deploy.gradle b/deploy.gradle index 740d0056273b..65836bf1b3f1 100644 --- a/deploy.gradle +++ 
b/deploy.gradle @@ -75,7 +75,6 @@ subprojects { } else if (isOpenApi) { artifact testJar artifact testFixturesJar - artifact shadowJar } else { if (tasks.matching({task -> task.name == 'shadowJar'}).isEmpty()) { from components.java diff --git a/dev/.rat-excludes b/dev/.rat-excludes index 52a800723598..f94ef5bf8988 100644 --- a/dev/.rat-excludes +++ b/dev/.rat-excludes @@ -29,3 +29,4 @@ sitemap.xml .python-version **/*_index.md **/.venv/** +**/runtime-deps.txt diff --git a/docker/iceberg-rest-fixture/README.md b/docker/iceberg-rest-fixture/README.md index 3805cc2468cb..5e02a2b4712a 100644 --- a/docker/iceberg-rest-fixture/README.md +++ b/docker/iceberg-rest-fixture/README.md @@ -23,6 +23,39 @@ For converting different catalog implementations into a rest one. Adapter for wrapping the existing catalog backends over REST. +## Configuration + +All configuration is provided via environment variables. + +### Backend catalog properties + +Catalog properties can be set via `CATALOG_*` environment variables. The +`CATALOG_` prefix is stripped; single underscores become dots (`.`); double +underscores become dashes (`-`). Names are lowercased. + +| Env var | Catalog property | +|---|---| +| `CATALOG_CATALOG_NAME` | `catalog.name` | +| `CATALOG_WAREHOUSE` | `warehouse` | +| `CATALOG_URI` | `uri` | +| `CATALOG_CATALOG__IMPL` | `catalog-impl` | +| `CATALOG_IO__IMPL` | `io-impl` | +| `CATALOG_JDBC_USER` | `jdbc.user` | + +If `catalog-impl` and `uri` are unset, the fixture defaults to an in-memory +SQLite `JdbcCatalog`. + +### Catalog name + +By default, the fixture serves a catalog named `rest_backend`. 
To match a +name expected by a specific engine (for example, a catalog created via Trino +or PyIceberg), override the `catalog.name` property: + +```bash +docker run -e CATALOG_CATALOG_NAME=mycatalog -p 8181:8181 apache/iceberg-rest-fixture +``` + + ## Build the Docker Image When making changes to the local files and test them out, you can build the image locally: diff --git a/docs/docs/aws.md b/docs/docs/aws.md index 1fe867401296..fba4921f73a5 100644 --- a/docs/docs/aws.md +++ b/docs/docs/aws.md @@ -288,7 +288,7 @@ This feature requires the following lock related catalog properties: 2. Set `lock.table` as the DynamoDB table name you would like to use. If the lock table with the given name does not exist in DynamoDB, a new table is created with billing mode set as [pay-per-request](https://aws.amazon.com/blogs/aws/amazon-dynamodb-on-demand-no-capacity-planning-and-pay-per-request-pricing). Other lock related catalog properties can also be used to adjust locking behaviors such as heartbeat interval. -For more details, please refer to [Lock catalog properties](configuration.md#lock-catalog-properties). +For more details, please refer to [Lock catalog properties](catalog-properties.md#lock-catalog-properties). ## S3 FileIO @@ -705,10 +705,12 @@ For more details of configuration, see sections [URL Connection HTTP Client Conf Configurations for the HTTP client can be set via catalog properties. Below is an overview of available configurations: -| Property | Default | Description | -|----------------------------|---------|------------------------------------------------------------------------------------------------------------| -| http-client.type | apache | Types of HTTP Client.
`urlconnection`: URL Connection HTTP Client
`apache`: Apache HTTP Client | -| http-client.proxy-endpoint | null | An optional proxy endpoint to use for the HTTP client. | +| Property | Default | Description | +|---------------------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| http-client.type | apache | Types of HTTP Client.
`urlconnection`: URL Connection HTTP Client
`apache`: Apache HTTP Client | +| http-client.proxy-endpoint | null | An optional proxy endpoint to use for the HTTP client. | +| http-client.proxy-use-system-property-values | null, enabled by default | An optional `true/false` setting that controls whether proxy configuration is read from Java system properties (`http.proxyHost`, `http.proxyPort`, `http.nonProxyHosts`, etc.). | +| http-client.proxy-use-environment-variable-values | null, enabled by default | An optional `true/false` setting that controls whether proxy configuration is read from environment variables (`HTTP_PROXY`, `HTTPS_PROXY`, `NO_PROXY`, etc.). | #### URL Connection HTTP Client Configurations diff --git a/docs/docs/catalog-properties.md b/docs/docs/catalog-properties.md new file mode 100644 index 000000000000..5afae0b98ae2 --- /dev/null +++ b/docs/docs/catalog-properties.md @@ -0,0 +1,167 @@ +--- +title: "Catalog properties" +--- + + +# Catalog properties + +## Common properties + +Iceberg catalogs support using catalog properties to configure catalog behaviors. Here is a list of commonly used catalog properties: + +| Property | Default | Description | +| --------------------------------- | ------------------ | ------------------------------------------------------ | +| catalog-impl | null | a custom `Catalog` implementation to use by an engine | +| io-impl | null | a custom `FileIO` implementation to use in a catalog | +| warehouse | null | the root path of the data warehouse | +| uri | null | a URI string, such as Hive metastore URI | +| clients | 2 | client pool size | +| cache-enabled | true | Whether to cache catalog entries | +| cache.expiration-interval-ms | 30000 | How long catalog entries are locally cached, in milliseconds; 0 disables caching, negative values disable expiration | +| metrics-reporter-impl | org.apache.iceberg.metrics.LoggingMetricsReporter | Custom `MetricsReporter` implementation to use in a catalog. 
See the [Metrics reporting](metrics-reporting.md) section for additional details | +| unique-table-location | false | Whether to use a unique location for new tables | +| encryption.kms-impl | null | a custom `KeyManagementClient` implementation to use in a catalog for interactions with KMS (key management service). See the [Encryption](encryption.md) document for additional details | + +`HadoopCatalog` and `HiveCatalog` can access the properties in their constructors. +Any other custom catalog can access the properties by implementing `Catalog.initialize(catalogName, catalogProperties)`. +The properties can be manually constructed or passed in from a compute engine like Spark or Flink. +Spark uses its session properties as catalog properties, see more details in the [Spark configuration](spark-configuration.md#catalog-configuration) section. +Flink passes in catalog properties through `CREATE CATALOG` statement, see more details in the [Flink](flink.md#adding-catalogs) section. + +## REST catalog properties + +The following properties configure the behavior of the REST catalog client. + +| Property | Default | Description | +|---------------------------------------|-------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `snapshot-loading-mode` | `ALL` | Controls how snapshots are loaded from the REST server. Supported values: `ALL` (load all snapshots), `REFS` (load only referenced snapshots). | +| `rest-metrics-reporting-enabled` | `true` | Whether to enable metrics reporting to the REST server. | +| `view-endpoints-supported` | `false` | For backwards compatibility with older REST servers. Set to `true` if the server supports view endpoints but doesn't send the `endpoints` field in the ConfigResponse. 
| +| `rest-page-size` | null | The page size to use when listing namespaces, tables, or other paginated resources. | +| `namespace-separator` | `%1F` | The separator character used for namespace levels when communicating with the REST server. | +| `scan-planning-mode` | `CLIENT` | Controls where scan planning is performed. Supported values: `CLIENT` (client-side planning), `SERVER` (server-side planning). Can be overridden per-table by the server in LoadTableResponse. | + +### Table cache properties + +The following properties configure the table cache used for freshness-aware table loading. Note, this cache is different from the one that can be configured at catalog level in general. + +| Property | Default | Description | +|------------------------------------------|-------------------|----------------------------------------------------------------------------------------| +| `rest-table-cache.expire-after-write-ms` | `300000` (5 min) | Time in milliseconds after which cached table entries expire. | +| `rest-table-cache.max-entries` | `100` | Maximum number of table entries to cache. | + +### Auth properties + +The following catalog properties configure authentication for the REST catalog. +They support Basic, OAuth2, SigV4, and Google authentication. + +#### REST auth properties + +| Property | Default | Description | +|--------------------------------------|------------------|-------------------------------------------------------------------------------------------------------------------| +| `rest.auth.type` | `none` | Authentication mechanism for REST catalog access. Supported values: `none`, `basic`, `oauth2`, `sigv4`, `google`. | +| `rest.auth.basic.username` | null | Username for Basic authentication. Required if `rest.auth.type` = `basic`. | +| `rest.auth.basic.password` | null | Password for Basic authentication. Required if `rest.auth.type` = `basic`. 
| +| `rest.auth.sigv4.delegate-auth-type` | `oauth2` | Auth type to delegate to after `sigv4` signing. | + +#### OAuth2 auth properties +Required and optional properties to include while using `oauth2` authentication + +| Property | Default | Description | +|-------------------------|-------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `token` | null | A Bearer token to interact with the server. Either `token` or `credential` is required. | +| `credential` | null | Credential string in the form of `client_id:client_secret` to exchange for a token in the OAuth2 client credentials flow. Either `token` or `credential` is required. | +| `oauth2-server-uri` | `v1/oauth/tokens` | OAuth2 token endpoint URI. Required if the REST catalog is not the OAuth2 authentication server. | +| `token-expires-in-ms` | 3600000 (1 hour) | Time in milliseconds after which a bearer token is considered expired. Used to decide when to refresh or re-exchange a token. | +| `token-refresh-enabled` | true | Determines whether tokens are automatically refreshed when expiration details are available. | +| `token-exchange-enabled`| true | Determines whether to use the token exchange flow to acquire new tokens. Disabling this will allow fallback to the client credential flow. | +| `scope` | `catalog` | Additional scope for `oauth2`. | +| `audience` | null | Optional param to specify token `audience` | +| `resource` | null | Optional param to specify `resource` | + +#### Google auth properties +Required and optional properties to include while using `google` authentication + +| Property | Default | Description | +|----------------------------|--------------------------------------------------|--------------------------------------------------| +| `gcp.auth.credentials-path`| Application Default Credentials (ADC) | Path to a service account JSON key file. 
| +| `gcp.auth.credentials-json` | Application Default Credentials (ADC) | JSON string of a service account credential. | +| `gcp.auth.scopes` | `https://www.googleapis.com/auth/cloud-platform` | Comma-separated list of OAuth scopes to request. | + +## Lock catalog properties + +Here are the catalog properties related to locking. They are used by some catalog implementations to control the locking behavior during commits. + +| Property | Default | Description | +| --------------------------------- | ------------------ | ------------------------------------------------------ | +| lock-impl | null | a custom implementation of the lock manager, the actual interface depends on the catalog used | +| lock.table | null | an auxiliary table for locking, such as in [AWS DynamoDB lock manager](aws.md#dynamodb-lock-manager) | +| lock.acquire-interval-ms | 5000 (5 s) | the interval to wait between each attempt to acquire a lock | +| lock.acquire-timeout-ms | 180000 (3 min) | the maximum time to try acquiring a lock | +| lock.heartbeat-interval-ms | 3000 (3 s) | the interval to wait between each heartbeat after acquiring a lock | +| lock.heartbeat-timeout-ms | 15000 (15 s) | the maximum time without a heartbeat to consider a lock expired | + +## Hadoop configuration + +### HadoopTables Lock Configuration + +When using `HadoopTables` (tables without a catalog), lock properties from the [Lock catalog properties](#lock-catalog-properties) section can be configured by prefixing them with `iceberg.tables.hadoop.`. This ensures atomic commits on file systems like S3 that lack native write mutual exclusion. + +!!! info + To use DynamoDB as a lock manager with `HadoopTables`, set `iceberg.tables.hadoop.lock-impl` to `org.apache.iceberg.aws.dynamodb.DynamoDbLockManager` and `iceberg.tables.hadoop.lock.table` to your DynamoDB table name. See [DynamoDB Lock Manager](aws.md#dynamodb-lock-manager) for more details. 
+ +### Hive Metastore Configuration + +The following properties from the Hadoop configuration are used by the Hive Metastore connector. +The HMS table locking is a 2-step process: + +1. Lock Creation: Create lock in HMS and queue for acquisition +2. Lock Check: Check if lock successfully acquired + +| Property | Default | Description | +|-------------------------------------------|-----------------|------------------------------------------------------------------------------| +| iceberg.hive.client-pool-size | 5 | The size of the Hive client pool when tracking tables in HMS | +| iceberg.hive.lock-creation-timeout-ms | 180000 (3 min) | Maximum time in milliseconds to create a lock in the HMS | +| iceberg.hive.lock-creation-min-wait-ms | 50 | Minimum time in milliseconds between retries of creating the lock in the HMS | +| iceberg.hive.lock-creation-max-wait-ms | 5000 | Maximum time in milliseconds between retries of creating the lock in the HMS | +| iceberg.hive.lock-timeout-ms | 180000 (3 min) | Maximum time in milliseconds to acquire a lock | +| iceberg.hive.lock-check-min-wait-ms | 50 | Minimum time in milliseconds between checking the acquisition of the lock | +| iceberg.hive.lock-check-max-wait-ms | 5000 | Maximum time in milliseconds between checking the acquisition of the lock | +| iceberg.hive.lock-heartbeat-interval-ms | 240000 (4 min) | The heartbeat interval for the HMS locks. 
| +| iceberg.hive.metadata-refresh-max-retries | 2 | Maximum number of retries when the metadata file is missing | +| iceberg.hive.table-level-lock-evict-ms | 600000 (10 min) | The timeout in milliseconds after which the JVM table-level lock is evicted | +| iceberg.engine.hive.lock-enabled | true | Use HMS locks to ensure atomicity of commits | + +Note: `iceberg.hive.lock-check-max-wait-ms` and `iceberg.hive.lock-heartbeat-interval-ms` should be less than the [transaction timeout](https://cwiki.apache.org/confluence/display/Hive/Configuration+Properties#ConfigurationProperties-hive.txn.timeout) +of the Hive Metastore (`hive.txn.timeout` or `metastore.txn.timeout` in the newer versions). Otherwise, the heartbeats on the lock (which happen during the lock checks) would end up expiring in the +Hive Metastore before the lock is retried from Iceberg. + +Warning: Setting `iceberg.engine.hive.lock-enabled`=`false` will cause HiveCatalog to commit to tables without using Hive locks. +This should only be set to `false` if all of the following conditions are met: + +- [HIVE-26882](https://issues.apache.org/jira/browse/HIVE-26882) +is available on the Hive Metastore server +- [HIVE-28121](https://issues.apache.org/jira/browse/HIVE-28121) +is available on the Hive Metastore server, if it is backed by MySQL or MariaDB +- All other HiveCatalogs committing to tables that this HiveCatalog commits to are also on Iceberg 1.3 or later +- All other HiveCatalogs committing to tables that this HiveCatalog commits to have also disabled Hive locks on commit. + +**Failing to ensure these conditions risks corrupting the table.** + +Even with `iceberg.engine.hive.lock-enabled` set to `false`, a HiveCatalog can still use locks for individual tables by setting the table property `engine.hive.lock-enabled`=`true`. +This is useful in the case where other HiveCatalogs cannot be upgraded and set to commit without using Hive locks.
diff --git a/docs/docs/configuration.md b/docs/docs/configuration.md index f12bcea6afd5..17bf1f8ac0a1 100644 --- a/docs/docs/configuration.md +++ b/docs/docs/configuration.md @@ -45,10 +45,13 @@ Iceberg tables support table properties to configure table behavior, like the de | write.delete.format.default | data file format | Default delete file format for the table; parquet, avro, or orc | | write.parquet.row-group-size-bytes | 134217728 (128 MB) | Parquet row group size | | write.parquet.page-size-bytes | 1048576 (1 MB) | Parquet page size | +| write.parquet.page-version | v1 | Parquet data page version: v1 (DataPage V1) or v2 (DataPage V2) | | write.parquet.page-row-limit | 20000 | Parquet page row limit | | write.parquet.dict-size-bytes | 2097152 (2 MB) | Parquet dictionary page size | | write.parquet.compression-codec | zstd | Parquet compression codec: zstd, brotli, lz4, gzip, snappy, uncompressed | | write.parquet.compression-level | null | Parquet compression level | +| write.parquet.shred-variants | false | When true, variant columns are written with shredded Parquet encoding for improved query performance | +| write.parquet.variant-inference-buffer-size | 100 | Number of rows to buffer for schema inference when variant shredding is enabled | | write.parquet.bloom-filter-enabled.column.col1 | (not set) | Hint to parquet to write a bloom filter for the column: 'col1' | | write.parquet.bloom-filter-max-bytes | 1048576 (1 MB) | The maximum number of bytes for a bloom filter bitset | | write.parquet.bloom-filter-fpp.column.col1 | 0.01 | The false positive probability for a bloom filter applied to 'col1' (must > 0.0 and < 1.0) | @@ -140,126 +143,3 @@ Informational properties can be set to provide additional context about a table. 
| Property | Default | Description | | --------------------------------------------- | -------- | ------------------------------------------------------------- | | compatibility.snapshot-id-inheritance.enabled | false | Enables committing snapshots without explicit snapshot IDs (always true if the format version is > 1) | - -## Catalog properties - -Iceberg catalogs support using catalog properties to configure catalog behaviors. Here is a list of commonly used catalog properties: - -| Property | Default | Description | -| --------------------------------- | ------------------ | ------------------------------------------------------ | -| catalog-impl | null | a custom `Catalog` implementation to use by an engine | -| io-impl | null | a custom `FileIO` implementation to use in a catalog | -| warehouse | null | the root path of the data warehouse | -| uri | null | a URI string, such as Hive metastore URI | -| clients | 2 | client pool size | -| cache-enabled | true | Whether to cache catalog entries | -| cache.expiration-interval-ms | 30000 | How long catalog entries are locally cached, in milliseconds; 0 disables caching, negative values disable expiration | -| metrics-reporter-impl | org.apache.iceberg.metrics.LoggingMetricsReporter | Custom `MetricsReporter` implementation to use in a catalog. See the [Metrics reporting](metrics-reporting.md) section for additional details | -| encryption.kms-impl | null | a custom `KeyManagementClient` implementation to use in a catalog for interactions with KMS (key management service). See the [Encryption](encryption.md) document for additional details | - -`HadoopCatalog` and `HiveCatalog` can access the properties in their constructors. -Any other custom catalog can access the properties by implementing `Catalog.initialize(catalogName, catalogProperties)`. -The properties can be manually constructed or passed in from a compute engine like Spark or Flink. 
-Spark uses its session properties as catalog properties, see more details in the [Spark configuration](spark-configuration.md#catalog-configuration) section. -Flink passes in catalog properties through `CREATE CATALOG` statement, see more details in the [Flink](flink.md#adding-catalogs) section. - -### REST Catalog auth properties - -The following catalog properties configure authentication for the REST catalog. -They support Basic, OAuth2, SigV4, and Google authentication. - -#### REST auth properties - -| Property | Default | Description | -|--------------------------------------|------------------|-------------------------------------------------------------------------------------------------------------------| -| `rest.auth.type` | `none` | Authentication mechanism for REST catalog access. Supported values: `none`, `basic`, `oauth2`, `sigv4`, `google`. | -| `rest.auth.basic.username` | null | Username for Basic authentication. Required if `rest.auth.type` = `basic`. | -| `rest.auth.basic.password` | null | Password for Basic authentication. Required if `rest.auth.type` = `basic`. | -| `rest.auth.sigv4.delegate-auth-type` | `oauth2` | Auth type to delegate to after `sigv4` signing. | - -#### OAuth2 auth properties -Required and optional properties to include while using `oauth2` authentication - -| Property | Default | Description | -|-------------------------|-------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `token` | null | A Bearer token to interact with the server. Either `token` or `credential` is required. | -| `credential` | null | Credential string in the form of `client_id:client_secret` to exchange for a token in the OAuth2 client credentials flow. Either `token` or `credential` is required. | -| `oauth2-server-uri` | `v1/oauth/tokens` | OAuth2 token endpoint URI. 
Required if the REST catalog is not the OAuth2 authentication server. | -| `token-expires-in-ms` | 3600000 (1 hour) | Time in milliseconds after which a bearer token is considered expired. Used to decide when to refresh or re-exchange a token. | -| `token-refresh-enabled` | true | Determines whether tokens are automatically refreshed when expiration details are available. | -| `token-exchange-enabled`| true | Determines whether to use the token exchange flow to acquire new tokens. Disabling this will allow fallback to the client credential flow. | -| `scope` | `catalog` | Additional scope for `oauth2`. | -| `audience` | null | Optional param to specify token `audience` | -| `resource` | null | Optional param to specify `resource` | - -#### Google auth properties -Required and optional properties to include while using `google` authentication - -| Property | Default | Description | -|----------------------------|--------------------------------------------------|--------------------------------------------------| -| `gcp.auth.credentials-path`| Application Default Credentials (ADC) | Path to a service account JSON key file. | -| `gcp.auth.credentials-json` | Application Default Credentials (ADC) | JSON string of a service account credential. | -| `gcp.auth.scopes` | `https://www.googleapis.com/auth/cloud-platform` | Comma-separated list of OAuth scopes to request. | - -### Lock catalog properties - -Here are the catalog properties related to locking. They are used by some catalog implementations to control the locking behavior during commits. 
- -| Property | Default | Description | -| --------------------------------- | ------------------ | ------------------------------------------------------ | -| lock-impl | null | a custom implementation of the lock manager, the actual interface depends on the catalog used | -| lock.table | null | an auxiliary table for locking, such as in [AWS DynamoDB lock manager](aws.md#dynamodb-lock-manager) | -| lock.acquire-interval-ms | 5000 (5 s) | the interval to wait between each attempt to acquire a lock | -| lock.acquire-timeout-ms | 180000 (3 min) | the maximum time to try acquiring a lock | -| lock.heartbeat-interval-ms | 3000 (3 s) | the interval to wait between each heartbeat after acquiring a lock | -| lock.heartbeat-timeout-ms | 15000 (15 s) | the maximum time without a heartbeat to consider a lock expired | - -## Hadoop configuration - -### HadoopTables Lock Configuration - -When using `HadoopTables` (tables without a catalog), lock properties from the [Lock catalog properties](#lock-catalog-properties) section can be configured by prefixing them with `iceberg.tables.hadoop.`. This ensures atomic commits on file systems like S3 that lack native write mutual exclusion. - -!!! info - To use DynamoDB as a lock manager with `HadoopTables`, set `iceberg.tables.hadoop.lock-impl` to `org.apache.iceberg.aws.dynamodb.DynamoDbLockManager` and `iceberg.tables.hadoop.lock.table` to your DynamoDB table name. See [DynamoDB Lock Manager](aws.md#dynamodb-lock-manager) for more details. - -### Hive Metastore Configuration - -The following properties from the Hadoop configuration are used by the Hive Metastore connector. -The HMS table locking is a 2-step process: - -1. Lock Creation: Create lock in HMS and queue for acquisition -2. 
Lock Check: Check if lock successfully acquired - -| Property | Default | Description | -|-------------------------------------------|-----------------|------------------------------------------------------------------------------| -| iceberg.hive.client-pool-size | 5 | The size of the Hive client pool when tracking tables in HMS | -| iceberg.hive.lock-creation-timeout-ms | 180000 (3 min) | Maximum time in milliseconds to create a lock in the HMS | -| iceberg.hive.lock-creation-min-wait-ms | 50 | Minimum time in milliseconds between retries of creating the lock in the HMS | -| iceberg.hive.lock-creation-max-wait-ms | 5000 | Maximum time in milliseconds between retries of creating the lock in the HMS | -| iceberg.hive.lock-timeout-ms | 180000 (3 min) | Maximum time in milliseconds to acquire a lock | -| iceberg.hive.lock-check-min-wait-ms | 50 | Minimum time in milliseconds between checking the acquisition of the lock | -| iceberg.hive.lock-check-max-wait-ms | 5000 | Maximum time in milliseconds between checking the acquisition of the lock | -| iceberg.hive.lock-heartbeat-interval-ms | 240000 (4 min) | The heartbeat interval for the HMS locks. | -| iceberg.hive.metadata-refresh-max-retries | 2 | Maximum number of retries when the metadata file is missing | -| iceberg.hive.table-level-lock-evict-ms | 600000 (10 min) | The timeout for the JVM table lock is | -| iceberg.engine.hive.lock-enabled | true | Use HMS locks to ensure atomicity of commits | - -Note: `iceberg.hive.lock-check-max-wait-ms` and `iceberg.hive.lock-heartbeat-interval-ms` should be less than the [transaction timeout](https://cwiki.apache.org/confluence/display/Hive/Configuration+Properties#ConfigurationProperties-hive.txn.timeout) -of the Hive Metastore (`hive.txn.timeout` or `metastore.txn.timeout` in the newer versions). Otherwise, the heartbeats on the lock (which happens during the lock checks) would end up expiring in the -Hive Metastore before the lock is retried from Iceberg. 
- -Warn: Setting `iceberg.engine.hive.lock-enabled`=`false` will cause HiveCatalog to commit to tables without using Hive locks. -This should only be set to `false` if all following conditions are met: - -- [HIVE-26882](https://issues.apache.org/jira/browse/HIVE-26882) -is available on the Hive Metastore server -- [HIVE-28121](https://issues.apache.org/jira/browse/HIVE-28121) -is available on the Hive Metastore server, if it is backed by MySQL or MariaDB -- All other HiveCatalogs committing to tables that this HiveCatalog commits to are also on Iceberg 1.3 or later -- All other HiveCatalogs committing to tables that this HiveCatalog commits to have also disabled Hive locks on commit. - -**Failing to ensure these conditions risks corrupting the table.** - -Even with `iceberg.engine.hive.lock-enabled` set to `false`, a HiveCatalog can still use locks for individual tables by setting the table property `engine.hive.lock-enabled`=`true`. -This is useful in the case where other HiveCatalogs cannot be upgraded and set to commit without using Hive locks. diff --git a/docs/docs/custom-catalog.md b/docs/docs/custom-catalog.md index f0a6b5718a6c..d30a629401aa 100644 --- a/docs/docs/custom-catalog.md +++ b/docs/docs/custom-catalog.md @@ -151,7 +151,7 @@ public class CustomCatalog extends BaseMetastoreCatalog { Catalog implementations can be dynamically loaded in most compute engines. For Spark and Flink, you can specify the `catalog-impl` catalog property to load it. -Read the [Configuration](configuration.md#catalog-properties) section for more details. +Read the [Configuration](catalog-properties.md) section for more details. For MapReduce, implement `org.apache.iceberg.mr.CatalogLoader` and set Hadoop property `iceberg.mr.catalog.loader.class` to load it. If your catalog must read Hadoop configuration to access certain environment properties, make your catalog implement `org.apache.hadoop.conf.Configurable`. 
@@ -199,7 +199,7 @@ public class CustomFileIO implements FileIO { If you are already implementing your own catalog, you can implement `TableOperations.io()` to use your custom `FileIO`. In addition, custom `FileIO` implementations can also be dynamically loaded in `HadoopCatalog` and `HiveCatalog` by specifying the `io-impl` catalog property. -Read the [Configuration](configuration.md#catalog-properties) section for more details. +Read the [Configuration](catalog-properties.md) section for more details. If your `FileIO` must read Hadoop configuration to access certain environment properties, make your `FileIO` implement `org.apache.hadoop.conf.Configurable`. ### Custom location provider implementation diff --git a/docs/docs/flink-ddl.md b/docs/docs/flink-ddl.md index 756256f0df4f..0a9b26712235 100644 --- a/docs/docs/flink-ddl.md +++ b/docs/docs/flink-ddl.md @@ -45,6 +45,23 @@ The following properties can be set if using the Hive catalog: * `hive-conf-dir`: Path to a directory containing a `hive-site.xml` configuration file which will be used to provide custom Hive configuration values. The value of `hive.metastore.warehouse.dir` from `/hive-site.xml` (or hive configure file from classpath) will be overwritten with the `warehouse` value if setting both `hive-conf-dir` and `warehouse` when creating iceberg catalog. * `hadoop-conf-dir`: Path to a directory containing `core-site.xml` and `hdfs-site.xml` configuration files which will be used to provide custom Hadoop configuration values. +!!! warning "Hive Catalog Limitation" + The Hive Metastore (HMS) validates schema changes by comparing column types **positionally** + (`hive.metastore.disallow.incompatible.col.type.changes`, default `true`). When using a Hive catalog, + schema evolution operations that change column positions — such as dropping a non-last column or + reordering columns — may fail regardless of which engine performs the change (Spark, Flink Java API, etc.). 
+ + To work around this, disable the HMS schema compatibility check by setting + `hive.metastore.disallow.incompatible.col.type.changes=false`: + + - **Remote HMS:** Set this property in the HMS server's `hive-site.xml`. + - **Embedded HMS:** Add the equivalent property to the Hive catalog configuration. + + **Trade-off:** After disabling this check, the Hive engine may no longer be able to read the table + correctly due to the schema mismatch in the Hive Metastore. Iceberg-aware engines (Spark, Flink, + Trino, etc.) will continue to work correctly, as they read schema from Iceberg metadata rather + than the Hive Metastore. + #### Hadoop catalog Iceberg also supports a directory-based catalog in HDFS that can be configured using `'catalog-type'='hadoop'`: diff --git a/docs/docs/flink-writes.md b/docs/docs/flink-writes.md index 3fef3a1bf3bf..03795b5beed0 100644 --- a/docs/docs/flink-writes.md +++ b/docs/docs/flink-writes.md @@ -207,6 +207,10 @@ They should have the following key-value tags. | dataFilesSizeHistogram | Histogram | Histogram distribution of data file sizes (in bytes). | | deleteFilesSizeHistogram | Histogram | Histogram distribution of delete file sizes (in bytes). | +The `Histogram` metrics above require `org.apache.flink:flink-metrics-dropwizard` on the classpath, +which is not shipped by Flink by default. Please add this artifact to your classpath to see histogram metrics. +If not present, histogram metrics will be missing. All other metric types will continue to get published. + Committer metrics are added under the sub group of `IcebergFilesCommitter`. They should have the following key-value tags. @@ -483,7 +487,7 @@ We need the following information (DynamicRecord) for every record: | `Schema` | The schema of the record. | | `Spec` | The expected partitioning specification for the record. | | `RowData` | The actual row data to be written. | -| `DistributionMode` | The distribution mode for writing the record (currently supports NONE or HASH). 
| +| `DistributionMode` | The distribution mode for writing the record (NONE, HASH or `null`). When `null`, the record won't be shuffled at all. | | `Parallelism` | The maximum number of parallel writers for a given table/branch/schema/spec (WriteTarget). | | `UpsertMode` | Overrides this table's write.upsert.enabled (optional). | | `EqualityFields` | The equality fields for the table(optional). | @@ -547,6 +551,30 @@ The Dynamic Iceberg Flink Sink is configured using the Builder pattern. Here are | `tableCreator(TableCreator creator)` | When DynamicIcebergSink creates new Iceberg tables, allows overriding how tables are created - setting custom table properties and location based on the table name. | | `dropUnusedColumns(boolean enabled)` | When enabled, drops all columns from the current table schema which are not contained in the input schema (see the caveats above on dropping columns). | +### Distribution Modes + +The `DistributionMode` set on each `DynamicRecord` controls how that record is routed from the processor to the writer: + +| Mode | Behavior | +|--------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `NONE` | Records are distributed across writer subtasks in a round-robin fashion (or by equality fields if set). | +| `HASH` | Records are distributed by partition key (partitioned tables) or equality fields (unpartitioned tables). Ensures that records for the same partition are handled by the same writer subtask. | +| `null` | Forward mode: bypasses distribution entirely and sends records directly via a forward edge (see below). | + +#### Forward Mode + +Using the `DynamicRecord` constructor overload without `distributionMode` parameter bypasses distribution entirely. 
This is designed for high-throughput pipelines where every partition already has a large volume of data and the serialization and network shuffle cost is prohibitive. Records are sent directly from the processor to the writer using a forward edge, enabling Flink operator chaining. Table metadata updates are always performed immediately inside the processor (regardless of the `immediateTableUpdate` setting), because a dedicated table-update operator was deliberately omitted to avoid introducing extra data shuffles. + +Forward and regular records can be mixed in the same pipeline. The processor routes records to two separate sink outputs: + +- **Shuffle sink**: receives records that have a `distributionMode` set. These go through the normal distribution topology (hash/round-robin) before reaching the writer. +- **Forward sink**: receives records without a `distributionMode`. These skip distribution entirely and flow via a forward edge from the processor, allowing Flink operator chaining. Suited for high-throughput tables where avoiding shuffle overhead is critical. The sink's `writeParallelism` config does not apply to this path. + +!!! warning + +1. In the forward path, schema changes are always applied immediately because records must pass straight through via the forward edge. For the intended high-volume use case, this can cause many conflicting commits to the Iceberg catalog and temporarily delay data processing. Consider either updating the schema externally before publishing records with the new schema, or planning for a temporary disruption in throughput when a new schema is introduced from upstream. +2. Because the forward path skips distribution entirely, users are responsible for distributing the data correctly upstream before the records reach the dynamic Iceberg sink. Otherwise, writes could be unbalanced. + ### Notes - **Range distribution mode**: Currently, the dynamic sink does not support the `RANGE` distribution mode, if set, it will fall back to `HASH`.
diff --git a/docs/docs/hive.md b/docs/docs/hive.md index 0531e9b04a15..4829acfe208b 100644 --- a/docs/docs/hive.md +++ b/docs/docs/hive.md @@ -71,9 +71,9 @@ Starting from 1.8.0 Iceberg doesn't release Hive runtime connector. For Hive que with Hive 2.x and 3.x) use Hive runtime connector coming with Iceberg 1.6.1, or use Hive 4.0.0 or later which is released with embedded Iceberg integration. -### Hive 4.1.x +### Hive 4.1.x, 4.2.x -Hive 4.1.x comes with Iceberg 1.9.1 included. +Hive 4.1.x and 4.2.x come with Iceberg 1.9.1 included. ### Hive 4.0.x diff --git a/docs/docs/java-api-quickstart.md b/docs/docs/java-api-quickstart.md index 430450fc87c2..74b40feed86d 100644 --- a/docs/docs/java-api-quickstart.md +++ b/docs/docs/java-api-quickstart.md @@ -28,11 +28,11 @@ Tables are created using either a [`Catalog`](../../javadoc/{{ icebergVersion }} The Hive catalog connects to a Hive metastore to keep track of Iceberg tables. You can initialize a Hive catalog with a name and some properties. -(see: [Catalog properties](configuration.md#catalog-properties)) +(see: [Catalog properties](catalog-properties.md)) ```java -import java.util.HashMap -import java.util.Map +import java.util.HashMap; +import java.util.Map; import org.apache.iceberg.hive.HiveCatalog; diff --git a/docs/docs/metrics-reporting.md b/docs/docs/metrics-reporting.md index e019e2761fe6..4ca452b0d503 100644 --- a/docs/docs/metrics-reporting.md +++ b/docs/docs/metrics-reporting.md @@ -145,7 +145,7 @@ public class InMemoryMetricsReporter implements MetricsReporter { ### Via Catalog Configuration -The [catalog property](configuration.md#catalog-properties) `metrics-reporter-impl` allows registering a given [`MetricsReporter`](https://github.com/apache/iceberg/blob/main/api/src/main/java/org/apache/iceberg/metrics/MetricsReporter.java) by specifying its fully-qualified class name, e.g. `metrics-reporter-impl=org.apache.iceberg.metrics.InMemoryMetricsReporter`. 
+The [catalog property](catalog-properties.md) `metrics-reporter-impl` allows registering a given [`MetricsReporter`](https://github.com/apache/iceberg/blob/main/api/src/main/java/org/apache/iceberg/metrics/MetricsReporter.java) by specifying its fully-qualified class name, e.g. `metrics-reporter-impl=org.apache.iceberg.metrics.InMemoryMetricsReporter`. ### Via the Java API during Scan planning diff --git a/docs/docs/spark-configuration.md b/docs/docs/spark-configuration.md index e8e4f7e3c8c1..5972aafc3d39 100644 --- a/docs/docs/spark-configuration.md +++ b/docs/docs/spark-configuration.md @@ -80,7 +80,7 @@ Both catalogs are configured using properties nested under the catalog name. Com | spark.sql.catalog._catalog-name_.view-override._propertyKey_ | | Enforced Iceberg view property value for property key _propertyKey_, which cannot be overridden on view creation by user | | spark.sql.catalog._catalog-name_.use-nullable-query-schema | `true` or `false` | Whether to preserve fields' nullability when creating the table using CTAS and RTAS. If set to `true`, all fields will be marked as nullable. If set to `false`, fields' nullability will be preserved. The default value is `true`. Available in Spark 3.5 and above. | -Additional properties can be found in common [catalog configuration](configuration.md#catalog-properties). +Additional properties can be found in common [catalog configuration](catalog-properties.md). 
### Using catalogs @@ -191,6 +191,8 @@ val spark = SparkSession.builder() | spark.sql.iceberg.distribution-mode | See [Spark Writes](spark-writes.md#writing-distribution-modes) | Controls distribution strategy during writes | | spark.wap.id | null | [Write-Audit-Publish](branching.md#audit-branch) snapshot staging ID | | spark.wap.branch | null | WAP branch name for snapshot commit | +| spark.sql.iceberg.shred-variants | Table default | When true, variant columns are written with shredded Parquet encoding for improved query performance | +| spark.sql.iceberg.variant-inference-buffer-size | Table default | Number of rows to buffer for schema inference when variant shredding is enabled | | spark.sql.iceberg.compression-codec | Table default | Write compression codec (e.g., `zstd`, `snappy`) | | spark.sql.iceberg.compression-level | Table default | Compression level for Parquet/Avro | | spark.sql.iceberg.compression-strategy | Table default | Compression strategy for ORC | @@ -262,6 +264,8 @@ df.writeTo("catalog.db.table") | compression-strategy | Table write.orc.compression-strategy | Overrides this table's compression strategy for ORC tables for this write | | distribution-mode | See [Spark Writes](spark-writes.md#writing-distribution-modes) for defaults | Override this table's distribution mode for this write | | delete-granularity | file | Override this table's delete granularity for this write | +| shred-variants | false | Overrides this table's write.parquet.shred-variants for this write | +| variant-inference-buffer-size | 100 | Overrides this table's write.parquet.variant-inference-buffer-size for this write | CommitMetadata provides an interface to add custom metadata to a snapshot summary during a SQL execution, which can be beneficial for purposes such as auditing or change tracking. If properties start with `snapshot-property.`, then that prefix will be removed from each property. 
Here is an example: diff --git a/docs/docs/spark-ddl.md b/docs/docs/spark-ddl.md index 4d227c2db4f9..9fa6c0e7d3c7 100644 --- a/docs/docs/spark-ddl.md +++ b/docs/docs/spark-ddl.md @@ -173,6 +173,27 @@ Iceberg has full `ALTER TABLE` support in Spark 3, including: In addition, [SQL extensions](spark-configuration.md#sql-extensions) can be used to add support for partition evolution and setting a table's write order +!!! warning "Hive Catalog Limitation" + The Hive Metastore (HMS) validates schema changes by comparing column types **positionally** + (`hive.metastore.disallow.incompatible.col.type.changes`, default `true`). Any schema evolution + operation that shifts column positions will fail when using a Hive catalog. Affected operations + include: + + - `ADD COLUMN` with `FIRST` or `AFTER` clauses + - `ALTER COLUMN` with `FIRST` or `AFTER` clauses (reordering) + - `DROP COLUMN` on a non-last column + + To work around this, disable the HMS schema compatibility check by setting + `hive.metastore.disallow.incompatible.col.type.changes=false`: + + - **Remote HMS:** Set this property in the HMS server's `hive-site.xml`. + - **Embedded HMS:** Pass `--conf spark.hadoop.hive.metastore.disallow.incompatible.col.type.changes=false` when starting Spark. + + **Trade-off:** After disabling this check, the Hive engine may no longer be able to read the table + correctly due to the schema mismatch in the Hive Metastore. Iceberg-aware engines (Spark, Flink, + Trino, etc.) will continue to work correctly, as they read schema from Iceberg metadata rather + than HMS. + ### `ALTER TABLE ... RENAME TO` ```sql @@ -262,6 +283,11 @@ ALTER TABLE prod.db.sample ADD COLUMN nested.new_column bigint FIRST; ``` +!!! warning "Hive Catalog Limitation" + When using a Hive catalog, adding a column with `FIRST` or `AFTER` may fail due to HMS positional + schema validation. See the warning above for details + and workaround. + ### `ALTER TABLE ... RENAME COLUMN` Iceberg allows any field to be renamed. 
To rename a field, use `RENAME COLUMN`: @@ -305,6 +331,10 @@ ALTER TABLE prod.db.sample ALTER COLUMN col FIRST; ALTER TABLE prod.db.sample ALTER COLUMN nested.col AFTER other_col; ``` +!!! warning "Hive Catalog Limitation" + When using a Hive catalog, reordering columns may fail due to HMS positional schema validation. + See the Hive Catalog Limitation note above for details and workaround. + Nullability for a non-nullable column can be changed using `DROP NOT NULL`: ```sql @@ -326,6 +356,11 @@ ALTER TABLE prod.db.sample DROP COLUMN id; ALTER TABLE prod.db.sample DROP COLUMN point.z; ``` +!!! warning "Hive Catalog Limitation" + When using a Hive catalog, dropping a non-last column may fail due to HMS positional schema + validation. See the earlier Hive Catalog Limitation warning above for details and + workaround. + ## `ALTER TABLE` SQL extensions These commands are available in Spark 3 when using Iceberg [SQL extensions](spark-configuration.md#sql-extensions). diff --git a/docs/docs/spark-procedures.md b/docs/docs/spark-procedures.md index 45e55a9365a5..8e594caa12d4 100644 --- a/docs/docs/spark-procedures.md +++ b/docs/docs/spark-procedures.md @@ -637,6 +637,7 @@ Replace a table with an Iceberg table, loaded with the source's data files. Table schema, partitioning, properties, and location will be copied from the source table. Migrate will fail if any table partition uses an unsupported format. Supported formats are Avro, Parquet, and ORC. +Migrate will also fail if the table is bucketed, as the bucketing will not be preserved. Existing data files are added to the Iceberg table's metadata and can be read using a name-to-id mapping created from the original table schema. To leave the original table intact while testing, use [`snapshot`](#snapshot) to create new temporary table that shares source data files and schema. 
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index c1807a6b8542..8e31aba5c98b 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -65,6 +65,7 @@ nav: - Hive Migration: hive-migration.md - Delta Lake Migration: delta-lake-migration.md - Catalogs: + - Catalog properties: catalog-properties.md - AWS Glue: aws/#glue-catalog - AWS DynamoDB: aws/#dynamodb-catalog - HadoopCatalog: https://iceberg.apache.org/javadoc/nightly/org/apache/iceberg/hadoop/HadoopCatalog.html diff --git a/flink/v1.20/build.gradle b/flink/v1.20/build.gradle index 3591bf37b1a7..41f2489c8038 100644 --- a/flink/v1.20/build.gradle +++ b/flink/v1.20/build.gradle @@ -33,7 +33,8 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { implementation project(':iceberg-hive-metastore') compileOnly libs.flink120.avro - // for dropwizard histogram metrics implementation + compileOnly libs.joda.time + // dropwizard histogram metrics (optional in Flink) compileOnly libs.flink120.metrics.dropwizard compileOnly libs.flink120.streaming.java compileOnly "${libs.flink120.streaming.java.get().module}:${libs.flink120.streaming.java.get().getVersion()}:tests" @@ -83,6 +84,8 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts') + testImplementation(testFixtures(project(':iceberg-parquet'))) // By default, hive-exec is a fat/uber jar and it exports a guava library // that's really old. 
We use the core classifier to be able to override our guava @@ -169,9 +172,6 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { exclude group: 'com.google.code.findbugs', module: 'jsr305' } - // for dropwizard histogram metrics implementation - implementation libs.flink120.metrics.dropwizard - // for integration testing with the flink-runtime-jar // all of those dependencies are required because the integration test extends FlinkTestBase integrationCompileOnly project(':iceberg-api') @@ -266,4 +266,6 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/flink/v1.20/flink-runtime/LICENSE b/flink/v1.20/flink-runtime/LICENSE index 36a03cb4fcf9..d73eda0104b9 100644 --- a/flink/v1.20/flink-runtime/LICENSE +++ b/flink/v1.20/flink-runtime/LICENSE @@ -219,6 +219,94 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). + +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. 
+| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). + +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. 
+ +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles Apache Parquet. Copyright: 2014-2020 The Apache Software Foundation. @@ -227,7 +315,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Thrift. 
+This product bundles Apache Thrift (bundled by Parquet). Copyright: 2006-2010 The Apache Software Foundation. Project URL: https://thrift.apache.org/ @@ -235,55 +323,57 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Fastutil. +This product includes code from Daniel Lemire's JavaFastPFOR project (bundled by Parquet). -Copyright: 2002-2014 Sebastiano Vigna -Project URL: http://fastutil.di.unimi.it/ +Copyright: 2013 Daniel Lemire +Project URL: https://github.com/lemire/JavaFastPFOR License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache ORC. +This product bundles fastutil (bundled by Parquet). -Copyright: 2013-2020 The Apache Software Foundation. -Project URL: https://orc.apache.org/ +Copyright: 2002-2014 Sebastiano Vigna +Project URL: http://fastutil.di.unimi.it/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache Datasketches. +This product bundles Zero-Allocation Hashing (bundled by Parquet). -Project URL: https://datasketches.apache.org/ +Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache Hive. +This product bundles Apache ORC. Copyright: 2013-2020 The Apache Software Foundation. 
-Project URL: https://hive.apache.org/ +Project URL: https://orc.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Airlift Aircompressor. +This product bundles Apache Hive's Storage API (bundled by ORC). -Copyright: 2011-2020 Aircompressor authors. -Project URL: https://github.com/airlift/aircompressor +Copyright: 2008-2020 The Apache Software Foundation. +Project URL: https://hive.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Google GAX. +This product bundles Google protobuf (bundled by ORC). -Project URL: https://github.com/googleapis/gax-java +Copyright: 2008 Google Inc. +Project URL: https://developers.google.com/protocol-buffers License: BSD 3-Clause -| Copyright 2016, Google Inc. All rights reserved. -| + +| Copyright 2008 Google Inc. All rights reserved. +| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are | met: -| +| | * Redistributions of source code must retain the above copyright | notice, this list of conditions and the following disclaimer. | * Redistributions in binary form must reproduce the above @@ -293,7 +383,7 @@ License: BSD 3-Clause | * Neither the name of Google Inc. nor the names of its | contributors may be used to endorse or promote products derived from | this software without specific prior written permission. 
-| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -305,40 +395,26 @@ License: BSD 3-Clause | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +| +| Code generated by the Protocol Buffer compiler is owned by the owner +| of the input file used when generating it. This code is not +| standalone and requires a support library to be linked with it. This +| support library is itself covered by the above license. -------------------------------------------------------------------------------- -This product bundles Google Auth Library. +This product bundles Apache Datasketches. -License: BSD 3-Clause -| Copyright 2014, Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Project URL: https://datasketches.apache.org/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles Airlift Aircompressor. + +Copyright: 2011-2020 Aircompressor authors. +Project URL: https://github.com/airlift/aircompressor +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -363,6 +439,7 @@ This product bundles checkerframework checker-qual. Copyright: 2004-2020 the Checker Framework developers Project URL: https://github.com/typetools/checker-framework License: MIT + | The annotations are licensed under the MIT License. (The text of this | license appears below.) More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -401,87 +478,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google protobuf. - -Copyright: 2008 Google Inc. -Project URL: https://developers.google.com/protocol-buffers -License: BSD 3-Clause -| Copyright 2008 Google Inc. All rights reserved. 
-| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. - --------------------------------------------------------------------------------- - -This product bundles ThreeTen BP. 
- -Project URL: https://www.threeten.org/threetenbp -License: BSD 3-Clause -| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| -| All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. -| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. 
Project URL: https://www.threeten.org/threeten-extra/ License: BSD 3-Clause + +| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. | All rights reserved. | | * Redistribution and use in source and binary forms, with or without @@ -530,15 +533,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache HttpComponents (core and client). - -Copyright: 1999-2022 The Apache Software Foundation. -Project URL: https://hc.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Apache HttpComponents Client. +This product bundles and includes code from Apache HttpComponents (core/client). * retry and error handling logic in ExponentialHttpRequestRetryStrategy.java @@ -556,14 +551,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Codahale Metrics. - -Copyright: (c) 2010-2013 Coda Hale, Yammer.com, 2014-2021 Dropwizard Team -Project URL: https://github.com/dropwizard/metrics -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles RoaringBitmap. Copyright: (c) 2013-... the RoaringBitmap authors @@ -572,7 +559,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Eclipse Microprofile OpenAPI. +This product bundles Eclipse MicroProfile OpenAPI. 
Copyright: Copyright (c) 2017 Contributors to the Eclipse Foundation Project URL: https://github.com/microprofile/microprofile-open-api @@ -585,6 +572,7 @@ This product bundles Luben Zstd. Copyright: Copyright (c) 2015-present, Luben Karavelov/ All rights reserved. Project URL: https://github.com/luben/zstd-jni/ License: BSD 2-Clause + | Zstd-jni: JNI bindings to Zstd Library | | Copyright (c) 2015-present, Luben Karavelov/ All rights reserved. @@ -614,242 +602,423 @@ License: BSD 2-Clause -------------------------------------------------------------------------------- -This product bundles Google Cloud BigQuery Client for Java. - -Project URL: https://github.com/googleapis/java-bigquery -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Credentials for Java. - -Project URL: https://github.com/aliyun/credentials-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Tea for Java. - -Project URL: https://github.com/aliyun/tea-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Gson. - -Project URL: https://github.com/google/gson/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google API Common. - -License: BSD 3-Clause -| Copyright 2016, Google Inc. 
-| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This product bundles Google Http Client. - -Project URL: https://www.google.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenCensus. 
- -Project URL: https://github.com/census-instrumentation/opencensus-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JaCoCo. - -Project URL: https://github.com/jacoco/jacoco/ -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles JAXB. - -Project URL: http://jaxb.java.net/ -License: CDDL 1.1 - https://glassfish.java.net/public/CDDL+GPL_1_1.html - --------------------------------------------------------------------------------- - -This product bundles Okio. - -Project URL: https://github.com/square/okio -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OkHttp. - -Project URL: https://github.com/square/okhttp -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Auto Value. - -Project URL: https://github.com/google/auto/tree/master/value -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles gRPC. - -Project URL: https://github.com/grpc/grpc-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Netty. 
- -Project URL: https://netty.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google APIs. - -Project URL: https://github.com/googleapis/googleapis -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud APIs for Java. - -Project URL: https://github.com/googleapis/google-cloud-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles javax.annotation-api. - -Project URL: https://javaee.github.io/glassfish -Project URL: http://jcp.org/en/jsr/detail?id=250 -License: CDDL - https://github.com/javaee/javax.annotation/blob/master/LICENSE - --------------------------------------------------------------------------------- - -This product bundles JSpecify. - -Project URL: https://github.com/jspecify/jspecify -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Animal Sniffer Annotations. - -License: MIT -| The MIT License -| -| Copyright (c) 2009 codehaus.org. 
-| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This product bundles Android Annotations. - -Project URL: http://source.android.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php -This product bundles Conscrypt (openjdk-uber). - -Project URL: https://conscrypt.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Perfmark. 
- -Project URL: https://github.com/perfmark/perfmark -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles org.json. - -Project URL: https://github.com/douglascrockford/JSON-java -License: Public Domain - https://github.com/stleary/JSON-java/blob/master/LICENSE - --------------------------------------------------------------------------------- - -This product bundles Apache Arrow. - -Project URL: https://github.com/apache/arrow -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| - Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| +| - Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| - Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- -This product bundles Google flatbuffers. +This product bundles the Mozilla Public Suffix List, distributed by Apache HttpComponents. -Project URL: https://github.com/google/flatbuffers -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenTelemetry. +Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ -Project URL: https://opentelemetry.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. "Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. 
"Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. "Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. "Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. 
"Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. 
No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. 
You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. 
Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. 
The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. 
Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. 
Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. 
Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. -------------------------------------------------------------------------------- -This product bundles Kotlin. 
- -Project URL: https://github.com/JetBrains/kotlin -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 diff --git a/flink/v1.20/flink-runtime/NOTICE b/flink/v1.20/flink-runtime/NOTICE index 7603ddaedd9f..61b02129d0e1 100644 --- a/flink/v1.20/flink-runtime/NOTICE +++ b/flink/v1.20/flink-runtime/NOTICE @@ -356,128 +356,6 @@ This product bundles Eclipse Microprofile OpenAPI with the following in its NOTI -------------------------------------------------------------------------------- -This product bundles gRPC with the following in its NOTICE file: -| Copyright 2014 The gRPC Authors -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. -| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. 
-| -| ----------------------------------------------------------------------- -| -| This product contains a modified portion of 'OkHttp', an open source -| HTTP & SPDY client for Android and Java applications, which can be obtained -| at: -| -| * LICENSE: -| * okhttp/third_party/okhttp/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/square/okhttp -| * LOCATION_IN_GRPC: -| * okhttp/third_party/okhttp -| -| This product contains a modified portion of 'Envoy', an open source -| cloud-native high-performance edge/middle/service proxy, which can be -| obtained at: -| -| * LICENSE: -| * xds/third_party/envoy/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/envoy/NOTICE -| * HOMEPAGE: -| * https://www.envoyproxy.io -| * LOCATION_IN_GRPC: -| * xds/third_party/envoy -| -| This product contains a modified portion of 'protoc-gen-validate (PGV)', -| an open source protoc plugin to generate polyglot message validators, -| which can be obtained at: -| -| * LICENSE: -| * xds/third_party/protoc-gen-validate/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/protoc-gen-validate/NOTICE -| * HOMEPAGE: -| * https://github.com/envoyproxy/protoc-gen-validate -| * LOCATION_IN_GRPC: -| * xds/third_party/protoc-gen-validate -| -| This product contains a modified portion of 'udpa', -| an open source universal data plane API, which can be obtained at: -| -| * LICENSE: -| * xds/third_party/udpa/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/cncf/udpa -| * LOCATION_IN_GRPC: -| * xds/third_party/udpa - --------------------------------------------------------------------------------- - -This product bundles Netty with the following in its NOTICE file: -| -| The Netty Project -| ================= -| -| Please visit the Netty web site for more information: -| -| * http://netty.io/ -| -| Copyright 2016 The Netty Project -| -| The Netty Project licenses this file to you under the Apache License, -| version 2.0 (the "License"); you 
may not use this file except in compliance -| with the License. You may obtain a copy of the License at: -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -| WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -| License for the specific language governing permissions and limitations -| under the License. -| -| ------------------------------------------------------------------------------- -| This product contains a forked and modified version of Tomcat Native -| -| * LICENSE: -| * license/LICENSE.tomcat-native.txt (Apache License 2.0) -| * HOMEPAGE: -| * http://tomcat.apache.org/native-doc/ -| * https://svn.apache.org/repos/asf/tomcat/native/ -| -| This product contains the Maven wrapper scripts from 'Maven Wrapper', that provides an easy way to ensure a user has everything necessary to run the Maven build. -| -| * LICENSE: -| * license/LICENSE.mvn-wrapper.txt (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/takari/maven-wrapper -| -| This product contains small piece of code to support AIX, taken from netbsd. -| -| * LICENSE: -| * license/LICENSE.aix-netbsd.txt (OpenSSL License) -| * HOMEPAGE: -| * https://ftp.netbsd.org/pub/NetBSD/NetBSD-current/src/crypto/external/bsd/openssl/dist -| -| -| This product contains code from boringssl. 
-| -| * LICENSE (Combination ISC and OpenSSL license) -| * license/LICENSE.boringssl.txt (Combination ISC and OpenSSL license) -| * HOMEPAGE: -| * https://boringssl.googlesource.com/boringssl/ - --------------------------------------------------------------------------------- - This product bundles Jackson JSON Processor with the following in its NOTICE file: | # Jackson JSON processor | @@ -511,82 +389,3 @@ This product bundles Jackson JSON Processor with the following in its NOTICE fil | | See FastDoubleParser-NOTICE for details of other source code included in FastDoubleParser | and the licenses and copyrights that apply to that code. - --------------------------------------------------------------------------------- - -This product bundles Perfmark with the following in its NOTICE file: -| -| Copyright 2019 Google LLC -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. -| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. 
-| -| ----------------------------------------------------------------------- -| -| This product contains a modified portion of 'Catapult', an open source -| Trace Event viewer for Chome, Linux, and Android applications, which can -| be obtained at: -| -| * LICENSE: -| * traceviewer/src/main/resources/io/perfmark/traceviewer/third_party/catapult/LICENSE (New BSD License) -| * HOMEPAGE: -| * https://github.com/catapult-project/catapult -| -| This product contains a modified portion of 'Polymer', a library for Web -| Components, which can be obtained at: -| * LICENSE: -| * traceviewer/src/main/resources/io/perfmark/traceviewer/third_party/polymer/LICENSE (New BSD License) -| * HOMEPAGE: -| * https://github.com/Polymer/polymer -| -| -| This product contains a modified portion of 'ASM', an open source -| Java Bytecode library, which can be obtained at: -| -| * LICENSE: -| * agent/src/main/resources/io/perfmark/agent/third_party/asm/LICENSE (BSD style License) -| * HOMEPAGE: -| * https://asm.ow2.io/ - --------------------------------------------------------------------------------- - -This product bundles Conscrypt (uber-jar) with the following in its NOTICE: -| Copyright 2016 The Android Open Source Project -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. -| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. 
-| -| ----------------------------------------------------------------------- -| This product contains a modified portion of `Netty`, a configurable network -| stack in Java, which can be obtained at: -| -| * LICENSE: -| * licenses/LICENSE.netty.txt (Apache License 2.0) -| * HOMEPAGE: -| * http://netty.io/ -| -| This product contains a modified portion of `Apache Harmony`, modular Java runtime, -| which can be obtained at: -| -| * LICENSE: -| * licenses/LICENSE.harmony.txt (Apache License 2.0) -| * HOMEPAGE: -| * https://harmony.apache.org/ diff --git a/flink/v1.20/flink-runtime/baseline-class-uniqueness.lock b/flink/v1.20/flink-runtime/baseline-class-uniqueness.lock deleted file mode 100644 index 7868296a79e8..000000000000 --- a/flink/v1.20/flink-runtime/baseline-class-uniqueness.lock +++ /dev/null @@ -1,60 +0,0 @@ -# Danger! Multiple jars contain identically named classes. This may cause different behaviour depending on classpath ordering. -# Run ./gradlew checkClassUniqueness --fix to update this file - -## runtimeClasspath -[com.google.protobuf:protobuf-java, dev.vortex:vortex-jni] - - com.google.protobuf.BoolValue - - com.google.protobuf.BoolValue$1 - - com.google.protobuf.BoolValue$Builder - - com.google.protobuf.BoolValueOrBuilder - - com.google.protobuf.BytesValue - - com.google.protobuf.BytesValue$1 - - com.google.protobuf.BytesValue$Builder - - com.google.protobuf.BytesValueOrBuilder - - com.google.protobuf.DoubleValue - - com.google.protobuf.DoubleValue$1 - - com.google.protobuf.DoubleValue$Builder - - com.google.protobuf.DoubleValueOrBuilder - - com.google.protobuf.FloatValue - - com.google.protobuf.FloatValue$1 - - com.google.protobuf.FloatValue$Builder - - com.google.protobuf.FloatValueOrBuilder - - com.google.protobuf.Int32Value - - com.google.protobuf.Int32Value$1 - - com.google.protobuf.Int32Value$Builder - - com.google.protobuf.Int32ValueOrBuilder - - com.google.protobuf.Int64Value - - com.google.protobuf.Int64Value$1 - - 
com.google.protobuf.Int64Value$Builder - - com.google.protobuf.Int64ValueOrBuilder - - com.google.protobuf.ListValue - - com.google.protobuf.ListValue$1 - - com.google.protobuf.ListValue$Builder - - com.google.protobuf.ListValueOrBuilder - - com.google.protobuf.NullValue - - com.google.protobuf.NullValue$1 - - com.google.protobuf.StringValue - - com.google.protobuf.StringValue$1 - - com.google.protobuf.StringValue$Builder - - com.google.protobuf.StringValueOrBuilder - - com.google.protobuf.Struct - - com.google.protobuf.Struct$1 - - com.google.protobuf.Struct$Builder - - com.google.protobuf.Struct$Builder$FieldsConverter - - com.google.protobuf.Struct$FieldsDefaultEntryHolder - - com.google.protobuf.StructOrBuilder - - com.google.protobuf.StructProto - - com.google.protobuf.UInt32Value - - com.google.protobuf.UInt32Value$1 - - com.google.protobuf.UInt32Value$Builder - - com.google.protobuf.UInt32ValueOrBuilder - - com.google.protobuf.UInt64Value - - com.google.protobuf.UInt64Value$1 - - com.google.protobuf.UInt64Value$Builder - - com.google.protobuf.UInt64ValueOrBuilder - - com.google.protobuf.Value - - com.google.protobuf.Value$1 - - com.google.protobuf.Value$Builder - - com.google.protobuf.Value$KindCase - - com.google.protobuf.ValueOrBuilder - - com.google.protobuf.WrappersProto diff --git a/flink/v1.20/flink-runtime/runtime-deps.txt b/flink/v1.20/flink-runtime/runtime-deps.txt new file mode 100644 index 000000000000..2eba0b183fb9 --- /dev/null +++ b/flink/v1.20/flink-runtime/runtime-deps.txt @@ -0,0 +1,47 @@ +com.fasterxml.jackson.core:jackson-annotations:2.21 +com.fasterxml.jackson.core:jackson-core:2.21.3 +com.fasterxml.jackson.core:jackson-databind:2.21.3 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.21.3 +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.github.luben:zstd-jni:1.5.7-3 +com.google.errorprone:error_prone_annotations:2.41.0 +com.google.flatbuffers:flatbuffers-java:25.2.10 +com.google.guava:failureaccess:1.0.3 
+com.google.guava:guava:33.5.0-jre +com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava +com.google.j2objc:j2objc-annotations:3.1 +dev.failsafe:failsafe:3.3.2 +dev.vortex:vortex-jni:0.67.0 +io.airlift:aircompressor:2.0.3 +io.netty:netty-buffer:4.2.7.Final +io.netty:netty-common:4.2.7.Final +org.apache.arrow:arrow-c-data:18.3.0 +org.apache.arrow:arrow-format:18.3.0 +org.apache.arrow:arrow-memory-core:18.3.0 +org.apache.arrow:arrow-memory-netty-buffer-patch:18.3.0 +org.apache.arrow:arrow-memory-netty:18.3.0 +org.apache.arrow:arrow-vector:18.3.0 +org.apache.avro:avro:1.12.1 +org.apache.datasketches:datasketches-java:6.2.0 +org.apache.datasketches:datasketches-memory:3.0.2 +org.apache.httpcomponents.client5:httpclient5:5.6.1 +org.apache.httpcomponents.core5:httpcore5-h2:5.4 +org.apache.httpcomponents.core5:httpcore5:5.4 +org.apache.orc:orc-core:1.9.8 +org.apache.orc:orc-shims:1.9.8 +org.apache.parquet:parquet-avro:1.17.0 +org.apache.parquet:parquet-column:1.17.0 +org.apache.parquet:parquet-common:1.17.0 +org.apache.parquet:parquet-encoding:1.17.0 +org.apache.parquet:parquet-format-structures:1.17.0 +org.apache.parquet:parquet-hadoop:1.17.0 +org.apache.parquet:parquet-jackson:1.17.0 +org.apache.parquet:parquet-variant:1.17.0 +org.checkerframework:checker-qual:3.19.0 +org.eclipse.microprofile.openapi:microprofile-openapi-api:4.1.1 +org.jspecify:jspecify:1.0.0 +org.locationtech.jts:jts-core:1.20.0 +org.projectnessie.nessie:nessie-client:0.107.5 +org.projectnessie.nessie:nessie-model:0.107.5 +org.roaringbitmap:RoaringBitmap:1.6.14 +org.threeten:threeten-extra:1.7.1 diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java index e0672811cf5f..7661372c88e8 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java @@ -44,7 
+44,7 @@ public FlinkConfParser(Table table, Map options, ReadableConfig this.readableConfig = readableConfig; } - FlinkConfParser(Map options, ReadableConfig readableConfig) { + public FlinkConfParser(Map options, ReadableConfig readableConfig) { this.tableProperties = ImmutableMap.of(); this.options = options; this.readableConfig = readableConfig; diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java index 408065f06057..8f106da8d56b 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java @@ -137,11 +137,17 @@ public Type visit(TimeType timeType) { @Override public Type visit(TimestampType timestampType) { + if (timestampType.getPrecision() > 6) { + return Types.TimestampNanoType.withoutZone(); + } return Types.TimestampType.withoutZone(); } @Override public Type visit(LocalZonedTimestampType localZonedTimestampType) { + if (localZonedTimestampType.getPrecision() > 6) { + return Types.TimestampNanoType.withZone(); + } return Types.TimestampType.withZone(); } diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java index 3ef611f2ded5..920e44b24b31 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java @@ -114,19 +114,35 @@ private static PositionalGetter buildGetter(LogicalType logicalType, Type typ case TIMESTAMP_WITHOUT_TIME_ZONE: TimestampType timestampType = (TimestampType) logicalType; - return (row, pos) -> { - LocalDateTime localDateTime = - row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); - return DateTimeUtil.microsFromTimestamp(localDateTime); - }; + if (type.typeId() == 
Type.TypeID.TIMESTAMP_NANO) { + return (row, pos) -> { + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + return DateTimeUtil.nanosFromTimestamp(localDateTime); + }; + } else { + return (row, pos) -> { + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + return DateTimeUtil.microsFromTimestamp(localDateTime); + }; + } case TIMESTAMP_WITH_LOCAL_TIME_ZONE: LocalZonedTimestampType lzTs = (LocalZonedTimestampType) logicalType; - return (row, pos) -> { - TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); - return timestampData.getMillisecond() * 1000 - + timestampData.getNanoOfMillisecond() / 1000; - }; + if (type.typeId() == Type.TypeID.TIMESTAMP_NANO) { + return (row, pos) -> { + TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); + return timestampData.getMillisecond() * 1_000_000L + + timestampData.getNanoOfMillisecond(); + }; + } else { + return (row, pos) -> { + TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); + return timestampData.getMillisecond() * 1000L + + timestampData.getNanoOfMillisecond() / 1000; + }; + } case ROW: RowType rowType = (RowType) logicalType; diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java index 65b9d44ad4b8..77f16bfdb2ab 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java @@ -70,7 +70,7 @@ public OrcValueReader record( TypeDescription record, List names, List> fields) { - return FlinkOrcReaders.struct(fields, iStruct, idToConstant); + return FlinkOrcReaders.struct(record, fields, iStruct, idToConstant); } @Override @@ -112,6 +112,13 @@ public OrcValueReader primitive(Type.PrimitiveType iPrimitive, 
TypeDescriptio } else { return FlinkOrcReaders.timestamps(); } + case TIMESTAMP_NANO: + Types.TimestampNanoType timestampNanoType = (Types.TimestampNanoType) iPrimitive; + if (timestampNanoType.shouldAdjustToUTC()) { + return FlinkOrcReaders.timestampTzs(); + } else { + return FlinkOrcReaders.timestamps(); + } case STRING: return FlinkOrcReaders.strings(); case UUID: diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java index 7a4a15c7e600..c5c958fbdb04 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java @@ -39,6 +39,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; +import org.apache.orc.TypeDescription; import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; import org.apache.orc.storage.ql.exec.vector.ColumnVector; import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; @@ -91,8 +92,11 @@ public static OrcValueReader map( } public static OrcValueReader struct( - List> readers, Types.StructType struct, Map idToConstant) { - return new StructReader(readers, struct, idToConstant); + TypeDescription record, + List> readers, + Types.StructType struct, + Map idToConstant) { + return new StructReader(record, readers, struct, idToConstant); } private static class StringReader implements OrcValueReader { @@ -265,8 +269,11 @@ private static class StructReader extends OrcValueReaders.StructReader private final int numFields; StructReader( - List> readers, Types.StructType struct, Map idToConstant) { - super(readers, struct, idToConstant); + TypeDescription record, + List> readers, + Types.StructType struct, + Map idToConstant) { + super(record, readers, struct, 
idToConstant); this.numFields = struct.fields().size(); } diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java index a467d848337d..c1b46252e18a 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java @@ -145,6 +145,13 @@ public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, LogicalType fl } else { return FlinkOrcWriters.timestamps(); } + case TIMESTAMP_NANO: + Types.TimestampNanoType timestampNanoType = (Types.TimestampNanoType) iPrimitive; + if (timestampNanoType.shouldAdjustToUTC()) { + return FlinkOrcWriters.timestampNanoTzs(); + } else { + return FlinkOrcWriters.timestampNanos(); + } case STRING: return FlinkOrcWriters.strings(); case UUID: diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java index 684842aa099c..bf19a46c05fb 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java @@ -70,6 +70,14 @@ static OrcValueWriter timestampTzs() { return TimestampTzWriter.INSTANCE; } + static OrcValueWriter timestampNanos() { + return TimestampNanoWriter.INSTANCE; + } + + static OrcValueWriter timestampNanoTzs() { + return TimestampNanoTzWriter.INSTANCE; + } + static OrcValueWriter decimals(int precision, int scale) { if (precision <= 18) { return new Decimal18Writer(precision, scale); @@ -170,6 +178,35 @@ public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { } } + private static class TimestampNanoWriter implements OrcValueWriter { + private static final TimestampNanoWriter INSTANCE = new TimestampNanoWriter(); + + @Override + public 
void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { + TimestampColumnVector cv = (TimestampColumnVector) output; + cv.setIsUTC(true); + // millis + OffsetDateTime offsetDateTime = data.toInstant().atOffset(ZoneOffset.UTC); + cv.time[rowId] = + offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; + cv.nanos[rowId] = offsetDateTime.getNano(); + } + } + + private static class TimestampNanoTzWriter implements OrcValueWriter { + private static final TimestampNanoTzWriter INSTANCE = new TimestampNanoTzWriter(); + + @SuppressWarnings("JavaInstantGetSecondsGetNano") + @Override + public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { + TimestampColumnVector cv = (TimestampColumnVector) output; + // millis + Instant instant = data.toInstant(); + cv.time[rowId] = instant.toEpochMilli(); + cv.nanos[rowId] = instant.getNano(); + } + } + private static class Decimal18Writer implements OrcValueWriter { private final int precision; private final int scale; diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java index f23a7ee3d0d3..81bb55967992 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java @@ -69,6 +69,8 @@ public static Object convertConstant(Type type, Object value) { return (int) ((Long) value / 1000); case TIMESTAMP: // TimestampData return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromMicros((Long) value)); + case TIMESTAMP_NANO: + return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromNanos((Long) value)); case UUID: return UUIDUtil.convert((UUID) value); default: diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java 
index 34576a1e5c0b..b469f2310f42 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java @@ -48,6 +48,7 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; +import org.apache.iceberg.util.DateTimeUtil; @Internal public class StructRowData implements RowData { @@ -120,8 +121,8 @@ public int getInt(int pos) { if (integer instanceof Integer) { return (int) integer; - } else if (integer instanceof LocalDate) { - return (int) ((LocalDate) integer).toEpochDay(); + } else if (integer instanceof LocalDate localDate) { + return (int) localDate.toEpochDay(); } else if (integer instanceof LocalTime) { return (int) (((LocalTime) integer).toNanoOfDay() / 1000_000); } else { @@ -185,8 +186,27 @@ private BigDecimal getDecimalInternal(int pos) { @Override public TimestampData getTimestamp(int pos, int precision) { + if (precision > 6) { + Object timeVal = struct.get(pos, Object.class); + if (timeVal instanceof OffsetDateTime) { + OffsetDateTime odt = (OffsetDateTime) timeVal; + return TimestampData.fromEpochMillis( + odt.toInstant().toEpochMilli(), odt.getNano() % 1_000_000); + } else if (timeVal instanceof LocalDateTime) { + LocalDateTime ldt = (LocalDateTime) timeVal; + return TimestampData.fromEpochMillis( + ldt.toInstant(ZoneOffset.UTC).toEpochMilli(), ldt.getNano() % 1_000_000); + } else if (timeVal instanceof Long) { + long timeLong = (Long) timeVal; + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1_000_000L), (int) Math.floorMod(timeLong, 1_000_000L)); + } else { + throw new IllegalStateException("Unknown type for timestamp_ns: " + timeVal.getClass()); + } + } long timeLong = getLong(pos); - return TimestampData.fromEpochMillis(timeLong / 1000, (int) (timeLong % 1000) * 1000); + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1000L), (int) 
Math.floorMod(timeLong, 1000L) * 1000); } @Override @@ -257,9 +277,29 @@ private Object convertValue(Type elementType, Object value) { case DECIMAL: return value; case TIMESTAMP: - long millisecond = (long) value / 1000; - int nanoOfMillisecond = (int) ((Long) value % 1000) * 1000; - return TimestampData.fromEpochMillis(millisecond, nanoOfMillisecond); + long timeMillis; + if (value instanceof LocalDateTime localDateTime) { + timeMillis = DateTimeUtil.microsFromTimestamp(localDateTime) / 1000L; + } else if (value instanceof OffsetDateTime offsetDateTime) { + timeMillis = DateTimeUtil.microsFromTimestamptz(offsetDateTime) / 1000L; + } else { + timeMillis = Math.floorDiv((Long) value, 1000L); + } + return TimestampData.fromEpochMillis( + timeMillis, + (int) Math.floorMod(value instanceof Long ? (Long) value : timeMillis * 1000L, 1000L) + * 1000); + case TIMESTAMP_NANO: + long nanoLong; + if (value instanceof LocalDateTime localDateTime) { + nanoLong = DateTimeUtil.nanosFromTimestamp(localDateTime); + } else if (value instanceof OffsetDateTime offsetDateTime) { + nanoLong = DateTimeUtil.nanosFromTimestamptz(offsetDateTime); + } else { + nanoLong = (Long) value; + } + return TimestampData.fromEpochMillis( + Math.floorDiv(nanoLong, 1_000_000L), (int) Math.floorMod(nanoLong, 1_000_000L)); case STRING: return StringData.fromString(value.toString()); case FIXED: diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java new file mode 100644 index 000000000000..0f70e60a1b9f --- /dev/null +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java @@ -0,0 +1,303 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro; + +import java.io.Serializable; +import java.lang.reflect.Array; +import java.nio.ByteBuffer; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.temporal.ChronoField; +import java.util.List; +import java.util.Map; +import org.apache.avro.generic.GenericFixed; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.utils.LogicalTypeUtils; +import 
org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +/** + * Tool class used to convert from Avro {@link GenericRecord} to {@link RowData}. + * + *

This class is adapted in Iceberg to add support for nanosecond precision timestamps
+ * (FLINK-39251). Once that ticket is resolved in Flink, this custom converter may be removed.
+ */
+@Internal
+public class AvroToRowDataConverters {
+
+  private AvroToRowDataConverters() {}
+
+  /**
+   * Runtime converter that converts Avro data structures into objects of Flink Table & SQL
+   * internal data structures.
+   */
+  @FunctionalInterface
+  public interface AvroToRowDataConverter extends Serializable {
+    Object convert(Object object);
+  }
+
+  // -------------------------------------------------------------------------------------
+  // Runtime Converters
+  // -------------------------------------------------------------------------------------
+
+  /** Creates a row converter that uses the legacy timestamp mapping. */
+  public static AvroToRowDataConverter createRowConverter(RowType rowType) {
+    return createRowConverter(rowType, true);
+  }
+
+  /**
+   * Creates a converter that reads an Avro {@link IndexedRecord} positionally into a {@link
+   * GenericRowData}.
+   *
+   * @param rowType row type whose field types select the per-field converters
+   * @param legacyTimestampMapping when true, TIMESTAMP_WITH_LOCAL_TIME_ZONE fields are rejected
+   *     with an {@link UnsupportedOperationException}
+   * @return a converter that expects a non-null record
+   */
+  public static AvroToRowDataConverter createRowConverter(
+      RowType rowType, boolean legacyTimestampMapping) {
+    final AvroToRowDataConverter[] fieldConverters =
+        rowType.getFields().stream()
+            .map(RowType.RowField::getType)
+            .map(type -> createNullableConverter(type, legacyTimestampMapping))
+            .toArray(AvroToRowDataConverter[]::new);
+    final int arity = rowType.getFieldCount();
+
+    return avroObject -> {
+      IndexedRecord record = (IndexedRecord) avroObject;
+      GenericRowData row = new GenericRowData(arity);
+      for (int i = 0; i < arity; ++i) {
+        // Avro always deserializes successfully even when the type does not match,
+        // so there is no need to report which field could not be deserialized
+        row.setField(i, fieldConverters[i].convert(record.get(i)));
+      }
+      return row;
+    };
+  }
+
+  /** Creates a runtime converter which is null safe. */
+  private static AvroToRowDataConverter createNullableConverter(
+      LogicalType type, boolean legacyTimestampMapping) {
+    final AvroToRowDataConverter converter = createConverter(type, legacyTimestampMapping);
+    return avroObject -> {
+      if (avroObject == null) {
+        return null;
+      }
+      return converter.convert(avroObject);
+    };
+  }
+
+  /**
+   * Creates a runtime converter which assumes the input object is not null.
+   *
+   * @param type Flink logical type to convert to
+   * @param legacyTimestampMapping passed down to every nested converter (array, row, map)
+   * @throws UnsupportedOperationException for RAW or unknown type roots, and for
+   *     TIMESTAMP_WITH_LOCAL_TIME_ZONE when the legacy mapping is enabled
+   */
+  private static AvroToRowDataConverter createConverter(
+      LogicalType type, boolean legacyTimestampMapping) {
+    switch (type.getTypeRoot()) {
+      case NULL:
+        return avroObject -> null;
+      case TINYINT:
+        return avroObject -> ((Integer) avroObject).byteValue();
+      case SMALLINT:
+        return avroObject -> ((Integer) avroObject).shortValue();
+      case BOOLEAN: // boolean
+      case INTEGER: // int
+      case INTERVAL_YEAR_MONTH: // long
+      case BIGINT: // long
+      case INTERVAL_DAY_TIME: // long
+      case FLOAT: // float
+      case DOUBLE: // double
+        return avroObject -> avroObject;
+      case DATE:
+        return AvroToRowDataConverters::convertToDate;
+      case TIME_WITHOUT_TIME_ZONE:
+        return AvroToRowDataConverters::convertToTime;
+      case TIMESTAMP_WITHOUT_TIME_ZONE:
+        // Iceberg: Added support for nanoseconds precision (FLINK-39251)
+        return avroObject -> convertToTimestamp(avroObject, type);
+      case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+        if (legacyTimestampMapping) {
+          throw new UnsupportedOperationException("Unsupported type: " + type);
+        } else {
+          // Iceberg: Added support for nanoseconds precision (FLINK-39251)
+          return avroObject -> convertToTimestamp(avroObject, type);
+        }
+      case CHAR:
+      case VARCHAR:
+        return avroObject -> StringData.fromString(avroObject.toString());
+      case BINARY:
+      case VARBINARY:
+        return AvroToRowDataConverters::convertToBytes;
+      case DECIMAL:
+        return createDecimalConverter((DecimalType) type);
+      case ARRAY:
+        return createArrayConverter((ArrayType) type, legacyTimestampMapping);
+      case ROW:
+        // Propagate the mapping mode so nested rows follow the same timestamp rules as
+        // arrays and maps; the one-argument overload would silently force the legacy mapping.
+        return createRowConverter((RowType) type, legacyTimestampMapping);
+      case MAP:
+      case MULTISET:
+        return createMapConverter(type, legacyTimestampMapping);
+      case RAW:
+      default:
+        throw new UnsupportedOperationException("Unsupported type: " + type);
+    }
+  }
+
+  /** Creates a converter that builds {@link DecimalData} from Avro fixed, ByteBuffer or byte[]. */
+  private static AvroToRowDataConverter createDecimalConverter(DecimalType decimalType) {
+    final int precision = decimalType.getPrecision();
+    final int scale = decimalType.getScale();
+    return avroObject -> {
+      final byte[] bytes;
+      if (avroObject instanceof GenericFixed) {
+        bytes = ((GenericFixed) avroObject).bytes();
+      } else if (avroObject instanceof ByteBuffer) {
+        ByteBuffer byteBuffer = (ByteBuffer) avroObject;
+        bytes = new byte[byteBuffer.remaining()];
+        byteBuffer.get(bytes);
+      } else {
+        bytes = (byte[]) avroObject;
+      }
+      return DecimalData.fromUnscaledBytes(bytes, precision, scale);
+    };
+  }
+
+  /** Creates a converter that maps an Avro list to {@link GenericArrayData}, element by element. */
+  private static AvroToRowDataConverter createArrayConverter(
+      ArrayType arrayType, boolean legacyTimestampMapping) {
+    final AvroToRowDataConverter elementConverter =
+        createNullableConverter(arrayType.getElementType(), legacyTimestampMapping);
+    final Class<?> elementClass =
+        LogicalTypeUtils.toInternalConversionClass(arrayType.getElementType());
+
+    return avroObject -> {
+      final List<?> list = (List<?>) avroObject;
+      final int length = list.size();
+      final Object[] array = (Object[]) Array.newInstance(elementClass, length);
+      for (int i = 0; i < length; ++i) {
+        array[i] = elementConverter.convert(list.get(i));
+      }
+      return new GenericArrayData(array);
+    };
+  }
+
+  /**
+   * Creates a converter that maps an Avro map to {@link GenericMapData}. Keys go through the
+   * STRING converter; values go through a null-safe converter for the extracted value type.
+   */
+  private static AvroToRowDataConverter createMapConverter(
+      LogicalType type, boolean legacyTimestampMapping) {
+    final AvroToRowDataConverter keyConverter =
+        createConverter(DataTypes.STRING().getLogicalType(), legacyTimestampMapping);
+    final AvroToRowDataConverter valueConverter =
+        createNullableConverter(
+            AvroSchemaConverter.extractValueTypeToAvroMap(type), legacyTimestampMapping);
+
+    return avroObject -> {
+      final Map<?, ?> map = (Map<?, ?>) avroObject;
+      Map<Object, Object> result = Maps.newHashMap();
+      for (Map.Entry<?, ?> entry : map.entrySet()) {
+        Object key = keyConverter.convert(entry.getKey());
+        Object value = valueConverter.convert(entry.getValue());
+        result.put(key, value);
+      }
+      return new GenericMapData(result);
+    };
+  }
+
+  /**
+   * Converts an Avro timestamp value into {@link TimestampData}.
+   *
+   * <p>A {@code Long} is interpreted by the precision of {@code type}: up to 3 as epoch
+   * milliseconds, 4 to 6 as epoch microseconds, above 6 as epoch nanoseconds. {@link Instant} and
+   * {@link LocalDateTime} values are converted directly; any other object is handed to the
+   * optional Joda converter.
+   *
+   * @param object non-null Avro value
+   * @param type Flink logical type; the precision defaults to 3 when it is neither a
+   *     TimestampType nor a LocalZonedTimestampType
+   * @throws IllegalArgumentException if the object type is unsupported and Joda is unavailable
+   */
+  private static TimestampData convertToTimestamp(Object object, LogicalType type) {
+    int precision = 3;
+    if (type instanceof org.apache.flink.table.types.logical.TimestampType) {
+      precision = ((org.apache.flink.table.types.logical.TimestampType) type).getPrecision();
+    } else if (type instanceof org.apache.flink.table.types.logical.LocalZonedTimestampType) {
+      precision =
+          ((org.apache.flink.table.types.logical.LocalZonedTimestampType) type).getPrecision();
+    }
+
+    if (object instanceof Long) {
+      long timeLong = (Long) object;
+      if (precision <= 3) {
+        return TimestampData.fromEpochMillis(timeLong);
+      } else if (precision <= 6) {
+        // The remainder is 0..999 microseconds; times 1_000 gives the nanos within the
+        // millisecond (at most 999_000). floorDiv/floorMod keep pre-epoch values consistent.
+        return TimestampData.fromEpochMillis(
+            Math.floorDiv(timeLong, 1000L), (int) Math.floorMod(timeLong, 1000L) * 1_000);
+      } else {
+        // Iceberg: Added support for nanoseconds precision (FLINK-39251)
+        return TimestampData.fromEpochMillis(
+            Math.floorDiv(timeLong, 1_000_000L), (int) Math.floorMod(timeLong, 1_000_000L));
+      }
+    } else if (object instanceof Instant) {
+      return TimestampData.fromInstant((Instant) object);
+    } else if (object instanceof LocalDateTime) {
+      return TimestampData.fromLocalDateTime((LocalDateTime) object);
+    } else {
+      JodaConverter jodaConverter = JodaConverter.getConverter();
+      if (jodaConverter != null) {
+        return TimestampData.fromEpochMillis(jodaConverter.convertTimestamp(object));
+      } else {
+        throw new IllegalArgumentException(
+            "Unexpected object type for TIMESTAMP logical type. Received: " + object);
+      }
+    }
+  }
+
+  /** Converts an Avro date value (Integer, LocalDate or a Joda object) to an int. */
+  private static int convertToDate(Object object) {
+    if (object instanceof Integer) {
+      return (Integer) object;
+    } else if (object instanceof LocalDate) {
+      return (int) ((LocalDate) object).toEpochDay();
+    } else {
+      JodaConverter jodaConverter = JodaConverter.getConverter();
+      if (jodaConverter != null) {
+        return (int) jodaConverter.convertDate(object);
+      } else {
+        throw new IllegalArgumentException(
+            "Unexpected object type for DATE logical type. Received: " + object);
+      }
+    }
+  }
+
+  /** Converts an Avro time value (Integer, LocalTime or a Joda object) to millis of day. */
+  private static int convertToTime(Object object) {
+    final int millis;
+    if (object instanceof Integer) {
+      millis = (Integer) object;
+    } else if (object instanceof LocalTime) {
+      millis = ((LocalTime) object).get(ChronoField.MILLI_OF_DAY);
+    } else {
+      JodaConverter jodaConverter = JodaConverter.getConverter();
+      if (jodaConverter != null) {
+        millis = jodaConverter.convertTime(object);
+      } else {
+        throw new IllegalArgumentException(
+            "Unexpected object type for TIME logical type. Received: " + object);
+      }
+    }
+    return millis;
+  }
+
+  /** Extracts the bytes of an Avro fixed, a ByteBuffer (consuming its remaining bytes) or byte[]. */
+  private static byte[] convertToBytes(Object object) {
+    if (object instanceof GenericFixed) {
+      return ((GenericFixed) object).bytes();
+    } else if (object instanceof ByteBuffer) {
+      ByteBuffer byteBuffer = (ByteBuffer) object;
+      byte[] bytes = new byte[byteBuffer.remaining()];
+      byteBuffer.get(bytes);
+      return bytes;
+    } else {
+      return (byte[]) object;
+    }
+  }
+}
diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java
new file mode 100644
index 000000000000..c30b78023345
--- /dev/null
+++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro; + +import org.joda.time.DateTime; +import org.joda.time.DateTimeFieldType; +import org.joda.time.LocalDate; +import org.joda.time.LocalTime; + +/** + * Encapsulates joda optional dependency. Instantiates this class only if joda is available on the + * classpath. 
+ */
+@SuppressWarnings("JavaUtilDate")
+class JodaConverter {
+
+  // Result of the one-time classpath probe; stays null when Joda is not available.
+  private static JodaConverter instance;
+  // Set once the probe has run, whatever its outcome.
+  private static boolean instantiated = false;
+
+  private JodaConverter() {}
+
+  /**
+   * Returns the shared converter, probing the thread context class loader for {@code
+   * org.joda.time.DateTime} on the first call.
+   *
+   * @return the converter, or null when the Joda class could not be found
+   */
+  public static JodaConverter getConverter() {
+    if (!instantiated) {
+      try {
+        Class.forName(
+            "org.joda.time.DateTime", false, Thread.currentThread().getContextClassLoader());
+        instance = new JodaConverter();
+      } catch (ClassNotFoundException e) {
+        instance = null;
+      } finally {
+        instantiated = true;
+      }
+    }
+    return instance;
+  }
+
+  /** Casts to a Joda {@link LocalDate} and returns {@code toDate().getTime()}. */
+  public long convertDate(Object object) {
+    return ((LocalDate) object).toDate().getTime();
+  }
+
+  /** Casts to a Joda {@link LocalTime} and returns its millis-of-day field. */
+  public int convertTime(Object object) {
+    return ((LocalTime) object).get(DateTimeFieldType.millisOfDay());
+  }
+
+  /** Casts to a Joda {@link DateTime} and returns {@code toDate().getTime()}. */
+  public long convertTimestamp(Object object) {
+    return ((DateTime) object).toDate().getTime();
+  }
+}
diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java
new file mode 100644
index 000000000000..d4c7e4282d6e
--- /dev/null
+++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java
@@ -0,0 +1,394 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.time.ZoneOffset; +import java.util.List; +import java.util.Map; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.util.Utf8; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.util.CollectionUtil; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +/** + * Tool class used to convert from {@link RowData} to Avro {@link GenericRecord}. + * + *

This class is adapted in Iceberg to add support for nanosecond precision timestamps
+ * (FLINK-39251). Once that ticket is resolved in Flink, this custom converter may be removed.
+ */
+@Internal
+public class RowDataToAvroConverters {
+
+  private RowDataToAvroConverters() {}
+
+  // --------------------------------------------------------------------------------
+  // Runtime Converters
+  // --------------------------------------------------------------------------------
+
+  /**
+   * Runtime converter that converts objects of Flink Table & SQL internal data structures to
+   * corresponding Avro data structures.
+   */
+  @FunctionalInterface
+  public interface RowDataToAvroConverter extends Serializable {
+    Object convert(Schema schema, Object object);
+  }
+
+  // --------------------------------------------------------------------------------
+  // IMPORTANT! We use anonymous classes instead of lambdas for a reason here. It is
+  // necessary because the maven shade plugin cannot relocate classes in
+  // SerializedLambdas (MSHADE-260). On the other hand we want to relocate Avro for
+  // sql-client uber jars.
+  // --------------------------------------------------------------------------------
+
+  /**
+   * Creates a runtime converter according to the given logical type that converts objects of Flink
+   * Table & SQL internal data structures to corresponding Avro data structures.
+   *
+   * <p>Delegates to the two-argument overload with the legacy timestamp mapping enabled.
+   */
+  public static RowDataToAvroConverter createConverter(LogicalType type) {
+    return createConverter(type, true);
+  }
+
+  /**
+   * Creates a converter for the given logical type and wraps it so that a null input yields null
+   * and a two-branch union schema containing NULL is unwrapped to its non-null branch.
+   *
+   * @param type Flink logical type of the values to convert
+   * @param legacyTimestampMapping selects the timestamp handling and is passed down to nested
+   *     array, row and map converters
+   * @throws UnsupportedOperationException for RAW or unknown type roots, and for
+   *     TIMESTAMP_WITH_LOCAL_TIME_ZONE when the legacy mapping is enabled
+   */
+  @SuppressWarnings("checkstyle:MethodLength")
+  public static RowDataToAvroConverter createConverter(
+      LogicalType type, boolean legacyTimestampMapping) {
+    final RowDataToAvroConverter converter;
+    switch (type.getTypeRoot()) {
+      case NULL:
+        converter =
+            new RowDataToAvroConverter() {
+              private static final long serialVersionUID = 1L;
+
+              @Override
+              public Object convert(Schema schema, Object object) {
+                return null;
+              }
+            };
+        break;
+      case TINYINT:
+        // widened to Integer
+        converter =
+            new RowDataToAvroConverter() {
+              private static final long serialVersionUID = 1L;
+
+              @Override
+              public Object convert(Schema schema, Object object) {
+                return ((Byte) object).intValue();
+              }
+            };
+        break;
+      case SMALLINT:
+        // widened to Integer
+        converter =
+            new RowDataToAvroConverter() {
+              private static final long serialVersionUID = 1L;
+
+              @Override
+              public Object convert(Schema schema, Object object) {
+                return ((Short) object).intValue();
+              }
+            };
+        break;
+      case BOOLEAN: // boolean
+      case INTEGER: // int
+      case INTERVAL_YEAR_MONTH: // long
+      case BIGINT: // long
+      case INTERVAL_DAY_TIME: // long
+      case FLOAT: // float
+      case DOUBLE: // double
+      case TIME_WITHOUT_TIME_ZONE: // int
+      case DATE: // int
+        // passed through unchanged
+        converter =
+            new RowDataToAvroConverter() {
+              private static final long serialVersionUID = 1L;
+
+              @Override
+              public Object convert(Schema schema, Object object) {
+                return object;
+              }
+            };
+        break;
+      case CHAR:
+      case VARCHAR:
+        converter =
+            new RowDataToAvroConverter() {
+              private static final long serialVersionUID = 1L;
+
+              @Override
+              public Object convert(Schema schema, Object object) {
+                return new Utf8(object.toString());
+              }
+            };
+        break;
+      case BINARY:
+      case VARBINARY:
+        converter =
+            new RowDataToAvroConverter() {
+              private static final long serialVersionUID = 1L;
+
+              @Override
+              public Object convert(Schema schema, Object object) {
+                return ByteBuffer.wrap((byte[]) object);
+              }
+            };
+        break;
+        // Iceberg: Added support for nanoseconds precision (FLINK-39251)
+      case TIMESTAMP_WITHOUT_TIME_ZONE:
+        // Precision picks the unit of the emitted long: up to 3 -> millis, 4 to 6 -> micros,
+        // above 6 -> nanos. Falls back to 3 when the type is not a TimestampType.
+        final int tzPrecision;
+        if (type instanceof org.apache.flink.table.types.logical.TimestampType) {
+          tzPrecision = ((org.apache.flink.table.types.logical.TimestampType) type).getPrecision();
+        } else {
+          tzPrecision = 3;
+        }
+        if (legacyTimestampMapping) {
+          converter =
+              new RowDataToAvroConverter() {
+                private static final long serialVersionUID = 1L;
+
+                @Override
+                public Object convert(Schema schema, Object object) {
+                  TimestampData timestampData = (TimestampData) object;
+                  if (tzPrecision <= 3) {
+                    return timestampData.getMillisecond();
+                  } else if (tzPrecision <= 6) {
+                    return timestampData.getMillisecond() * 1000L
+                        + timestampData.getNanoOfMillisecond() / 1000;
+                  } else {
+                    // Iceberg: Added support for nanoseconds precision (FLINK-39251)
+                    return timestampData.getMillisecond() * 1_000_000L
+                        + timestampData.getNanoOfMillisecond();
+                  }
+                }
+              };
+        } else {
+          converter =
+              new RowDataToAvroConverter() {
+                private static final long serialVersionUID = 1L;
+
+                @Override
+                public Object convert(Schema schema, Object object) {
+                  TimestampData timestampData = (TimestampData) object;
+                  // the local date-time is read as if it were in UTC
+                  java.time.Instant instant =
+                      timestampData.toLocalDateTime().toInstant(ZoneOffset.UTC);
+                  if (tzPrecision <= 3) {
+                    return instant.toEpochMilli();
+                  } else if (tzPrecision <= 6) {
+                    return instant.getEpochSecond() * 1_000_000L + instant.getNano() / 1000;
+                  } else {
+                    // Iceberg: Added support for nanoseconds precision (FLINK-39251)
+                    return instant.getEpochSecond() * 1_000_000_000L + instant.getNano();
+                  }
+                }
+              };
+        }
+        break;
+        // Iceberg: Added support for nanoseconds precision (FLINK-39251)
+      case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+        // Same precision-to-unit rule as above; only supported without the legacy mapping.
+        final int ltzPrecision;
+        if (type instanceof org.apache.flink.table.types.logical.LocalZonedTimestampType) {
+          ltzPrecision =
+              ((org.apache.flink.table.types.logical.LocalZonedTimestampType) type).getPrecision();
+        } else {
+          ltzPrecision = 3;
+        }
+        if (legacyTimestampMapping) {
+          throw new UnsupportedOperationException("Unsupported type: " + type);
+        } else {
+          converter =
+              new RowDataToAvroConverter() {
+                private static final long serialVersionUID = 1L;
+
+                @Override
+                public Object convert(Schema schema, Object object) {
+                  TimestampData timestampData = (TimestampData) object;
+                  if (ltzPrecision <= 3) {
+                    return timestampData.getMillisecond();
+                  } else if (ltzPrecision <= 6) {
+                    return timestampData.getMillisecond() * 1000L
+                        + timestampData.getNanoOfMillisecond() / 1000;
+                  } else {
+                    // Iceberg: Added support for nanoseconds precision (FLINK-39251)
+                    return timestampData.getMillisecond() * 1_000_000L
+                        + timestampData.getNanoOfMillisecond();
+                  }
+                }
+              };
+        }
+        break;
+      case DECIMAL:
+        converter =
+            new RowDataToAvroConverter() {
+              private static final long serialVersionUID = 1L;
+
+              @Override
+              public Object convert(Schema schema, Object object) {
+                return ByteBuffer.wrap(((DecimalData) object).toUnscaledBytes());
+              }
+            };
+        break;
+      case ARRAY:
+        converter = createArrayConverter((ArrayType) type, legacyTimestampMapping);
+        break;
+      case ROW:
+        converter = createRowConverter((RowType) type, legacyTimestampMapping);
+        break;
+      case MAP:
+      case MULTISET:
+        converter = createMapConverter(type, legacyTimestampMapping);
+        break;
+      case RAW:
+      default:
+        throw new UnsupportedOperationException("Unsupported type: " + type);
+    }
+
+    // wrap into nullable converter
+    return new RowDataToAvroConverter() {
+      private static final long serialVersionUID = 1L;
+
+      @Override
+      public Object convert(Schema schema, Object object) {
+        if (object == null) {
+          return null;
+        }
+
+        // get actual schema if it is a nullable schema
+        Schema actualSchema;
+        if (schema.getType() == Schema.Type.UNION) {
+          List types = schema.getTypes();
+          int size = types.size();
+          // only [T, null] and [null, T] unions are accepted
+          if (size == 2 && types.get(1).getType() == Schema.Type.NULL) {
+            actualSchema = types.get(0);
+          } else if (size == 2 && types.get(0).getType() == Schema.Type.NULL) {
+            actualSchema = types.get(1);
+          } else {
+            throw new IllegalArgumentException(
+                "The Avro schema is not a nullable type: " + schema.toString());
+          }
+        } else {
+          actualSchema = schema;
+        }
+        return converter.convert(actualSchema, object);
+      }
+    };
+  }
+
+  /**
+   * Creates a converter that writes each field of a {@link RowData} into a {@link
+   * GenericData.Record} at the same position, using the field schema found at that position.
+   * Any failure while converting a field is rethrown as a RuntimeException naming the field.
+   */
+  private static RowDataToAvroConverter createRowConverter(
+      RowType rowType, boolean legacyTimestampMapping) {
+    final RowDataToAvroConverter[] fieldConverters =
+        rowType.getChildren().stream()
+            .map(legacyType -> createConverter(legacyType, legacyTimestampMapping))
+            .toArray(RowDataToAvroConverter[]::new);
+    final LogicalType[] fieldTypes =
+        rowType.getFields().stream().map(RowType.RowField::getType).toArray(LogicalType[]::new);
+    final RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[fieldTypes.length];
+    for (int i = 0; i < fieldTypes.length; i++) {
+      fieldGetters[i] = RowData.createFieldGetter(fieldTypes[i], i);
+    }
+    final int length = rowType.getFieldCount();
+
+    return new RowDataToAvroConverter() {
+      private static final long serialVersionUID = 1L;
+
+      @Override
+      public Object convert(Schema schema, Object object) {
+        final RowData row = (RowData) object;
+        final List fields = schema.getFields();
+        final GenericRecord record = new GenericData.Record(schema);
+        for (int i = 0; i < length; ++i) {
+          final Schema.Field schemaField = fields.get(i);
+          try {
+            Object avroObject =
+                fieldConverters[i].convert(
+                    schemaField.schema(), fieldGetters[i].getFieldOrNull(row));
+            record.put(i, avroObject);
+          } catch (Throwable t) {
+            throw new RuntimeException(
+                String.format("Fail to serialize at field: %s.", schemaField.name()), t);
+          }
+        }
+        return record;
+      }
+    };
+  }
+
+  /**
+   * Creates a converter that turns an {@link ArrayData} into a list, converting each element
+   * against the element type of the array schema.
+   */
+  private static RowDataToAvroConverter createArrayConverter(
+      ArrayType arrayType, boolean legacyTimestampMapping) {
+    LogicalType elementType = arrayType.getElementType();
+    final ArrayData.ElementGetter elementGetter = ArrayData.createElementGetter(elementType);
+    final RowDataToAvroConverter elementConverter =
+        createConverter(arrayType.getElementType(), legacyTimestampMapping);
+
+    return new RowDataToAvroConverter() {
+      private static final long serialVersionUID = 1L;
+
+      @Override
+      public Object convert(Schema schema, Object object) {
+        final Schema elementSchema = schema.getElementType();
+        ArrayData arrayData = (ArrayData) object;
+        List list = Lists.newArrayList();
+        for (int i = 0; i < arrayData.size(); ++i) {
+          list.add(
+              elementConverter.convert(
+                  elementSchema, elementGetter.getElementOrNull(arrayData, i)));
+        }
+        return list;
+      }
+    };
+  }
+
+  /**
+   * Creates a converter that turns a {@link MapData} into a map keyed by the string form of each
+   * key, converting each value against the value type of the map schema.
+   */
+  private static RowDataToAvroConverter createMapConverter(
+      LogicalType type, boolean legacyTimestampMapping) {
+    LogicalType valueType = AvroSchemaConverter.extractValueTypeToAvroMap(type);
+    final ArrayData.ElementGetter valueGetter = ArrayData.createElementGetter(valueType);
+    final RowDataToAvroConverter valueConverter =
+        createConverter(valueType, legacyTimestampMapping);
+
+    return new RowDataToAvroConverter() {
+      private static final long serialVersionUID = 1L;
+
+      @Override
+      public Object convert(Schema schema, Object object) {
+        final Schema valueSchema = schema.getValueType();
+        final MapData mapData = (MapData) object;
+        final ArrayData keyArray = mapData.keyArray();
+        final ArrayData valueArray = mapData.valueArray();
+        final Map map = CollectionUtil.newHashMapWithExpectedSize(mapData.size());
+        for (int i = 0; i < mapData.size(); ++i) {
+          // keys are read as strings
+          final String key = keyArray.getString(i).toString();
+          final Object value =
+              valueConverter.convert(valueSchema, valueGetter.getElementOrNull(valueArray, i));
+          map.put(key, value);
+        }
+        return map;
+      }
+    };
+  }
+}
diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java
new file mode 100644
index 000000000000..347631c7f451
--- /dev/null
+++
b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java @@ -0,0 +1,625 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro.typeutils; + +import java.util.List; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.apache.avro.SchemaParseException; +import org.apache.avro.specific.SpecificData; +import org.apache.avro.specific.SpecificRecord; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.formats.avro.AvroRowDataDeserializationSchema; +import org.apache.flink.formats.avro.AvroRowDataSerializationSchema; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.types.AtomicDataType; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.IntType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import 
org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeFamily; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.MultisetType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimeType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.table.types.logical.TypeInformationRawType; +import org.apache.flink.types.Row; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * Converts an Avro schema into Flink's type information. It uses {@link RowTypeInfo} for + * representing objects and converts Avro types into types that are compatible with Flink's Table + * & SQL API. + * + *

Note: Changes in this class need to be kept in sync with the corresponding runtime classes + * {@link AvroRowDataDeserializationSchema} and {@link AvroRowDataSerializationSchema}. + * + *

This class is adapted in Iceberg to support custom 'timestamp-nanos' and + * 'local-timestamp-nanos' logical types (FLINK-39251). Once that ticket is resolved in Flink, these + * custom types may be removed. + */ +public class AvroSchemaConverter { + + private AvroSchemaConverter() { + // private + } + + /** + * Converts an Avro class into a nested row structure with deterministic field order and data + * types that are compatible with Flink's Table & SQL API. + * + * @param avroClass Avro specific record that contains schema information + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo( + Class avroClass) { + return convertToTypeInfo(avroClass, true); + } + + /** + * Converts an Avro class into a nested row structure with deterministic field order and data + * types that are compatible with Flink's Table & SQL API. + * + * @param avroClass Avro specific record that contains schema information + * @param legacyTimestampMapping whether to use the legacy mapping of timestamp types + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo( + Class avroClass, boolean legacyTimestampMapping) { + Preconditions.checkNotNull(avroClass, "Avro specific record class must not be null."); + // determine schema to retrieve deterministic field order + final Schema schema = SpecificData.get().getSchema(avroClass); + return (TypeInformation) convertToTypeInfo(schema, legacyTimestampMapping); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. 
+ * + * @param avroSchemaString Avro schema definition string + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo(String avroSchemaString) { + return convertToTypeInfo(avroSchemaString, true); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. + * + * @param avroSchemaString Avro schema definition string + * @param legacyTimestampMapping legacy mapping of timestamp types + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo( + String avroSchemaString, boolean legacyTimestampMapping) { + Preconditions.checkNotNull(avroSchemaString, "Avro schema must not be null."); + final Schema schema; + try { + schema = new Schema.Parser().parse(avroSchemaString); + } catch (SchemaParseException e) { + throw new IllegalArgumentException("Could not parse Avro schema string.", e); + } + return (TypeInformation) convertToTypeInfo(schema, legacyTimestampMapping); + } + + private static TypeInformation convertToTypeInfo( + Schema schema, boolean legacyTimestampMapping) { + switch (schema.getType()) { + case RECORD: + final List fields = schema.getFields(); + + final TypeInformation[] types = new TypeInformation[fields.size()]; + final String[] names = new String[fields.size()]; + for (int i = 0; i < fields.size(); i++) { + final Schema.Field field = fields.get(i); + types[i] = convertToTypeInfo(field.schema(), legacyTimestampMapping); + names[i] = field.name(); + } + return Types.ROW_NAMED(names, types); + case ENUM: + return Types.STRING; + case ARRAY: + // result type might either be ObjectArrayTypeInfo or BasicArrayTypeInfo for Strings + return Types.OBJECT_ARRAY( + convertToTypeInfo(schema.getElementType(), legacyTimestampMapping)); + case MAP: + return Types.MAP( + Types.STRING, 
convertToTypeInfo(schema.getValueType(), legacyTimestampMapping)); + case UNION: + final Schema actualSchema; + if (schema.getTypes().size() == 2 + && schema.getTypes().get(0).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(1); + } else if (schema.getTypes().size() == 2 + && schema.getTypes().get(1).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(0); + } else if (schema.getTypes().size() == 1) { + actualSchema = schema.getTypes().get(0); + } else { + // use Kryo for serialization + return Types.GENERIC(Object.class); + } + return convertToTypeInfo(actualSchema, legacyTimestampMapping); + case FIXED: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + return Types.BIG_DEC; + } + // convert fixed size binary data to primitive byte arrays + return Types.PRIMITIVE_ARRAY(Types.BYTE); + case STRING: + // convert Avro's Utf8/CharSequence to String + return Types.STRING; + case BYTES: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + return Types.BIG_DEC; + } + return Types.PRIMITIVE_ARRAY(Types.BYTE); + case INT: + // logical date and time type + final org.apache.avro.LogicalType logicalType = schema.getLogicalType(); + if (logicalType == LogicalTypes.date()) { + return Types.SQL_DATE; + } else if (logicalType == LogicalTypes.timeMillis()) { + return Types.SQL_TIME; + } + return Types.INT; + case LONG: + if (legacyTimestampMapping) { + if (schema.getLogicalType() == LogicalTypes.timestampMillis() + || schema.getLogicalType() == LogicalTypes.timestampMicros() + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + || (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos"))) { + return Types.SQL_TIMESTAMP; + } else if (schema.getLogicalType() == LogicalTypes.timeMicros() + || schema.getLogicalType() == LogicalTypes.timeMillis()) { + return Types.SQL_TIME; + } + } else { + // Avro 
logical timestamp types to Flink DataStream timestamp types + if (schema.getLogicalType() == LogicalTypes.timestampMillis() + || schema.getLogicalType() == LogicalTypes.timestampMicros() + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + || (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos"))) { + return Types.INSTANT; + } else if (schema.getLogicalType() == LogicalTypes.localTimestampMillis() + || schema.getLogicalType() == LogicalTypes.localTimestampMicros() + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + || (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("local-timestamp-nanos"))) { + return Types.LOCAL_DATE_TIME; + } else if (schema.getLogicalType() == LogicalTypes.timeMicros() + || schema.getLogicalType() == LogicalTypes.timeMillis()) { + return Types.SQL_TIME; + } + } + return Types.LONG; + case FLOAT: + return Types.FLOAT; + case DOUBLE: + return Types.DOUBLE; + case BOOLEAN: + return Types.BOOLEAN; + case NULL: + return Types.VOID; + } + throw new IllegalArgumentException("Unsupported Avro type '" + schema.getType() + "'."); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. + * + * @param avroSchemaString Avro schema definition string + * @return data type matching the schema + */ + public static DataType convertToDataType(String avroSchemaString) { + return convertToDataType(avroSchemaString, true); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. 
+ * + * @param avroSchemaString Avro schema definition string + * @param legacyTimestampMapping legacy mapping of local timestamps + * @return data type matching the schema + */ + public static DataType convertToDataType( + String avroSchemaString, boolean legacyTimestampMapping) { + Preconditions.checkNotNull(avroSchemaString, "Avro schema must not be null."); + final Schema schema; + try { + schema = new Schema.Parser().parse(avroSchemaString); + } catch (SchemaParseException e) { + throw new IllegalArgumentException("Could not parse Avro schema string.", e); + } + return convertToDataType(schema, legacyTimestampMapping); + } + + @SuppressWarnings("deprecation") + private static DataType convertToDataType(Schema schema, boolean legacyMapping) { + switch (schema.getType()) { + case RECORD: + final List schemaFields = schema.getFields(); + + final DataTypes.Field[] fields = new DataTypes.Field[schemaFields.size()]; + for (int i = 0; i < schemaFields.size(); i++) { + final Schema.Field field = schemaFields.get(i); + fields[i] = + DataTypes.FIELD(field.name(), convertToDataType(field.schema(), legacyMapping)); + } + return DataTypes.ROW(fields).notNull(); + case ENUM: + return DataTypes.STRING().notNull(); + case ARRAY: + return DataTypes.ARRAY(convertToDataType(schema.getElementType(), legacyMapping)).notNull(); + case MAP: + return DataTypes.MAP( + DataTypes.STRING().notNull(), + convertToDataType(schema.getValueType(), legacyMapping)) + .notNull(); + case UNION: + final Schema actualSchema; + final boolean nullable; + if (schema.getTypes().size() == 2 + && schema.getTypes().get(0).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(1); + nullable = true; + } else if (schema.getTypes().size() == 2 + && schema.getTypes().get(1).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(0); + nullable = true; + } else if (schema.getTypes().size() == 1) { + actualSchema = schema.getTypes().get(0); + nullable = false; + } else { + // 
use Kryo for serialization + return new AtomicDataType( + new TypeInformationRawType<>(false, Types.GENERIC(Object.class))); + } + DataType converted = convertToDataType(actualSchema, legacyMapping); + return nullable ? converted.nullable() : converted; + case FIXED: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + final LogicalTypes.Decimal decimalType = (LogicalTypes.Decimal) schema.getLogicalType(); + return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale()).notNull(); + } + // convert fixed size binary data to primitive byte arrays + return DataTypes.VARBINARY(schema.getFixedSize()).notNull(); + case STRING: + // convert Avro's Utf8/CharSequence to String + return DataTypes.STRING().notNull(); + case BYTES: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + final LogicalTypes.Decimal decimalType = (LogicalTypes.Decimal) schema.getLogicalType(); + return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale()).notNull(); + } + return DataTypes.BYTES().notNull(); + case INT: + // logical date and time type + final org.apache.avro.LogicalType logicalType = schema.getLogicalType(); + if (logicalType == LogicalTypes.date()) { + return DataTypes.DATE().notNull(); + } else if (logicalType == LogicalTypes.timeMillis()) { + return DataTypes.TIME(3).notNull(); + } + return DataTypes.INT().notNull(); + case LONG: + if (legacyMapping) { + // Avro logical timestamp types to Flink SQL timestamp types + if (schema.getLogicalType() == LogicalTypes.timestampMillis()) { + return DataTypes.TIMESTAMP(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timestampMicros()) { + return DataTypes.TIMESTAMP(6).notNull(); + } else if (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos")) { + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + return DataTypes.TIMESTAMP(9).notNull(); + } 
else if (schema.getLogicalType() == LogicalTypes.timeMillis()) { + return DataTypes.TIME(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timeMicros()) { + return DataTypes.TIME(6).notNull(); + } + } else { + // Avro logical timestamp types to Flink SQL timestamp types + if (schema.getLogicalType() == LogicalTypes.timestampMillis()) { + return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timestampMicros()) { + return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(6).notNull(); + } else if (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos")) { + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(9).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timeMillis()) { + return DataTypes.TIME(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timeMicros()) { + return DataTypes.TIME(6).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.localTimestampMillis()) { + return DataTypes.TIMESTAMP(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.localTimestampMicros()) { + return DataTypes.TIMESTAMP(6).notNull(); + } else if (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("local-timestamp-nanos")) { + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + return DataTypes.TIMESTAMP(9).notNull(); + } + } + + return DataTypes.BIGINT().notNull(); + case FLOAT: + return DataTypes.FLOAT().notNull(); + case DOUBLE: + return DataTypes.DOUBLE().notNull(); + case BOOLEAN: + return DataTypes.BOOLEAN().notNull(); + case NULL: + return DataTypes.NULL(); + } + throw new IllegalArgumentException("Unsupported Avro type '" + schema.getType() + "'."); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *

Use "org.apache.flink.avro.generated.record" as the type name. + * + * @param schema the schema type, usually it should be the top level record type, i.e. not a + * nested type + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema(LogicalType schema) { + return convertToSchema(schema, true); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *

Use "org.apache.flink.avro.generated.record" as the type name. + * + * @param schema the schema type, usually it should be the top level record type, i.e. not a + * nested type + * @param legacyTimestampMapping whether to use the legacy timestamp mapping + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema(LogicalType schema, boolean legacyTimestampMapping) { + return convertToSchema( + schema, "org.apache.flink.avro.generated.record", legacyTimestampMapping); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *

The "{rowName}_" is used as the nested row type name prefix in order to generate the right + * schema. Nested record types that differ only in type name are still compatible. + * + * @param logicalType logical type + * @param rowName the record name + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema(LogicalType logicalType, String rowName) { + return convertToSchema(logicalType, rowName, true); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *

The "{rowName}_" is used as the nested row type name prefix in order to generate the right + * schema. Nested record types that differ only in type name are still compatible. + * + * @param logicalType logical type + * @param rowName the record name + * @param legacyTimestampMapping whether to use legacy timestamp mapping, also for nested types + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema( + LogicalType logicalType, String rowName, boolean legacyTimestampMapping) { + int precision; + boolean nullable = logicalType.isNullable(); + switch (logicalType.getTypeRoot()) { + case NULL: + return SchemaBuilder.builder().nullType(); + case BOOLEAN: + Schema bool = SchemaBuilder.builder().booleanType(); + return nullable ? nullableSchema(bool) : bool; + case TINYINT: + case SMALLINT: + case INTEGER: + Schema integer = SchemaBuilder.builder().intType(); + return nullable ? nullableSchema(integer) : integer; + case BIGINT: + Schema bigint = SchemaBuilder.builder().longType(); + return nullable ? nullableSchema(bigint) : bigint; + case FLOAT: + Schema floatSchema = SchemaBuilder.builder().floatType(); + return nullable ? nullableSchema(floatSchema) : floatSchema; + case DOUBLE: + Schema doubleSchema = SchemaBuilder.builder().doubleType(); + return nullable ? nullableSchema(doubleSchema) : doubleSchema; + case CHAR: + case VARCHAR: + Schema str = SchemaBuilder.builder().stringType(); + return nullable ? nullableSchema(str) : str; + case BINARY: + case VARBINARY: + Schema binary = SchemaBuilder.builder().bytesType(); + return nullable ? 
nullableSchema(binary) : binary; + case TIMESTAMP_WITHOUT_TIME_ZONE: + // use long to represents Timestamp + final TimestampType timestampType = (TimestampType) logicalType; + precision = timestampType.getPrecision(); + org.apache.avro.LogicalType avroLogicalType; + if (legacyTimestampMapping) { + if (precision <= 3) { + avroLogicalType = LogicalTypes.timestampMillis(); + } else { + throw new IllegalArgumentException( + "Avro does not support TIMESTAMP type " + + "with precision: " + + precision + + ", it only supports precision less than 3."); + } + } else { + if (precision <= 3) { + avroLogicalType = LogicalTypes.localTimestampMillis(); + } else if (precision <= 6) { + avroLogicalType = LogicalTypes.localTimestampMicros(); + } else { + throw new IllegalArgumentException( + "Avro does not support LOCAL TIMESTAMP type " + + "with precision: " + + precision + + ", it only supports precision less than 6."); + } + } + Schema timestamp = avroLogicalType.addToSchema(SchemaBuilder.builder().longType()); + return nullable ? nullableSchema(timestamp) : timestamp; + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + if (legacyTimestampMapping) { + throw new UnsupportedOperationException( + "Unsupported to derive Schema for type: " + logicalType); + } else { + final LocalZonedTimestampType localZonedTimestampType = + (LocalZonedTimestampType) logicalType; + precision = localZonedTimestampType.getPrecision(); + if (precision <= 3) { + avroLogicalType = LogicalTypes.timestampMillis(); + } else if (precision <= 6) { + avroLogicalType = LogicalTypes.timestampMicros(); + } else { + throw new IllegalArgumentException( + "Avro does not support TIMESTAMP type " + + "with precision: " + + precision + + ", it only supports precision less than 6."); + } + timestamp = avroLogicalType.addToSchema(SchemaBuilder.builder().longType()); + return nullable ? 
nullableSchema(timestamp) : timestamp; + } + case DATE: + // use int to represents Date + Schema date = LogicalTypes.date().addToSchema(SchemaBuilder.builder().intType()); + return nullable ? nullableSchema(date) : date; + case TIME_WITHOUT_TIME_ZONE: + precision = ((TimeType) logicalType).getPrecision(); + if (precision > 3) { + throw new IllegalArgumentException( + "Avro does not support TIME type with precision: " + + precision + + ", it only supports precision less than 3."); + } + // use int to represents Time, we only support millisecond when deserialization + Schema time = LogicalTypes.timeMillis().addToSchema(SchemaBuilder.builder().intType()); + return nullable ? nullableSchema(time) : time; + case DECIMAL: + DecimalType decimalType = (DecimalType) logicalType; + // store BigDecimal as byte[] + Schema decimal = + LogicalTypes.decimal(decimalType.getPrecision(), decimalType.getScale()) + .addToSchema(SchemaBuilder.builder().bytesType()); + return nullable ? nullableSchema(decimal) : decimal; + case ROW: + RowType rowType = (RowType) logicalType; + List fieldNames = rowType.getFieldNames(); + // we have to make sure the record name is different in a Schema + SchemaBuilder.FieldAssembler builder = + SchemaBuilder.builder().record(rowName).fields(); + for (int i = 0; i < rowType.getFieldCount(); i++) { + String fieldName = fieldNames.get(i); + LogicalType fieldType = rowType.getTypeAt(i); + SchemaBuilder.GenericDefault fieldBuilder = + builder + .name(fieldName) + .type( + convertToSchema( + fieldType, rowName + "_" + fieldName, legacyTimestampMapping)); + + if (fieldType.isNullable()) { + builder = fieldBuilder.withDefault(null); + } else { + builder = fieldBuilder.noDefault(); + } + } + Schema record = builder.endRecord(); + return nullable ? 
nullableSchema(record) : record; + case MULTISET: + case MAP: + Schema map = + SchemaBuilder.builder() + .map() + .values(convertToSchema(extractValueTypeToAvroMap(logicalType), rowName, legacyTimestampMapping)); + return nullable ? 
nullableSchema(map) : map; + case ARRAY: + ArrayType arrayType = (ArrayType) logicalType; + Schema array = + SchemaBuilder.builder() + .array() + .items(convertToSchema(arrayType.getElementType(), rowName, legacyTimestampMapping)); + return nullable ? nullableSchema(array) : array; + case RAW: + default: + throw new UnsupportedOperationException( + "Unsupported to derive Schema for type: " + logicalType); + } + } + + public static LogicalType extractValueTypeToAvroMap(LogicalType type) { + LogicalType keyType; + LogicalType valueType; + if (type instanceof MapType) { + MapType mapType = (MapType) type; + keyType = mapType.getKeyType(); + valueType = mapType.getValueType(); + } else { + MultisetType multisetType = (MultisetType) type; + keyType = multisetType.getElementType(); + valueType = new IntType(); + } + if (!keyType.is(LogicalTypeFamily.CHARACTER_STRING)) { + throw new UnsupportedOperationException( + "Avro format doesn't support non-string as key type of map. " + + "The key type is: " + + keyType.asSummaryString()); + } + return valueType; + } + + /** Returns schema with nullable true. */ + private static Schema nullableSchema(Schema schema) { + return schema.isNullable() + ? 
schema + : Schema.createUnion(SchemaBuilder.builder().nullType(), schema); + } +} diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java index f68605accc57..30e95b1edba0 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java @@ -260,10 +260,6 @@ public void unlock() { this, instanceId, count); - } catch (SQLException e) { - // SQL exception happened when deleting lock information - throw new UncheckedSQLException( - e, "Failed to delete %s lock with instanceId %s", this, instanceId); } return null; @@ -298,9 +294,6 @@ private String instanceId() { return null; } } - } catch (SQLException e) { - // SQL exception happened when getting lock information - throw new UncheckedSQLException(e, "Failed to get lock information for %s", type); } }); } catch (InterruptedException e) { diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java index 9aeee75b1464..f03f33a3fd81 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java @@ -24,6 +24,7 @@ import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.util.function.SerializableSupplier; import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.actions.BinPackRewriteFilePlanner; import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; @@ -59,7 +60,7 @@ public 
static class Builder extends MaintenanceTaskBuilder rewriteOptions = Maps.newHashMapWithExpectedSize(6); private long maxRewriteBytes = Long.MAX_VALUE; - private Expression filter = Expressions.alwaysTrue(); + private SerializableSupplier filterSupplier = Expressions::alwaysTrue; private String branch = SnapshotRef.MAIN_BRANCH; @Override @@ -214,9 +215,32 @@ public Builder maxFilesToRewrite(int maxFilesToRewrite) { * * @param newFilter the filter expression to apply * @return this for method chaining + * @deprecated will be removed in 1.12.0. Use {@link #filter(SerializableSupplier)} instead */ + @Deprecated public Builder filter(Expression newFilter) { - this.filter = newFilter; + this.filterSupplier = () -> newFilter; + return this; + } + + /** + * A user-provided supplier of a filter expression that determines which files are considered by + * the rewrite strategy. + * + *

The supplier is evaluated by the planner on every compaction trigger, allowing a fresh + * filter to be produced for each compaction run. + * + *

This is particularly useful for time-relative filters. For example, a supplier such as + * {@code () -> Expressions.greaterThanOrEqual("ts", + * LocalDateTime.now(ZoneOffset.UTC).minus(Duration.ofDays(3)).toString())} ensures that each + * compaction rewrites files from the last 3 days relative to the time the compaction is + * planned, rather than relative to when the job was started. + * + * @param newFilterSupplier the supplier providing the filter expression to apply + * @return this for method chaining + */ + public Builder filter(SerializableSupplier newFilterSupplier) { + this.filterSupplier = newFilterSupplier; return this; } @@ -276,7 +300,7 @@ DataStream append(DataStream trigger) { partialProgressEnabled ? partialProgressMaxCommits : 1, maxRewriteBytes, rewriteOptions, - filter, + filterSupplier, branch)) .name(operatorName(PLANNER_TASK_NAME)) .uid(PLANNER_TASK_NAME + uidSuffix()) diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java index feb2dd26c807..9c3b44b9d544 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java @@ -26,6 +26,7 @@ import org.apache.flink.metrics.Counter; import org.apache.flink.streaming.api.functions.ProcessFunction; import org.apache.flink.util.Collector; +import org.apache.flink.util.function.SerializableSupplier; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.SerializableTable; @@ -62,8 +63,8 @@ public class DataFileRewritePlanner private final long maxRewriteBytes; private final Map rewriterOptions; private transient Counter errorCounter; - private final Expression filter; private final String branch; + private final 
SerializableSupplier filterSupplier; public DataFileRewritePlanner( String tableName, @@ -73,7 +74,7 @@ public DataFileRewritePlanner( int newPartialProgressMaxCommits, long maxRewriteBytes, Map rewriterOptions, - Expression filter, + SerializableSupplier filterSupplier, String branch) { Preconditions.checkNotNull(tableName, "Table name should no be null"); @@ -89,8 +90,8 @@ public DataFileRewritePlanner( this.partialProgressMaxCommits = newPartialProgressMaxCommits; this.maxRewriteBytes = maxRewriteBytes; this.rewriterOptions = rewriterOptions; - this.filter = filter; this.branch = branch; + this.filterSupplier = filterSupplier; } @Override @@ -125,7 +126,7 @@ public void processElement(Trigger value, Context ctx, Collector o } BinPackRewriteFilePlanner planner = - new BinPackRewriteFilePlanner(table, filter, snapshot.snapshotId(), false); + new BinPackRewriteFilePlanner(table, filterSupplier.get(), snapshot.snapshotId(), false); planner.init(rewriterOptions); FileRewritePlan diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java index f7e8e0c884cf..5f3494330cfc 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java @@ -21,14 +21,14 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.api.common.functions.MapFunction; -import org.apache.flink.formats.avro.AvroToRowDataConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.RowType; import 
org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.flink.formats.avro.AvroToRowDataConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; /** * This util class converts Avro GenericRecord to Flink RowData.
diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java index 434f3969577f..6cf15ff713fb 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java @@ -18,23 +18,33 @@ */ package org.apache.iceberg.flink.sink; -import com.codahale.metrics.SlidingWindowReservoir; import java.util.Arrays; import java.util.concurrent.atomic.AtomicLong; import org.apache.flink.annotation.Internal; -import org.apache.flink.dropwizard.metrics.DropwizardHistogramWrapper; +import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.metrics.Counter; import org.apache.flink.metrics.Histogram; import org.apache.flink.metrics.MetricGroup; +import org.apache.iceberg.common.DynClasses; +import org.apache.iceberg.common.DynConstructors; import org.apache.iceberg.io.WriteResult; import org.apache.iceberg.util.ScanTaskUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; @Internal public class IcebergStreamWriterMetrics { + + private static final Logger LOG = LoggerFactory.getLogger(IcebergStreamWriterMetrics.class); + // 1,024 reservoir size should cost about 8KB, which is quite small. // It should also produce good accuracy for histogram distribution (like percentiles). private static final int HISTOGRAM_RESERVOIR_SIZE = 1024; + // Histogram metrics loaded through Flink's optional flink-metrics-dropwizard dependency. + // Will be null if not available. 
+ private static final DropwizardCtors DROPWIZARD = loadDropwizardCtors(); + private final Counter flushedDataFiles; private final Counter flushedDeleteFiles; private final Counter flushedReferencedDataFiles; @@ -51,18 +61,8 @@ public IcebergStreamWriterMetrics(MetricGroup metrics, String fullTableName) { this.lastFlushDurationMs = new AtomicLong(); writerMetrics.gauge("lastFlushDurationMs", lastFlushDurationMs::get); - com.codahale.metrics.Histogram dropwizardDataFilesSizeHistogram = - new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); - this.dataFilesSizeHistogram = - writerMetrics.histogram( - "dataFilesSizeHistogram", - new DropwizardHistogramWrapper(dropwizardDataFilesSizeHistogram)); - com.codahale.metrics.Histogram dropwizardDeleteFilesSizeHistogram = - new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); - this.deleteFilesSizeHistogram = - writerMetrics.histogram( - "deleteFilesSizeHistogram", - new DropwizardHistogramWrapper(dropwizardDeleteFilesSizeHistogram)); + this.dataFilesSizeHistogram = registerHistogram(writerMetrics, "dataFilesSizeHistogram"); + this.deleteFilesSizeHistogram = registerHistogram(writerMetrics, "deleteFilesSizeHistogram"); } public void updateFlushResult(WriteResult result) { @@ -74,16 +74,21 @@ public void updateFlushResult(WriteResult result) { // This should works equally well and we avoided the overhead of tracking the list of file sizes // in the {@link CommitSummary}, which currently stores simple stats for counters and gauges // metrics. 
- Arrays.stream(result.dataFiles()) - .forEach( - dataFile -> { - dataFilesSizeHistogram.update(dataFile.fileSizeInBytes()); - }); - Arrays.stream(result.deleteFiles()) - .forEach( - deleteFile -> { - deleteFilesSizeHistogram.update(ScanTaskUtil.contentSizeInBytes(deleteFile)); - }); + if (dataFilesSizeHistogram != null) { + Arrays.stream(result.dataFiles()) + .forEach( + dataFile -> { + dataFilesSizeHistogram.update(dataFile.fileSizeInBytes()); + }); + } + + if (deleteFilesSizeHistogram != null) { + Arrays.stream(result.deleteFiles()) + .forEach( + deleteFile -> { + deleteFilesSizeHistogram.update(ScanTaskUtil.contentSizeInBytes(deleteFile)); + }); + } } public void flushDuration(long flushDurationMs) { @@ -97,4 +102,60 @@ public Counter getFlushedDataFiles() { public Counter getFlushedDeleteFiles() { return flushedDeleteFiles; } + + @VisibleForTesting + Histogram dataFilesSizeHistogram() { + return dataFilesSizeHistogram; + } + + @VisibleForTesting + Histogram deleteFilesSizeHistogram() { + return deleteFilesSizeHistogram; + } + + private static Histogram registerHistogram(MetricGroup group, String name) { + Histogram histogram = newDropwizardHistogram(); + return histogram != null ? 
group.histogram(name, histogram) : null; + } + + private static Histogram newDropwizardHistogram() { + if (DROPWIZARD == null) { + return null; + } + + Object reservoir = DROPWIZARD.reservoirCtor.newInstance(HISTOGRAM_RESERVOIR_SIZE); + Object codahaleHistogram = DROPWIZARD.histogramCtor.newInstance(reservoir); + return DROPWIZARD.wrapperCtor.newInstance(codahaleHistogram); + } + + private static DropwizardCtors loadDropwizardCtors() { + try { + Class reservoirInterface = + DynClasses.builder().impl("com.codahale.metrics.Reservoir").buildChecked(); + Class codahaleHistogramClass = + DynClasses.builder().impl("com.codahale.metrics.Histogram").buildChecked(); + return new DropwizardCtors( + DynConstructors.builder() + .impl("com.codahale.metrics.SlidingWindowReservoir", int.class) + .buildChecked(), + DynConstructors.builder() + .impl("com.codahale.metrics.Histogram", reservoirInterface) + .buildChecked(), + DynConstructors.builder(Histogram.class) + .impl( + "org.apache.flink.dropwizard.metrics.DropwizardHistogramWrapper", + codahaleHistogramClass) + .buildChecked()); + } catch (ClassNotFoundException | NoSuchMethodException e) { + LOG.warn( + "Cannot load Dropwizard metrics; is org.apache.flink:flink-metrics-dropwizard on the classpath?", + e); + return null; + } + } + + private record DropwizardCtors( + DynConstructors.Ctor reservoirCtor, + DynConstructors.Ctor histogramCtor, + DynConstructors.Ctor wrapperCtor) {} } diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java index 218fa2d911c8..e7cd2c16459f 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java @@ -44,6 +44,7 @@ import org.apache.flink.streaming.api.datastream.DataStream; import 
org.apache.flink.streaming.api.datastream.DataStreamSink; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.runtime.operators.sink.SinkWriterOperatorFactory; import org.apache.flink.table.data.RowData; import org.apache.flink.util.OutputTag; import org.apache.iceberg.Table; @@ -79,13 +80,17 @@ public class DynamicIcebergSink private final Configuration flinkConfig; private final int cacheMaximumSize; + // Set by the builder before sinkTo() — forward writer results to union into pre-commit topology + private final transient DataStream> forwardWriteResults; + DynamicIcebergSink( CatalogLoader catalogLoader, Map snapshotProperties, String uidPrefix, Map writeProperties, Configuration flinkConfig, - int cacheMaximumSize) { + int cacheMaximumSize, + DataStream> forwardWriteResults) { this.catalogLoader = catalogLoader; this.snapshotProperties = snapshotProperties; this.uidPrefix = uidPrefix; @@ -96,6 +101,7 @@ public class DynamicIcebergSink // This is used to separate files generated by different sinks writing the same table. // Also used to generate the aggregator operator name this.sinkId = UUID.randomUUID().toString(); + this.forwardWriteResults = forwardWriteResults; } @SuppressWarnings("deprecation") @@ -145,7 +151,11 @@ public DataStream> addPreCommitTopology( TypeInformation> typeInformation = CommittableMessageTypeInfo.of(this::getCommittableSerializer); - return writeResults + // Union forward writer results with the shuffle writer results + DataStream> allResults = + writeResults.union(forwardWriteResults); + + return allResults .keyBy( committable -> { if (committable instanceof CommittableSummary) { @@ -168,6 +178,56 @@ public SimpleVersionedSerializer getWriteResultSerializer() return new DynamicWriteResultSerializer(); } + /** + * A lightweight Sink used with {@link SinkWriterOperatorFactory} for the forward write path. 
+ * Implements {@link SupportsCommitter} so that {@code SinkWriterOperator} emits committables + * downstream. The committer is never called — committing is handled by the main sink. + */ + @VisibleForTesting + static class ForwardWriterSink + implements Sink, SupportsCommitter { + + private final CatalogLoader catalogLoader; + private final Map writeProperties; + private final Configuration flinkConfig; + private final int cacheMaximumSize; + + ForwardWriterSink( + CatalogLoader catalogLoader, + Map writeProperties, + Configuration flinkConfig, + int cacheMaximumSize) { + this.catalogLoader = catalogLoader; + this.writeProperties = writeProperties; + this.flinkConfig = flinkConfig; + this.cacheMaximumSize = cacheMaximumSize; + } + + @SuppressWarnings("deprecation") + @Override + public SinkWriter createWriter(InitContext context) throws IOException { + return new DynamicWriter( + catalogLoader.loadCatalog(), + writeProperties, + flinkConfig, + cacheMaximumSize, + new DynamicWriterMetrics(context.metricGroup()), + context.getSubtaskId(), + context.getAttemptNumber()); + } + + @Override + public Committer createCommitter(CommitterInitContext context) { + throw new UnsupportedOperationException( + "WriterSink is used only for writing; committing is handled by the main sink"); + } + + @Override + public SimpleVersionedSerializer getCommittableSerializer() { + return new DynamicWriteResultSerializer(); + } + } + public static class Builder { private DataStream input; private DynamicRecordGenerator generator; @@ -177,12 +237,6 @@ public static class Builder { private final Map snapshotSummary = Maps.newHashMap(); private ReadableConfig readableConfig = new Configuration(); private TableCreator tableCreator = TableCreator.DEFAULT; - private boolean immediateUpdate = false; - private boolean dropUnusedColumns = false; - private int cacheMaximumSize = 100; - private long cacheRefreshMs = 1_000; - private int inputSchemasPerTableCacheMaximumSize = 10; - private boolean 
caseSensitive = true; Builder() {} @@ -303,7 +357,9 @@ public Builder toBranch(String branch) { } public Builder immediateTableUpdate(boolean newImmediateUpdate) { - this.immediateUpdate = newImmediateUpdate; + writeOptions.put( + FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.key(), + Boolean.toString(newImmediateUpdate)); return this; } @@ -319,19 +375,21 @@ public Builder immediateTableUpdate(boolean newImmediateUpdate) { * will never return data of the old column. */ public Builder dropUnusedColumns(boolean newDropUnusedColumns) { - this.dropUnusedColumns = newDropUnusedColumns; + writeOptions.put( + FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key(), + Boolean.toString(newDropUnusedColumns)); return this; } /** Maximum size of the caches used in Dynamic Sink for table data and serializers. */ public Builder cacheMaxSize(int maxSize) { - this.cacheMaximumSize = maxSize; + writeOptions.put(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.key(), Integer.toString(maxSize)); return this; } /** Maximum interval for cache items renewals. */ public Builder cacheRefreshMs(long refreshMs) { - this.cacheRefreshMs = refreshMs; + writeOptions.put(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.key(), Long.toString(refreshMs)); return this; } @@ -341,7 +399,9 @@ public Builder cacheRefreshMs(long refreshMs) { * comparison results. */ public Builder inputSchemasPerTableCacheMaxSize(int inputSchemasPerTableCacheMaxSize) { - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaxSize; + writeOptions.put( + FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.key(), + Integer.toString(inputSchemasPerTableCacheMaxSize)); return this; } @@ -350,7 +410,8 @@ public Builder inputSchemasPerTableCacheMaxSize(int inputSchemasPerTableCache * field names case-sensitive. 
*/ public Builder caseSensitive(boolean newCaseSensitive) { - this.caseSensitive = newCaseSensitive; + writeOptions.put( + FlinkDynamicSinkOptions.CASE_SENSITIVE.key(), Boolean.toString(newCaseSensitive)); return this; } @@ -358,89 +419,134 @@ private String operatorName(String suffix) { return uidPrefix != null ? uidPrefix + "-" + suffix : suffix; } - private DynamicIcebergSink build() { + private DynamicIcebergSink build( + SingleOutputStreamOperator converted, + DynamicRecordInternalType sideOutputType) { Preconditions.checkArgument( generator != null, "Please use withGenerator() to convert the input DataStream."); Preconditions.checkNotNull(catalogLoader, "Catalog loader shouldn't be null"); - uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); + Configuration flinkConfig = fromReadableConfig(); + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeOptions, flinkConfig); - Configuration flinkConfig = - readableConfig instanceof Configuration - ? (Configuration) readableConfig - : Configuration.fromMap(readableConfig.toMap()); + // Forward writer: chained with generator via forward edge, no data shuffle + ForwardWriterSink forwardWriterSink = + new ForwardWriterSink( + catalogLoader, writeOptions, flinkConfig, flinkDynamicSinkConf.cacheMaxSize()); + TypeInformation> writeResultTypeInfo = + CommittableMessageTypeInfo.of(DynamicWriteResultSerializer::new); - return instantiateSink(writeOptions, flinkConfig); + DataStream> forwardWriteResults = + converted + .getSideOutput( + new OutputTag<>(DynamicRecordProcessor.DYNAMIC_FORWARD_STREAM, sideOutputType)) + .transform( + operatorName("Forward-Writer"), + writeResultTypeInfo, + new SinkWriterOperatorFactory<>(forwardWriterSink)) + .setParallelism(converted.getParallelism()) + .uid(prefixIfNotNull(uidPrefix, "-forward-writer")); + + // Inject forward write results into sink — they'll be unioned in addPreCommitTopology + return instantiateSink(writeOptions, flinkConfig, forwardWriteResults); 
} @VisibleForTesting DynamicIcebergSink instantiateSink( - Map writeProperties, Configuration flinkWriteConf) { + Map writeProperties, + Configuration flinkWriteConf, + DataStream> forwardWriteResults) { + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeProperties, flinkWriteConf); return new DynamicIcebergSink( catalogLoader, snapshotSummary, uidPrefix, writeProperties, flinkWriteConf, - cacheMaximumSize); + flinkDynamicSinkConf.cacheMaxSize(), + forwardWriteResults); } /** * Append the iceberg sink operators to write records to iceberg table. * + *

The topology splits records by distribution mode: + * + *

    + *
  • Forward records ({@code null} distributionMode) go through a forward edge to a chained + * writer, avoiding any data shuffle. + *
  • Shuffle records (non-null distributionMode) go through the standard Sink2 pipeline with + * hash/round-robin distribution. + *
+ * + * Both writers feed into a single shared pre-commit aggregator and committer, ensuring atomic + * commits across both paths. + * * @return {@link DataStreamSink} for sink. */ public DataStreamSink append() { + uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); + + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeOptions, readableConfig); + Configuration flinkConfig = fromReadableConfig(); + DynamicRecordInternalType type = - new DynamicRecordInternalType(catalogLoader, false, cacheMaximumSize); - DynamicIcebergSink sink = build(); + new DynamicRecordInternalType(catalogLoader, false, flinkDynamicSinkConf.cacheMaxSize()); + DynamicRecordInternalType sideOutputType = + new DynamicRecordInternalType(catalogLoader, true, flinkDynamicSinkConf.cacheMaxSize()); + SingleOutputStreamOperator converted = input .process( new DynamicRecordProcessor<>( generator, catalogLoader, - immediateUpdate, - cacheMaximumSize, - cacheRefreshMs, - inputSchemasPerTableCacheMaximumSize, tableCreator, - caseSensitive, - dropUnusedColumns)) + flinkDynamicSinkConf, + writeOptions, + flinkConfig)) + .setParallelism(input.getParallelism()) .uid(prefixIfNotNull(uidPrefix, "-generator")) .name(operatorName("generator")) .returns(type); - DataStreamSink rowDataDataStreamSink = + DynamicIcebergSink sink = build(converted, sideOutputType); + + // Shuffle path: table update side output + main output → sinkTo() + DataStream shuffleInput = converted .getSideOutput( new OutputTag<>( - DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, - new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize))) + DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, sideOutputType)) .keyBy((KeySelector) DynamicRecordInternal::tableName) .map( - new DynamicTableUpdateOperator( - catalogLoader, - cacheMaximumSize, - cacheRefreshMs, - inputSchemasPerTableCacheMaximumSize, - tableCreator, - caseSensitive, - dropUnusedColumns)) + new DynamicTableUpdateOperator(catalogLoader, 
tableCreator, flinkDynamicSinkConf)) .uid(prefixIfNotNull(uidPrefix, "-updater")) .name(operatorName("Updater")) .returns(type) - .union(converted) - .sinkTo(sink) + .union(converted); + + DataStreamSink result = + shuffleInput + .sinkTo(sink) // Forward write results are implicitly injected here .uid(prefixIfNotNull(uidPrefix, "-sink")); FlinkWriteConf flinkWriteConf = new FlinkWriteConf(writeOptions, readableConfig); if (flinkWriteConf.writeParallelism() != null) { - rowDataDataStreamSink.setParallelism(flinkWriteConf.writeParallelism()); + result.setParallelism(flinkWriteConf.writeParallelism()); } - return rowDataDataStreamSink; + return result; + } + + private Configuration fromReadableConfig() { + return readableConfig instanceof Configuration + ? (Configuration) readableConfig + : Configuration.fromMap(readableConfig.toMap()); } } diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java index 9f445766083e..6507a575c2af 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java @@ -20,6 +20,7 @@ import java.util.Set; import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; import org.apache.flink.table.data.RowData; import org.apache.iceberg.DistributionMode; import org.apache.iceberg.PartitionSpec; @@ -34,20 +35,43 @@ public class DynamicRecord { private Schema schema; private RowData rowData; private PartitionSpec partitionSpec; - private DistributionMode distributionMode; + @Nullable private DistributionMode distributionMode; private int writeParallelism; private boolean upsertMode; @Nullable private Set equalityFields; + @Internal + DynamicRecord() {} + + /** + * Constructs a new DynamicRecord with forward (no shuffle) writes. 
+ * + * @param tableIdentifier The target table identifier. + * @param branch The target table branch. + * @param schema The target table schema. + * @param rowData The data matching the provided schema. + * @param partitionSpec The target table {@link PartitionSpec}. + */ + public DynamicRecord( + TableIdentifier tableIdentifier, + String branch, + Schema schema, + RowData rowData, + PartitionSpec partitionSpec) { + this(tableIdentifier, branch, schema, rowData, partitionSpec, null, -1); + } + /** - * Constructs a new DynamicRecord. + * Constructs a new DynamicRecord. This record will be shuffled as specified by {@code + * distributionMode}. * * @param tableIdentifier The target table identifier. * @param branch The target table branch. * @param schema The target table schema. * @param rowData The data matching the provided schema. * @param partitionSpec The target table {@link PartitionSpec}. - * @param distributionMode The {@link DistributionMode}. + * @param distributionMode The {@link DistributionMode}. {@code null} indicates forward (no + * shuffle) writes. * @param writeParallelism The number of parallel writers. Can be set to any value {@literal > 0}, * but will always be automatically capped by the maximum write parallelism, which is the * parallelism of the sink. 
Set to Integer.MAX_VALUE for always using the maximum available diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java index 07dfad2780f7..c752b8e9b8d9 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java @@ -18,10 +18,12 @@ */ package org.apache.iceberg.flink.sink.dynamic; +import java.util.Map; import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.functions.OpenContext; import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.configuration.Configuration; import org.apache.flink.streaming.api.functions.ProcessFunction; import org.apache.flink.table.data.RowData; import org.apache.flink.util.Collector; @@ -30,6 +32,7 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.FlinkWriteConf; @Internal class DynamicRecordProcessor extends ProcessFunction @@ -37,8 +40,12 @@ class DynamicRecordProcessor extends ProcessFunction generator; private final CatalogLoader catalogLoader; + private final Map writeProperties; + private final Configuration flinkConfig; private final boolean immediateUpdate; private final boolean dropUnusedColumns; private final int cacheMaximumSize; @@ -51,28 +58,29 @@ class DynamicRecordProcessor extends ProcessFunction updateStream; + private transient OutputTag forwardStream; private transient Collector collector; + private transient DynamicRecordWithConfig dynamicRecordWithConfig; private transient Context context; DynamicRecordProcessor( DynamicRecordGenerator generator, CatalogLoader catalogLoader, - boolean immediateUpdate, - int 
cacheMaximumSize, - long cacheRefreshMs, - int inputSchemasPerTableCacheMaximumSize, TableCreator tableCreator, - boolean caseSensitive, - boolean dropUnusedColumns) { + FlinkDynamicSinkConf sinkConfig, + Map writeProperties, + Configuration flinkConfig) { this.generator = generator; this.catalogLoader = catalogLoader; - this.immediateUpdate = immediateUpdate; - this.cacheMaximumSize = cacheMaximumSize; - this.cacheRefreshMs = cacheRefreshMs; - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; + this.flinkConfig = flinkConfig; + this.writeProperties = writeProperties; + this.immediateUpdate = sinkConfig.immediateTableUpdate(); + this.cacheMaximumSize = sinkConfig.cacheMaxSize(); + this.cacheRefreshMs = sinkConfig.cacheRefreshMs(); + this.inputSchemasPerTableCacheMaximumSize = sinkConfig.inputSchemasPerTableCacheMaxSize(); this.tableCreator = tableCreator; - this.caseSensitive = caseSensitive; - this.dropUnusedColumns = dropUnusedColumns; + this.caseSensitive = sinkConfig.caseSensitive(); + this.dropUnusedColumns = sinkConfig.dropUnusedColumns(); } @Override @@ -90,15 +98,22 @@ public void open(OpenContext openContext) throws Exception { this.hashKeyGenerator = new HashKeyGenerator( cacheMaximumSize, getRuntimeContext().getTaskInfo().getMaxNumberOfParallelSubtasks()); - if (immediateUpdate) { - updater = new TableUpdater(tableCache, catalog, caseSensitive, dropUnusedColumns); - } else { + // Always create updater — needed for forced immediate updates on forward records + this.updater = new TableUpdater(tableCache, catalog, caseSensitive, dropUnusedColumns); + // Always create forward stream tag for forward (distributionMode == null) records + this.forwardStream = + new OutputTag<>( + DYNAMIC_FORWARD_STREAM, + new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize)) {}; + if (!immediateUpdate) { updateStream = new OutputTag<>( DYNAMIC_TABLE_UPDATE_STREAM, new DynamicRecordInternalType(catalogLoader, true, 
cacheMaximumSize)) {}; } + this.dynamicRecordWithConfig = + new DynamicRecordWithConfig(new FlinkWriteConf(writeProperties, flinkConfig)); generator.open(openContext); } @@ -111,7 +126,10 @@ public void processElement(T element, Context ctx, Collector newData = updater.update( data.tableIdentifier(), data.branch(), data.schema(), data.spec(), tableCreator); emit( - collector, data, newData.f0.resolvedTableSchema(), newData.f0.recordConverter(), - newData.f1); + newData.f1, + isForward); } else { + // Shuffled records with immediateUpdate=false go to the update side output int writerKey = hashKeyGenerator.generateKey( data, @@ -159,33 +182,38 @@ public void collect(DynamicRecord data) { } } else { emit( - collector, data, foundSchema.resolvedTableSchema(), foundSchema.recordConverter(), - foundSpec); + foundSpec, + isForward); } } private void emit( - Collector out, DynamicRecord data, Schema schema, DataConverter recordConverter, - PartitionSpec spec) { + PartitionSpec spec, + boolean forward) { RowData rowData = (RowData) recordConverter.convert(data.rowData()); - int writerKey = hashKeyGenerator.generateKey(data, schema, spec, rowData); - String tableName = data.tableIdentifier().toString(); - out.collect( + // writerKey is unused in the forward path. + int writerKey = forward ? 
-1 : hashKeyGenerator.generateKey(data, schema, spec, rowData); + DynamicRecordInternal record = new DynamicRecordInternal( - tableName, + data.tableIdentifier().toString(), data.branch(), schema, rowData, spec, writerKey, data.upsertMode(), - DynamicSinkUtil.getEqualityFieldIds(data.equalityFields(), schema))); + DynamicSinkUtil.getEqualityFieldIds(data.equalityFields(), schema)); + if (forward) { + context.output(forwardStream, record); + } else { + collector.collect(record); + } } @Override diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordWithConfig.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordWithConfig.java new file mode 100644 index 000000000000..32716c3e4ac7 --- /dev/null +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordWithConfig.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.Set; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteConf; + +class DynamicRecordWithConfig extends DynamicRecord { + private final String defaultBranch; + private final Integer defaultWriteParallelism; + + private DynamicRecord wrapped; + + DynamicRecordWithConfig(FlinkWriteConf flinkWriteConf) { + this.defaultBranch = flinkWriteConf.branch(); + this.defaultWriteParallelism = flinkWriteConf.writeParallelism(); + } + + DynamicRecordWithConfig wrap(DynamicRecord newWrapped) { + this.wrapped = newWrapped; + return this; + } + + @Override + public String branch() { + return wrapped.branch() != null ? wrapped.branch() : defaultBranch; + } + + @Override + public DistributionMode distributionMode() { + return wrapped.distributionMode(); + } + + @Override + public int writeParallelism() { + int originalParallelism = wrapped.writeParallelism(); + if (originalParallelism > 0 || defaultWriteParallelism == null) { + return originalParallelism; + } + + return defaultWriteParallelism; + } + + @Override + public TableIdentifier tableIdentifier() { + return wrapped.tableIdentifier(); + } + + @Override + public Schema schema() { + return wrapped.schema(); + } + + @Override + public PartitionSpec spec() { + return wrapped.spec(); + } + + @Override + public RowData rowData() { + return wrapped.rowData(); + } + + @Override + public boolean upsertMode() { + return wrapped.upsertMode(); + } + + @Override + public Set equalityFields() { + return wrapped.equalityFields(); + } +} diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java index 
456f20adf59f..93c268ff86ad 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java @@ -48,20 +48,15 @@ class DynamicTableUpdateOperator private transient TableUpdater updater; DynamicTableUpdateOperator( - CatalogLoader catalogLoader, - int cacheMaximumSize, - long cacheRefreshMs, - int inputSchemasPerTableCacheMaximumSize, - TableCreator tableCreator, - boolean caseSensitive, - boolean dropUnusedColumns) { + CatalogLoader catalogLoader, TableCreator tableCreator, FlinkDynamicSinkConf configuration) { this.catalogLoader = catalogLoader; - this.cacheMaximumSize = cacheMaximumSize; - this.cacheRefreshMs = cacheRefreshMs; - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; this.tableCreator = tableCreator; - this.caseSensitive = caseSensitive; - this.dropUnusedColumns = dropUnusedColumns; + + this.cacheMaximumSize = configuration.cacheMaxSize(); + this.cacheRefreshMs = configuration.cacheRefreshMs(); + this.inputSchemasPerTableCacheMaximumSize = configuration.inputSchemasPerTableCacheMaxSize(); + this.caseSensitive = configuration.caseSensitive(); + this.dropUnusedColumns = configuration.dropUnusedColumns(); } @Override diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java new file mode 100644 index 000000000000..75b169c4b533 --- /dev/null +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.Map; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.iceberg.flink.FlinkConfParser; + +/** + * A class for common Dynamic Iceberg sink configs for Flink writes. + * + *

If a config is set at multiple levels, the following order of precedence is used (top to + * bottom): + * + *

    + *
  1. Write options + *
  2. Flink ReadableConfig + *
  3. Default values + *
+ * + * The most specific value is set in write options and takes precedence over all other configs. If + * no write option is provided, this class checks the flink configuration for any overrides. If no + * applicable value is found in the write options, this class uses the default values. + */ +class FlinkDynamicSinkConf { + + private final FlinkConfParser confParser; + + FlinkDynamicSinkConf(Map writeOptions, ReadableConfig readableConfig) { + this.confParser = new FlinkConfParser(writeOptions, readableConfig); + } + + int cacheMaxSize() { + return confParser + .intConf() + .option(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.key()) + .flinkConfig(FlinkDynamicSinkOptions.CACHE_MAX_SIZE) + .defaultValue(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.defaultValue()) + .parse(); + } + + boolean immediateTableUpdate() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.key()) + .flinkConfig(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE) + .defaultValue(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.defaultValue()) + .parse(); + } + + boolean dropUnusedColumns() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key()) + .flinkConfig(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS) + .defaultValue(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.defaultValue()) + .parse(); + } + + long cacheRefreshMs() { + return confParser + .longConf() + .option(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.key()) + .flinkConfig(FlinkDynamicSinkOptions.CACHE_REFRESH_MS) + .defaultValue(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.defaultValue()) + .parse(); + } + + int inputSchemasPerTableCacheMaxSize() { + return confParser + .intConf() + .option(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.key()) + .flinkConfig(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE) + .defaultValue(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.defaultValue()) + .parse(); + } + + boolean 
caseSensitive() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.CASE_SENSITIVE.key()) + .flinkConfig(FlinkDynamicSinkOptions.CASE_SENSITIVE) + .defaultValue(FlinkDynamicSinkOptions.CASE_SENSITIVE.defaultValue()) + .parse(); + } +} diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java new file mode 100644 index 000000000000..7a4f038219d9 --- /dev/null +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import org.apache.flink.annotation.Experimental; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; + +@Experimental +public class FlinkDynamicSinkOptions { + + private FlinkDynamicSinkOptions() {} + + public static final ConfigOption CACHE_MAX_SIZE = + ConfigOptions.key("dynamic-sink.cache-max-size") + .intType() + .defaultValue(100) + .withDescription( + "Maximum size of the caches used in Dynamic Sink for table data and serializers."); + + public static final ConfigOption IMMEDIATE_TABLE_UPDATE = + ConfigOptions.key("dynamic-sink.immediate-table-update") + .booleanType() + .defaultValue(false) + .withDescription( + "Controls whether table schema and partition updates should be applied immediately in Dynamic Sink."); + + public static final ConfigOption DROP_UNUSED_COLUMNS = + ConfigOptions.key("dynamic-sink.drop-unused-columns") + .booleanType() + .defaultValue(false) + .withDescription( + "Allows dropping unused columns during schema evolution in Dynamic Sink."); + + public static final ConfigOption CACHE_REFRESH_MS = + ConfigOptions.key("dynamic-sink.cache-refresh-ms") + .longType() + .defaultValue(1_000L) + .withDescription( + "Cache refresh interval for dynamic table metadata in Dynamic Sink in milliseconds."); + + public static final ConfigOption INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE = + ConfigOptions.key("dynamic-sink.input-schemas-per-table-cache-max-size") + .intType() + .defaultValue(10) + .withDescription( + "Maximum input schema objects to cache per each table in Dynamic Sink for performance."); + + public static final ConfigOption CASE_SENSITIVE = + ConfigOptions.key("dynamic-sink.case-sensitive") + .booleanType() + .defaultValue(true) + .withDescription( + "Controls whether schema field name matching should be case-sensitive in Dynamic Sink."); +} diff --git 
a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java index fca45bf882e0..61a850212bf4 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java @@ -88,7 +88,7 @@ int generateKey( dynamicRecord.schema(), dynamicRecord.spec(), dynamicRecord.equalityFields(), - MoreObjects.firstNonNull(dynamicRecord.distributionMode(), DistributionMode.NONE), + dynamicRecord.distributionMode(), Math.min(dynamicRecord.writeParallelism(), maxWriteParallelism)); KeySelector keySelector = keySelectorCache.computeIfAbsent( @@ -98,8 +98,7 @@ int generateKey( tableIdent, MoreObjects.firstNonNull(tableSchema, dynamicRecord.schema()), MoreObjects.firstNonNull(tableSpec, dynamicRecord.spec()), - MoreObjects.firstNonNull( - dynamicRecord.distributionMode(), DistributionMode.NONE), + dynamicRecord.distributionMode(), MoreObjects.firstNonNull( dynamicRecord.equalityFields(), Collections.emptySet()), Math.min(dynamicRecord.writeParallelism(), maxWriteParallelism))); diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java index 8ef1f1fbb833..d74b8b9d620f 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java @@ -23,8 +23,6 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.annotation.Internal; -import org.apache.flink.formats.avro.RowDataToAvroConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import 
org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; @@ -32,6 +30,8 @@ import org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.formats.avro.RowDataToAvroConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; /** * This is not serializable because Avro {@link Schema} is not actually serializable, even though it diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java index bac7c05bdfef..1c6644238c3d 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java @@ -136,23 +136,26 @@ void validate() { if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) { Preconditions.checkArgument( startSnapshotId != null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + "Invalid starting snapshot id for %s strategy: null", + startingStrategy); Preconditions.checkArgument( startSnapshotTimestamp == null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + "Invalid starting snapshot timestamp for %s strategy: not null", + startingStrategy); } if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) { Preconditions.checkArgument( startSnapshotTimestamp != null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + "Invalid starting snapshot timestamp for %s strategy: null", + startingStrategy); Preconditions.checkArgument( startSnapshotId == null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not 
null"); + "Invalid starting snapshot id for %s strategy: not null", + startingStrategy); } Preconditions.checkArgument( - tag == null, - String.format("Cannot scan table using ref %s configured for streaming reader", tag)); + tag == null, "Cannot scan table using ref %s configured for streaming reader", tag); Preconditions.checkArgument( snapshotId == null, "Cannot set snapshot-id option for streaming reader"); Preconditions.checkArgument( diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java index b158b0871a53..cfef780a4daa 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java @@ -21,8 +21,6 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.formats.avro.RowDataToAvroConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; @@ -31,6 +29,8 @@ import org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.formats.avro.RowDataToAvroConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; public class AvroGenericRecordConverter implements RowDataConverter { private final Schema avroSchema; diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java 
b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java index 02ef57d344b1..3af9957875e8 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java @@ -34,7 +34,7 @@ class WatermarkExtractorRecordEmitter implements SerializableRecordEmitter private static final Logger LOG = LoggerFactory.getLogger(WatermarkExtractorRecordEmitter.class); private final SplitWatermarkExtractor timeExtractor; private String lastSplitId = null; - private long watermark; + private long watermark = Long.MIN_VALUE; WatermarkExtractorRecordEmitter(SplitWatermarkExtractor timeExtractor) { this.timeExtractor = timeExtractor; @@ -44,7 +44,10 @@ class WatermarkExtractorRecordEmitter implements SerializableRecordEmitter public void emitRecord( RecordAndPosition element, SourceOutput output, IcebergSourceSplit split) { if (!split.splitId().equals(lastSplitId)) { - long newWatermark = timeExtractor.extractWatermark(split); + long extracted = timeExtractor.extractWatermark(split); + // Subtract 1 because watermark W means all records with eventTime <= W have arrived; + // records in this split have eventTime == extracted, so watermark must be extracted - 1. + long newWatermark = extracted > Long.MIN_VALUE ? extracted - 1 : Long.MIN_VALUE; if (newWatermark < watermark) { LOG.info( "Received a new split with lower watermark. 
Previous watermark = {}, current watermark = {}, previous split = {}, current split = {}", diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java index e2cd411d7069..795c4fa5a766 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java @@ -75,6 +75,11 @@ public static class Primitives implements DataGenerator { OffsetDateTime.of(2022, 1, 10, 0, 0, 0, 0, ZoneOffset.UTC); private static final LocalDateTime JAVA_LOCAL_DATE_TIME_20220110 = LocalDateTime.of(2022, 1, 10, 0, 0, 0); + private static final OffsetDateTime JAVA_OFFSET_DATE_TIME_MAX_NANO = + OffsetDateTime.of(2262, 4, 11, 23, 47, 16, 854_775_807, ZoneOffset.UTC); + private static final LocalDateTime JAVA_LOCAL_DATE_TIME_MAX_NANO = + LocalDateTime.of(2262, 4, 11, 23, 47, 16, 854_775_807); + private static final long ICEBERG_MAX_NANOS_EPOCH = 9223372036854775807L; private static final BigDecimal BIG_DECIMAL_NEGATIVE = new BigDecimal("-1.50"); private static final byte[] FIXED_BYTES = "012345689012345".getBytes(StandardCharsets.UTF_8); @@ -96,7 +101,11 @@ public static class Primitives implements DataGenerator { Types.NestedField.required(12, "uuid_field", Types.UUIDType.get()), Types.NestedField.required(13, "binary_field", Types.BinaryType.get()), Types.NestedField.required(14, "decimal_field", Types.DecimalType.of(9, 2)), - Types.NestedField.required(15, "fixed_field", Types.FixedType.ofLength(16))); + Types.NestedField.required(15, "fixed_field", Types.FixedType.ofLength(16)), + Types.NestedField.required( + 16, "ts_ns_with_zone_field", Types.TimestampNanoType.withZone()), + Types.NestedField.required( + 17, "ts_ns_without_zone_field", Types.TimestampNanoType.withoutZone())); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -171,6 +180,8 @@ public 
GenericRecord generateIcebergGenericRecord() { genericRecord.setField("time_field", JAVA_LOCAL_TIME_HOUR8); genericRecord.setField("ts_with_zone_field", JAVA_OFFSET_DATE_TIME_20220110); genericRecord.setField("ts_without_zone_field", JAVA_LOCAL_DATE_TIME_20220110); + genericRecord.setField("ts_ns_with_zone_field", JAVA_OFFSET_DATE_TIME_MAX_NANO); + genericRecord.setField("ts_ns_without_zone_field", JAVA_LOCAL_DATE_TIME_MAX_NANO); byte[] uuidBytes = new byte[16]; for (int i = 0; i < 16; ++i) { @@ -220,7 +231,11 @@ public GenericRowData generateFlinkRowData() { uuidBytes, binaryBytes, DecimalData.fromBigDecimal(BIG_DECIMAL_NEGATIVE, 9, 2), - FIXED_BYTES); + FIXED_BYTES, + TimestampData.fromEpochMillis( + ICEBERG_MAX_NANOS_EPOCH / 1_000_000, (int) (ICEBERG_MAX_NANOS_EPOCH % 1_000_000)), + TimestampData.fromEpochMillis( + ICEBERG_MAX_NANOS_EPOCH / 1_000_000, (int) (ICEBERG_MAX_NANOS_EPOCH % 1_000_000))); } @Override @@ -236,10 +251,12 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { genericRecord.put("date_field", DAYS_BTW_EPOC_AND_20220110); genericRecord.put("time_field", HOUR_8_IN_MILLI); - // Although Avro logical type for timestamp fields are in micro seconds, - // AvroToRowDataConverters only looks for long value in milliseconds. - genericRecord.put("ts_with_zone_field", JODA_DATETIME_20220110.getMillis()); - genericRecord.put("ts_without_zone_field", JODA_DATETIME_20220110.getMillis()); + // Now that AvroToRowDataConverters correctly supports microseconds, + // we must inject correct microsecond scale values into the Avro data. 
+ genericRecord.put("ts_with_zone_field", JODA_DATETIME_20220110.getMillis() * 1000L); + genericRecord.put("ts_without_zone_field", JODA_DATETIME_20220110.getMillis() * 1000L); + genericRecord.put("ts_ns_with_zone_field", ICEBERG_MAX_NANOS_EPOCH); + genericRecord.put("ts_ns_without_zone_field", ICEBERG_MAX_NANOS_EPOCH); byte[] uuidBytes = new byte[16]; for (int i = 0; i < 16; ++i) { @@ -554,7 +571,11 @@ public static class ArrayOfPrimitive implements DataGenerator { new Schema( Types.NestedField.required(1, "row_id", Types.StringType.get()), Types.NestedField.required( - 2, "array_of_int", Types.ListType.ofOptional(101, Types.IntegerType.get()))); + 2, "array_of_int", Types.ListType.ofOptional(101, Types.IntegerType.get())), + Types.NestedField.optional( + 3, + "array_of_ts_ns", + Types.ListType.ofRequired(102, Types.TimestampNanoType.withoutZone()))); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -581,13 +602,33 @@ public GenericRecord generateIcebergGenericRecord() { GenericRecord genericRecord = GenericRecord.create(icebergSchema); genericRecord.setField("row_id", "row_id_value"); genericRecord.setField("array_of_int", Arrays.asList(1, 2, 3)); + + LocalDateTime posNanos = LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789); + LocalDateTime negNanos = LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321); + genericRecord.setField("array_of_ts_ns", Arrays.asList(posNanos, negNanos)); return genericRecord; } @Override public GenericRowData generateFlinkRowData() { Integer[] arr = {1, 2, 3}; - return GenericRowData.of(StringData.fromString("row_id_value"), new GenericArrayData(arr)); + + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + TimestampData[] tsArr = { + TimestampData.fromEpochMillis( + Math.floorDiv(posNanos, 
1_000_000L), (int) Math.floorMod(posNanos, 1_000_000L)), + TimestampData.fromEpochMillis( + Math.floorDiv(negNanos, 1_000_000L), (int) Math.floorMod(negNanos, 1_000_000L)) + }; + return GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericArrayData(arr), + new GenericArrayData(tsArr)); } @Override @@ -595,6 +636,14 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); genericRecord.put("row_id", "row_id_value"); genericRecord.put("array_of_int", Arrays.asList(1, 2, 3)); + + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + genericRecord.put("array_of_ts_ns", Arrays.asList(posNanos, negNanos)); return genericRecord; } } @@ -808,7 +857,12 @@ public static class MapOfPrimitives implements DataGenerator { 2, "map_of_primitives", Types.MapType.ofRequired( - 101, 102, Types.StringType.get(), Types.IntegerType.get()))); + 101, 102, Types.StringType.get(), Types.IntegerType.get())), + Types.NestedField.optional( + 3, + "map_of_ts_ns", + Types.MapType.ofRequired( + 103, 104, Types.StringType.get(), Types.TimestampNanoType.withoutZone()))); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -835,15 +889,37 @@ public GenericRecord generateIcebergGenericRecord() { GenericRecord genericRecord = GenericRecord.create(icebergSchema); genericRecord.setField("row_id", "row_id_value"); genericRecord.setField("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); + + LocalDateTime posNanos = LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789); + LocalDateTime negNanos = LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321); + genericRecord.setField( + "map_of_ts_ns", ImmutableMap.of("positive", posNanos, 
"negative", negNanos)); return genericRecord; } @Override public GenericRowData generateFlinkRowData() { + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + return GenericRowData.of( StringData.fromString("row_id_value"), new GenericMapData( - ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2))); + ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2)), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("positive"), + TimestampData.fromEpochMillis( + Math.floorDiv(posNanos, 1_000_000L), + (int) Math.floorMod(posNanos, 1_000_000L)), + StringData.fromString("negative"), + TimestampData.fromEpochMillis( + Math.floorDiv(negNanos, 1_000_000L), + (int) Math.floorMod(negNanos, 1_000_000L))))); } @Override @@ -851,6 +927,15 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); genericRecord.put("row_id", "row_id_value"); genericRecord.put("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); + + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + genericRecord.put( + "map_of_ts_ns", ImmutableMap.of("positive", posNanos, "negative", negNanos)); return genericRecord; } } diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java index cd6964b5ed0f..0e7635a33e87 100644 --- 
a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java @@ -30,7 +30,6 @@ import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.data.RandomRowData; import org.apache.iceberg.util.StructLikeWrapper; -import org.junit.jupiter.api.Disabled; public class TestRowDataWrapper extends RecordWrapperTestBase { @@ -60,18 +59,6 @@ public void testTime() { }); } - @Disabled - @Override - public void testTimestampNanoWithoutZone() { - // Flink does not support nanosecond timestamp without zone. - } - - @Disabled - @Override - public void testTimestampNanoWithZone() { - // Flink does not support nanosecond timestamp with zone. - } - @Override protected void generateAndValidate( Schema schema, RecordWrapperTestBase.AssertMethod assertMethod) { diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java index 8c99fdf52110..1f0fe70ac53b 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java @@ -19,13 +19,17 @@ package org.apache.iceberg.flink.data; import java.util.List; +import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; +import org.apache.iceberg.PartitionData; import org.apache.iceberg.Schema; import org.apache.iceberg.data.BaseFormatModelTests; import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.RowDataConverter; import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; public class TestFlinkFormatModel extends BaseFormatModelTests { @@ -48,4 +52,26 @@ protected RowData convertToEngine(Record record, Schema 
schema) { protected void assertEquals(Schema schema, List expected, List actual) { TestHelpers.assertRows(actual, expected, FlinkSchemaUtil.convert(schema)); } + + @Override + protected Object convertConstantToEngine(Type type, Object value) { + if (value instanceof PartitionData partitionData) { + Types.StructType structType = type.asStructType(); + List fields = structType.fields(); + GenericRowData rowData = new GenericRowData(fields.size()); + int sourceSize = partitionData.size(); + for (int i = 0; i < fields.size(); i++) { + if (i < sourceSize) { + Object fieldValue = partitionData.get(i, Object.class); + rowData.setField(i, convertConstantToEngine(fields.get(i).type(), fieldValue)); + } else { + rowData.setField(i, null); + } + } + + return rowData; + } + + return RowDataUtil.convertConstant(type, value); + } } diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java index 4a70802f2a2e..b7b0a54156cc 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java @@ -49,6 +49,11 @@ protected boolean allowsWritingNullValuesForRequiredFields() { return true; } + @Override + protected boolean supportsTimestampNanos() { + return true; + } + @Override protected void writeAndValidate(Schema schema) throws IOException { List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1990L); diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java index 4e5b38ffb026..a2411da1e344 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java +++ 
b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java @@ -271,18 +271,19 @@ public void testMapOfPrimitivesProjection() { GenericRowData.of( StringData.fromString("row_id_value"), new GenericMapData( - ImmutableMap.of(StringData.fromString("foo"), 1, StringData.fromString("bar"), 2))); + ImmutableMap.of(StringData.fromString("foo"), 1, StringData.fromString("bar"), 2)), + null); testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData, true); testEqualsAndHashCode(schema, mapOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); GenericRowData rowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("row_id_value"), null); + GenericRowData.of(StringData.fromString("row_id_value"), null, null); GenericRowData copyRowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("row_id_value"), null); + GenericRowData.of(StringData.fromString("row_id_value"), null, null); // modify the map field value GenericRowData otherRowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("other_row_id_value"), null); + GenericRowData.of(StringData.fromString("other_row_id_value"), null, null); testEqualsAndHashCode( schema, idOnly, @@ -432,7 +433,8 @@ public void testArrayOfPrimitiveProjection() { GenericRowData otherRowData = GenericRowData.of( StringData.fromString("other_row_id_value"), - new GenericArrayData(new Integer[] {4, 5, 6})); + new GenericArrayData(new Integer[] {4, 5, 6}), + null); testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, arrayOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); @@ -440,16 +442,19 @@ public void testArrayOfPrimitiveProjection() { GenericRowData rowDataNullOptionalFields = GenericRowData.of( StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 
3})); + new GenericArrayData(new Integer[] {1, null, 3}), + null); GenericRowData copyRowDataNullOptionalFields = GenericRowData.of( StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); + new GenericArrayData(new Integer[] {1, null, 3}), + null); // modify the map field value GenericRowData otherRowDataNullOptionalFields = GenericRowData.of( StringData.fromString("other_row_id_value"), - new GenericArrayData(new Integer[] {4, null, 6})); + new GenericArrayData(new Integer[] {4, null, 6}), + null); testEqualsAndHashCode( schema, idOnly, diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java index 3cb18ffbb77e..4d35792e440e 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java @@ -19,11 +19,18 @@ package org.apache.iceberg.flink.maintenance.api; import static org.apache.iceberg.flink.maintenance.api.JdbcLockFactory.INIT_LOCK_TABLES_PROPERTY; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.sql.SQLTransientConnectionException; import java.util.Map; import java.util.UUID; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.iceberg.jdbc.JdbcCatalog; +import org.apache.iceberg.jdbc.JdbcClientPool; +import org.apache.iceberg.jdbc.UncheckedSQLException; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.Test; class TestJdbcLockFactory extends TestLockFactoryBase { @Override @@ -38,4 +45,62 @@ TriggerLockFactory lockFactory(String tableName) { tableName, properties); } + + @Test + void testSQLExceptionEnablesRetryInClientPool() throws Exception { + // Regression test for 
#15759: verify that removing the inner try-catch allows + // ClientPoolImpl to retry on transient connection failures. + // + // Before the fix: inner catch converted SQLException -> UncheckedSQLException + // (RuntimeException) inside the lambda. ClientPoolImpl only catches the declared + // exception type (SQLException), so RuntimeException bypasses retry entirely. + // After the fix: SQLException propagates naturally, ClientPoolImpl catches it, + // and retries on transient connection exceptions. + Map props = Maps.newHashMap(); + props.put("username", "user"); + props.put("password", "password"); + String uri = "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", ""); + + try (JdbcClientPool pool = new JdbcClientPool(1, uri, props)) { + AtomicInteger attempts = new AtomicInteger(0); + + String result = + pool.run( + conn -> { + if (attempts.incrementAndGet() == 1) { + throw new SQLTransientConnectionException("transient failure"); + } + + return "success"; + }); + + assertThat(result).isEqualTo("success"); + assertThat(attempts.get()).isGreaterThan(1); + } + } + + @Test + void testUncheckedSQLExceptionBypassesRetry() throws Exception { + // Companion test: demonstrates that wrapping SQLException as UncheckedSQLException + // (the OLD behavior before the fix) prevents ClientPoolImpl from retrying. 
+ Map props = Maps.newHashMap(); + props.put("username", "user"); + props.put("password", "password"); + String uri = "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", ""); + + try (JdbcClientPool pool = new JdbcClientPool(1, uri, props)) { + assertThatThrownBy( + () -> + pool.run( + conn -> { + try { + throw new SQLTransientConnectionException("transient failure"); + } catch (java.sql.SQLException e) { + throw new UncheckedSQLException(e, "wrapped"); + } + })) + .isInstanceOf(UncheckedSQLException.class) + .hasMessageContaining("wrapped"); + } + } } diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java index bb53b5265655..88b949a9a7f8 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java @@ -29,9 +29,11 @@ import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.REMOVED_DATA_FILE_SIZE_METRIC; import static org.assertj.core.api.Assertions.assertThat; +import java.time.Instant; import java.util.List; import java.util.stream.StreamSupport; import org.apache.flink.streaming.api.graph.StreamGraphGenerator; +import org.apache.iceberg.FileFormat; import org.apache.iceberg.ManifestFiles; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; @@ -43,8 +45,14 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.FieldSource; class TestRewriteDataFiles extends MaintenanceTaskTestBase { + + private static final FileFormat[] FILE_FORMATS = + new FileFormat[] 
{FileFormat.AVRO, FileFormat.PARQUET, FileFormat.ORC}; + @Test void testRewriteUnpartitioned() throws Exception { Table table = createTable(); @@ -82,13 +90,14 @@ void testRewriteUnpartitioned() throws Exception { createRecord(4, "d"))); } - @Test - void testRewriteUnpartitionedPreserveLineage() throws Exception { - Table table = createTable(3); - insert(table, 1, "a"); - insert(table, 2, "b"); - insert(table, 3, "c"); - insert(table, 4, "d"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewriteUnpartitionedPreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createTable(3, fileFormat); + insert(table, 1, "a", fileFormat); + insert(table, 2, "b", fileFormat); + insert(table, 3, "c", fileFormat); + insert(table, 4, "d", fileFormat); assertFileNum(table, 4, 0); @@ -122,15 +131,17 @@ void testRewriteUnpartitionedPreserveLineage() throws Exception { schema); } - @Test - void testRewriteTheSameFilePreserveLineage() throws Exception { - Table table = createTable(3); - insert(table, 1, "a"); - insert(table, 2, "b"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewriteTheSameFilePreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createTable(3, fileFormat); + insert(table, 1, "a", fileFormat); + insert(table, 2, "b", fileFormat); // Create a file with two lines of data to verify that the rowid is read correctly. 
insert( table, - ImmutableList.of(SimpleDataUtil.createRecord(3, "c"), SimpleDataUtil.createRecord(4, "d"))); + ImmutableList.of(SimpleDataUtil.createRecord(3, "c"), SimpleDataUtil.createRecord(4, "d")), + fileFormat); assertFileNum(table, 3, 0); @@ -166,13 +177,14 @@ void testRewriteTheSameFilePreserveLineage() throws Exception { schema); } - @Test - void testRewritePartitionedPreserveLineage() throws Exception { - Table table = createPartitionedTable(3); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - insertPartitioned(table, 3, "p2"); - insertPartitioned(table, 4, "p2"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewritePartitionedPreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createPartitionedTable(3, fileFormat); + insertPartitioned(table, 1, "p1", fileFormat); + insertPartitioned(table, 2, "p1", fileFormat); + insertPartitioned(table, 3, "p2", fileFormat); + insertPartitioned(table, 4, "p2", fileFormat); assertFileNum(table, 4, 0); @@ -529,6 +541,57 @@ void testRewriteWithFilter() throws Exception { createRecord(4, "d"))); } + /** + * By verifying that the creation time of the data content in the builder is later than the + * creation time of the filter condition — if the filter condition is actually created in the + * planner, then all files can be compacted; otherwise, not all files can be compacted — we can + * confirm whether the filter condition is actually created in the planner. 
+ */ + @Test + void testRewriteWithFilterSupplier() throws Exception { + Table table = createTable(); + + appendRewriteDataFiles( + RewriteDataFiles.builder() + .parallelism(2) + .deleteFileThreshold(10) + .targetFileSizeBytes(1_000_000L) + .maxFileGroupSizeBytes(10_000_000L) + .maxFileSizeBytes(2_000_000L) + .minFileSizeBytes(500_000L) + .minInputFiles(2) + // Rewrite data files where id is less than current timestamp in planner + .filter(() -> Expressions.lessThan("id", (int) Instant.now().getEpochSecond())) + .partialProgressEnabled(true) + .partialProgressMaxCommits(1) + .maxRewriteBytes(100_000L) + .rewriteAll(false)); + + insert(table, 1, "a"); + insert(table, 2, "b"); + insert(table, 3, "c"); + + int epochSecond = (int) Instant.now().getEpochSecond(); + insert(table, epochSecond, "d"); + + assertFileNum(table, 4, 0); + + Thread.sleep(1_000L); + runAndWaitForSuccess(infra.env(), infra.source(), infra.sink()); + + // There are four files. The filter is created in the planner, after the sleep, so every id + // is less than the current timestamp and all files are compacted into one file. 
+ assertFileNum(table, 1, 0); + + SimpleDataUtil.assertTableRecords( + table, + ImmutableList.of( + createRecord(1, "a"), + createRecord(2, "b"), + createRecord(3, "c"), + createRecord(epochSecond, "d"))); + } + @Test void testBranch() throws Exception { Table table = createTable(); diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java index 5eecc5a803d3..06ab7861c0f5 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java @@ -24,7 +24,10 @@ import java.io.File; import java.io.IOException; import java.nio.file.Path; +import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.util.List; +import java.util.concurrent.TimeUnit; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.MetricOptions; import org.apache.flink.core.execution.JobClient; @@ -79,6 +82,12 @@ public class OperatorTestBase { ImmutableMap.of(), ImmutableSet.of(SimpleDataUtil.SCHEMA.columns().get(0).fieldId())); + private static final Schema SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "ts", Types.TimestampType.withoutZone())); + protected static final String UID_SUFFIX = "UID-Dummy"; protected static final String SLOT_SHARING_GROUP = "SlotSharingGroup"; protected static final TriggerLockFactory LOCK_FACTORY = new MemoryLockFactory(); @@ -124,10 +133,14 @@ void after() throws IOException { } protected static Table createTable() { - return createTable(2); + return createTable(2, FileFormat.PARQUET); } protected static Table createTable(int formatVersion) { + return 
createTable(formatVersion, FileFormat.PARQUET); + } + + protected static Table createTable(int formatVersion, FileFormat fileFormat) { return CATALOG_EXTENSION .catalog() .createTable( @@ -136,12 +149,29 @@ protected static Table createTable(int formatVersion) { PartitionSpec.unpartitioned(), null, ImmutableMap.of( + "write.format.default", + fileFormat.name(), TableProperties.FORMAT_VERSION, String.valueOf(formatVersion), "flink.max-continuous-empty-commits", "100000")); } + protected static Table createTableWithTimestampWithoutZone() { + return CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE, + PartitionSpec.builderFor(SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE).identity("ts").build(), + null, + ImmutableMap.of( + TableProperties.FORMAT_VERSION, + "2", + "flink.max-continuous-empty-commits", + "100000")); + } + protected static Table createTableWithDelete() { return createTableWithDelete(2); } @@ -158,7 +188,7 @@ protected static Table createTableWithDelete(int formatVersion) { "format-version", String.valueOf(formatVersion), "write.upsert.enabled", "true")); } - protected static Table createPartitionedTable(int formatVersion) { + protected static Table createPartitionedTable(int formatVersion, FileFormat fileFormat) { return CATALOG_EXTENSION .catalog() .createTable( @@ -167,6 +197,8 @@ protected static Table createPartitionedTable(int formatVersion) { PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build(), null, ImmutableMap.of( + "write.format.default", + fileFormat.name(), "format-version", String.valueOf(formatVersion), "flink.max-continuous-empty-commits", @@ -174,17 +206,27 @@ protected static Table createPartitionedTable(int formatVersion) { } protected static Table createPartitionedTable() { - return createPartitionedTable(2); + return createPartitionedTable(2, FileFormat.PARQUET); } protected void insert(Table table, Integer id, String data) throws IOException { - 
new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + insert(table, id, data, FileFormat.PARQUET); + } + + protected void insert(Table table, Integer id, String data, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir) .appendToTable(Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); table.refresh(); } protected void insert(Table table, List records) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir).appendToTable(records); + insert(table, records, FileFormat.PARQUET); + } + + protected void insert(Table table, List records, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir).appendToTable(records); table.refresh(); } @@ -194,6 +236,20 @@ protected void insert(Table table, Integer id, String data, String extra) throws table.refresh(); } + protected void insertWithTimestampWithoutZone( + Table table, Integer id, String data, LocalDateTime ts) throws IOException { + GenericRecord record = GenericRecord.create(SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE); + record.setField("id", id); + record.setField("data", data); + record.setField("ts", ts); + long tsMicros = + TimeUnit.SECONDS.toMicros(ts.toEpochSecond(ZoneOffset.UTC)) + + TimeUnit.NANOSECONDS.toMicros(ts.getNano()); + new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + .appendToTable(TestHelpers.Row.of(tsMicros), Lists.newArrayList(record)); + table.refresh(); + } + /** * For the same identifier column id this methods simulate the following row operations: *
  • add an equality delete on oldData @@ -271,7 +327,12 @@ protected void update(Table table, Integer id, String oldData, String tempData, } protected void insertPartitioned(Table table, Integer id, String data) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + insertPartitioned(table, id, data, FileFormat.PARQUET); + } + + protected void insertPartitioned(Table table, Integer id, String data, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir) .appendToTable( TestHelpers.Row.of(data), Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); table.refresh(); diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java index 8a8a2fa194d4..7b8f638b7e2f 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java @@ -57,7 +57,7 @@ static List planDataFileRewrite( 11, 10_000_000L, rewriterOptions, - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java index 16d524f05cf7..8300df8c94eb 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java @@ -24,6 +24,9 @@ import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.planDataFileRewrite; import static org.assertj.core.api.Assertions.assertThat; +import 
java.time.Duration; +import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -107,7 +110,7 @@ void testError() throws Exception { 11, 1L, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); @@ -174,7 +177,7 @@ void testMaxRewriteBytes() throws Exception { 11, maxRewriteBytes, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); @@ -228,7 +231,7 @@ void testBranch() throws Exception { 11, 10_000_000L, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, branchName))) { testHarness.open(); @@ -243,6 +246,46 @@ void testBranch() throws Exception { } } + @Test + void testFilterSupplierWithTimestamp() throws Exception { + Table table = createTableWithTimestampWithoutZone(); + + LocalDateTime oldTs = LocalDateTime.now().minusDays(10); + insertWithTimestampWithoutZone(table, 1, "old_a", oldTs); + insertWithTimestampWithoutZone(table, 2, "old_b", oldTs); + + LocalDateTime recentTs = LocalDateTime.now().minusHours(1); + insertWithTimestampWithoutZone(table, 3, "new_a", recentTs); + insertWithTimestampWithoutZone(table, 4, "new_b", recentTs); + + try (OneInputStreamOperatorTestHarness + testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new DataFileRewritePlanner( + OperatorTestBase.DUMMY_TABLE_NAME, + OperatorTestBase.DUMMY_TABLE_NAME, + 0, + tableLoader(), + 11, + 10_000_000L, + ImmutableMap.of(MIN_INPUT_FILES, "2"), + () -> + Expressions.greaterThanOrEqual( + "ts", + LocalDateTime.now(ZoneOffset.UTC).minus(Duration.ofDays(3)).toString()), + SnapshotRef.MAIN_BRANCH))) { + testHarness.open(); + + trigger(testHarness); + + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + List planned = 
testHarness.extractOutputValues(); + + assertThat(planned).hasSize(1); + assertThat(planned.get(0).group().fileScanTasks()).hasSize(2); + } + } + void assertRewriteFileGroup( DataFileRewritePlanner.PlannedGroup plannedGroup, Table table, Set files) { assertThat(plannedGroup.table().currentSnapshot().snapshotId()) diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java index 9202a1df92af..62b29e7c017a 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java @@ -309,7 +309,7 @@ void testSplitSize() throws Exception { "2", TARGET_FILE_SIZE_BYTES, String.valueOf(targetFileSize)), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java new file mode 100644 index 000000000000..42bbfc0d3628 --- /dev/null +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatNoException; + +import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.iceberg.io.WriteResult; +import org.junit.jupiter.api.Test; + +public class TestIcebergStreamWriterMetrics { + + @Test + void histogramsCreatedWhenDropwizardAvailable() { + IcebergStreamWriterMetrics metrics = + new IcebergStreamWriterMetrics( + UnregisteredMetricsGroup.createSinkWriterMetricGroup(), "db.table"); + + assertThat(metrics.dataFilesSizeHistogram()).isNotNull(); + assertThat(metrics.deleteFilesSizeHistogram()).isNotNull(); + + assertThatNoException() + .isThrownBy(() -> metrics.updateFlushResult(WriteResult.builder().build())); + } +} diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java index 27b1e3d84a8c..bafd0276b7ce 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java @@ -44,6 +44,8 @@ import org.apache.flink.configuration.RestartStrategyOptions; import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; import org.apache.flink.runtime.client.JobExecutionException; +import org.apache.flink.runtime.jobgraph.JobVertex; +import 
org.apache.flink.streaming.api.connector.sink2.CommittableMessage; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.DataStreamSource; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; @@ -75,6 +77,7 @@ import org.apache.iceberg.flink.CatalogLoader; import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.TestHelpers; @@ -83,6 +86,7 @@ import org.apache.iceberg.flink.sink.dynamic.TestDynamicCommitter.FailBeforeAndAfterCommit; import org.apache.iceberg.inmemory.InMemoryInputFile; import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; @@ -119,6 +123,7 @@ private static class DynamicIcebergDataImpl implements Serializable { PartitionSpec partitionSpec; boolean upsertMode; Set equalityFields; + int writeParallelism; private DynamicIcebergDataImpl( Schema schemaProvided, String tableName, String branch, PartitionSpec partitionSpec) { @@ -130,7 +135,8 @@ private DynamicIcebergDataImpl( partitionSpec, false, Collections.emptySet(), - false); + false, + 10); } private DynamicIcebergDataImpl( @@ -147,7 +153,8 @@ private DynamicIcebergDataImpl( partitionSpec, false, Collections.emptySet(), - false); + false, + 10); } private DynamicIcebergDataImpl( @@ -166,7 +173,8 @@ private DynamicIcebergDataImpl( partitionSpec, upsertMode, equalityFields, - isDuplicate); + isDuplicate, + 10); } private DynamicIcebergDataImpl( @@ -177,7 +185,8 @@ private DynamicIcebergDataImpl( 
PartitionSpec partitionSpec, boolean upsertMode, Set equalityFields, - boolean isDuplicate) { + boolean isDuplicate, + int writeParallelism) { this.rowProvided = randomRow(schemaProvided, isDuplicate ? seed : ++seed); this.rowExpected = isDuplicate ? null : rowProvided; this.schemaProvided = schemaProvided; @@ -187,6 +196,7 @@ private DynamicIcebergDataImpl( this.partitionSpec = partitionSpec; this.upsertMode = upsertMode; this.equalityFields = equalityFields; + this.writeParallelism = writeParallelism; } } @@ -206,6 +216,56 @@ public void generate(DynamicIcebergDataImpl row, Collector out) { converter(schema).toInternal(row.rowProvided), spec, spec.isPartitioned() ? DistributionMode.HASH : DistributionMode.NONE, + row.writeParallelism); + dynamicRecord.setUpsertMode(row.upsertMode); + dynamicRecord.setEqualityFields(row.equalityFields); + out.collect(dynamicRecord); + } + } + + /** Generator that always emits forward (null distributionMode) records. */ + private static class ForwardGenerator implements DynamicRecordGenerator { + @Override + public void generate(DynamicIcebergDataImpl row, Collector out) { + TableIdentifier tableIdentifier = TableIdentifier.of(DATABASE, row.tableName); + Schema schema = row.schemaProvided; + PartitionSpec spec = row.partitionSpec; + DynamicRecord dynamicRecord = + new DynamicRecord( + tableIdentifier, + row.branch, + schema, + converter(schema).toInternal(row.rowProvided), + spec); + dynamicRecord.setUpsertMode(row.upsertMode); + dynamicRecord.setEqualityFields(row.equalityFields); + out.collect(dynamicRecord); + } + } + + /** + * Generator that alternates between forward (null distributionMode) and shuffle records. Even + * indices go forward, odd indices go through shuffle. 
+ */ + private static class MixedGenerator implements DynamicRecordGenerator { + private int count = 0; + + @Override + public void generate(DynamicIcebergDataImpl row, Collector out) { + TableIdentifier tableIdentifier = TableIdentifier.of(DATABASE, row.tableName); + Schema schema = row.schemaProvided; + PartitionSpec spec = row.partitionSpec; + boolean forward = (count++ % 2 == 0); + DistributionMode mode = + forward ? null : (spec.isPartitioned() ? DistributionMode.HASH : DistributionMode.NONE); + DynamicRecord dynamicRecord = + new DynamicRecord( + tableIdentifier, + row.branch, + schema, + converter(schema).toInternal(row.rowProvided), + spec, + mode, 10); dynamicRecord.setUpsertMode(row.upsertMode); dynamicRecord.setEqualityFields(row.equalityFields); @@ -238,6 +298,98 @@ void testWrite() throws Exception { runTest(rows); } + @Test + void testNoShuffleTopology() throws Exception { + DataStream dataStream = + env.fromData( + Collections.emptyList(), TypeInformation.of(new TypeHint() {})); + DynamicIcebergSink.forInput(dataStream) + .generator(new ForwardGenerator()) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .writeParallelism(2) + .immediateTableUpdate(false) + .overwrite(false) + .append(); + + boolean generatorAndSinkChained = false; + for (JobVertex vertex : env.getStreamGraph().getJobGraph().getVertices()) { + String vertexName = vertex.getName(); + boolean generatorInThisVertex = vertexName.contains("-generator"); + boolean sinkInThisVertex = vertexName.contains("-Forward-Writer"); + + generatorAndSinkChained = generatorInThisVertex && sinkInThisVertex; + if (generatorAndSinkChained) { + break; + } + } + + assertThat(generatorAndSinkChained).isTrue(); + } + + @Test + void testForwardWrite() throws Exception { + runForwardWriteTest(new ForwardGenerator()); + } + + @Test + void testMixedForwardAndShuffleWrite() throws Exception { + runForwardWriteTest(new MixedGenerator()); + } + + private void runForwardWriteTest(DynamicRecordGenerator 
generator) + throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned())); + + DataStream dataStream = + env.fromData(rows, TypeInformation.of(new TypeHint<>() {})); + env.setParallelism(1); + + DynamicIcebergSink.forInput(dataStream) + .generator(generator) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .writeParallelism(1) + .immediateTableUpdate(true) + .append(); + + env.execute(); + + verifyResults(rows); + } + + @Test + void testWriteWithNullBranch() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", null, PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", null, PartitionSpec.unpartitioned())); + + runTest( + rows, this.env, false, 1, ImmutableMap.of(FlinkWriteOptions.BRANCH.key(), "test-branch")); + } + @Test void testWritePartitioned() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 10).build(); @@ -1170,8 +1322,9 @@ void testOperatorUidsFormat() { // pre commit topology was off, but since it is stateless, users will still be able to restore // state, but we must keep the stateful operators UUIds like the committer consistent. 
assertThat(sinkUids) - .contains( + .containsOnly( "test--sink", + "test--forward-writer", "test--generator", "test--updater", "test--sink: test--pre-commit-topology", @@ -1179,8 +1332,9 @@ void testOperatorUidsFormat() { sinkUids = createSinkAndReturnUIds(""); assertThat(sinkUids) - .contains( + .containsOnly( "--sink", + "--forward-writer", "--generator", "--updater", "--sink: --pre-commit-topology", @@ -1188,14 +1342,71 @@ void testOperatorUidsFormat() { sinkUids = createSinkAndReturnUIds(null); assertThat(sinkUids) - .contains( + .containsOnly( "--sink", + "--forward-writer", "--generator", "--updater", "--sink: --pre-commit-topology", "Sink Committer: --sink"); } + @Test + void testGeneratorDefaultParallelism() { + StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); + streamEnv.setParallelism(4); + + DataStreamSource source = + streamEnv.fromData(Collections.emptySet(), TypeInformation.of(new TypeHint<>() {})); + source.setParallelism(8); + + DynamicIcebergSink.forInput(source) + .generator(new Generator()) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .uidPrefix("test") + .append(); + + // Since the generator parallelism is not directly accessible via the returned DataStreamSink, + // inspect the stream graph to verify the generator inherits the input source parallelism. 
+ int generatorParallelism = + streamEnv.getStreamGraph().getStreamNodes().stream() + .filter(node -> "test--generator".equals(node.getTransformationUID())) + .findFirst() + .map(StreamNode::getParallelism) + .orElseThrow(() -> new AssertionError("Generator node not found")); + + assertThat(generatorParallelism).isEqualTo(source.getParallelism()); + } + + @Test + void testFallBackParallelismFromConfig() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned(), + false, + Collections.emptySet(), + false, + -1), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned(), + false, + Collections.emptySet(), + false, + 0)); + + runTest( + rows, this.env, true, 2, ImmutableMap.of(FlinkWriteOptions.WRITE_PARALLELISM.key(), "1")); + } + private Set createSinkAndReturnUIds(String uidPrefix) { StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); @@ -1304,6 +1515,18 @@ private void runTest( verifyResults(dynamicData); } + private void runTest( + List dynamicData, + StreamExecutionEnvironment env, + boolean immediateUpdate, + int parallelism, + Map writeProperties) + throws Exception { + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, null, false, writeProperties); + verifyResults(dynamicData, writeProperties); + } + private void executeDynamicSink( List dynamicData, StreamExecutionEnvironment env, @@ -1311,7 +1534,8 @@ private void executeDynamicSink( int parallelism, @Nullable CommitHook commitHook) throws Exception { - executeDynamicSink(dynamicData, env, immediateUpdate, parallelism, commitHook, false); + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, commitHook, false, Maps.newHashMap()); } private void executeDynamicSink( @@ -1322,6 +1546,19 @@ private void 
executeDynamicSink( @Nullable CommitHook commitHook, boolean overwrite) throws Exception { + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, commitHook, overwrite, Maps.newHashMap()); + } + + private void executeDynamicSink( + List dynamicData, + StreamExecutionEnvironment env, + boolean immediateUpdate, + int parallelism, + @Nullable CommitHook commitHook, + boolean overwrite, + Map writeProperties) + throws Exception { DataStream dataStream = env.fromData(dynamicData, TypeInformation.of(new TypeHint<>() {})); env.setParallelism(parallelism); @@ -1335,6 +1572,7 @@ private void executeDynamicSink( .immediateTableUpdate(immediateUpdate) .setSnapshotProperty("commit.retry.num-retries", "0") .overwrite(overwrite) + .setAll(writeProperties) .append(); } else { DynamicIcebergSink.forInput(dataStream) @@ -1343,6 +1581,7 @@ private void executeDynamicSink( .writeParallelism(parallelism) .immediateTableUpdate(immediateUpdate) .overwrite(overwrite) + .setAll(writeProperties) .append(); } @@ -1359,7 +1598,9 @@ static class CommitHookEnabledDynamicIcebergSink extends DynamicIcebergSink.B @Override DynamicIcebergSink instantiateSink( - Map writeProperties, Configuration flinkConfig) { + Map writeProperties, + Configuration flinkConfig, + DataStream> forwardWriteResults) { return new CommitHookDynamicIcebergSink( commitHook, CATALOG_EXTENSION.catalogLoader(), @@ -1367,7 +1608,7 @@ DynamicIcebergSink instantiateSink( "uidPrefix", writeProperties, flinkConfig, - 100); + forwardWriteResults); } } @@ -1383,14 +1624,15 @@ static class CommitHookDynamicIcebergSink extends DynamicIcebergSink { String uidPrefix, Map writeProperties, Configuration flinkConfig, - int cacheMaximumSize) { + DataStream> forwardWritten) { super( catalogLoader, snapshotProperties, uidPrefix, writeProperties, flinkConfig, - cacheMaximumSize); + 100, + forwardWritten); this.commitHook = commitHook; this.overwriteMode = new FlinkWriteConf(writeProperties, flinkConfig).overwriteMode(); } @@ 
-1409,6 +1651,12 @@ public Committer createCommitter(CommitterInitContext contex } private void verifyResults(List dynamicData) throws IOException { + verifyResults(dynamicData, Maps.newHashMap()); + } + + private void verifyResults( + List dynamicData, Map writeProperties) + throws IOException { // Calculate the expected result Map, List> expectedData = Maps.newHashMap(); Map expectedSchema = Maps.newHashMap(); @@ -1422,9 +1670,12 @@ private void verifyResults(List dynamicData) throws IOEx dynamicData.forEach( r -> { + String branch = + MoreObjects.firstNonNull( + r.branch, writeProperties.get(FlinkWriteOptions.BRANCH.key())); List data = expectedData.computeIfAbsent( - Tuple2.of(r.tableName, r.branch), unused -> Lists.newArrayList()); + Tuple2.of(r.tableName, branch), unused -> Lists.newArrayList()); data.addAll( convertToRowData(expectedSchema.get(r.tableName), ImmutableList.of(r.rowExpected))); }); diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java new file mode 100644 index 000000000000..de55621475ed --- /dev/null +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Collections; +import java.util.Set; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +class TestDynamicRecordWithConfig { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + + private static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of("db", "table"); + private static final PartitionSpec UNPARTITIONED = PartitionSpec.unpartitioned(); + private static final RowData ROW_DATA = GenericRowData.of(1, StringData.fromString("test")); + + @Test + void testBranchFallBack() { + String defaultBranch = "default-branch"; + FlinkWriteConf conf = + new FlinkWriteConf( + 
ImmutableMap.of(FlinkWriteOptions.BRANCH.key(), defaultBranch), new Configuration()); + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(conf); + + DynamicRecord dynamicRecord = + new DynamicRecord(TABLE_IDENTIFIER, null, SCHEMA, ROW_DATA, UNPARTITIONED); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).branch()).isEqualTo(defaultBranch); + + String customBranch = "custom-branch"; + dynamicRecord.setBranch(customBranch); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).branch()).isEqualTo(customBranch); + } + + @Test + void testWriteParallelismFallBack() { + int defaultParallelism = 4; + FlinkWriteConf conf = + new FlinkWriteConf( + ImmutableMap.of( + FlinkWriteOptions.WRITE_PARALLELISM.key(), String.valueOf(defaultParallelism)), + new Configuration()); + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(conf); + + DynamicRecord dynamicRecord = + new DynamicRecord(TABLE_IDENTIFIER, null, SCHEMA, ROW_DATA, UNPARTITIONED, null, -1); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()) + .isEqualTo(defaultParallelism); + + dynamicRecord.writeParallelism(0); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()) + .isEqualTo(defaultParallelism); + + dynamicRecord.writeParallelism(8); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()).isEqualTo(8); + } + + @Test + void testDelegatesToWrappedRecord() { + FlinkWriteConf conf = new FlinkWriteConf(Collections.emptyMap(), new Configuration()); + PartitionSpec partitioned = PartitionSpec.builderFor(SCHEMA).identity("id").build(); + Set equalityFields = ImmutableSet.of("id", "data"); + + DynamicRecord dynamicRecord = + new DynamicRecord( + TABLE_IDENTIFIER, + SnapshotRef.MAIN_BRANCH, + SCHEMA, + ROW_DATA, + partitioned, + DistributionMode.HASH, + 2); + dynamicRecord.setUpsertMode(true); + dynamicRecord.setEqualityFields(equalityFields); + + DynamicRecordWithConfig record = new 
DynamicRecordWithConfig(conf).wrap(dynamicRecord); + + assertThat(record.tableIdentifier()).isEqualTo(TABLE_IDENTIFIER); + assertThat(record.schema()).isEqualTo(SCHEMA); + assertThat(record.spec()).isEqualTo(partitioned); + assertThat(record.rowData()).isSameAs(ROW_DATA); + assertThat(record.distributionMode()).isEqualTo(DistributionMode.HASH); + assertThat(record.upsertMode()).isTrue(); + assertThat(record.equalityFields()).isEqualTo(equalityFields); + } +} diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java index 5d5a12418037..fdc12951264e 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java @@ -24,12 +24,14 @@ import java.util.Collections; import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.configuration.Configuration; import org.apache.flink.table.data.GenericRowData; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.RegisterExtension; @@ -58,9 +60,6 @@ class TestDynamicTableUpdateOperator { @Test void testDynamicTableUpdateOperatorNewTable() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); @@ -68,12 +67,8 @@ void testDynamicTableUpdateOperatorNewTable() throws 
Exception { DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open((OpenContext) null); DynamicRecordInternal input = @@ -94,21 +89,14 @@ void testDynamicTableUpdateOperatorNewTable() throws Exception { @Test void testDynamicTableUpdateOperatorSchemaChange() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open((OpenContext) null); catalog.createTable(table, SCHEMA1); @@ -136,9 +124,6 @@ void testDynamicTableUpdateOperatorSchemaChange() throws Exception { @ParameterizedTest @ValueSource(booleans = {true, false}) void testCaseInSensitivity(boolean caseSensitive) throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); @@ -149,12 +134,8 @@ void testCaseInSensitivity(boolean caseSensitive) throws Exception { DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - caseSensitive, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(caseSensitive, PRESERVE_COLUMNS)); operator.open((OpenContext) null); catalog.createTable(table, initialSchema); @@ 
-188,21 +169,14 @@ void testCaseInSensitivity(boolean caseSensitive) throws Exception { @Test void testDynamicTableUpdateOperatorPreserveUnusedColumns() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open((OpenContext) null); catalog.createTable(table, SCHEMA2); @@ -229,21 +203,14 @@ void testDynamicTableUpdateOperatorPreserveUnusedColumns() throws Exception { @Test void testDynamicTableUpdateOperatorDropUnusedColumns() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_INSENSITIVE, - DROP_COLUMNS); + flinkDynamicSinkConfiguration(CASE_INSENSITIVE, DROP_COLUMNS)); operator.open((OpenContext) null); catalog.createTable(table, SCHEMA2); @@ -266,4 +233,13 @@ void testDynamicTableUpdateOperatorDropUnusedColumns() throws Exception { assertThat(tableSchema.findField("data")).isNull(); assertThat(input).isEqualTo(output); } + + private static FlinkDynamicSinkConf flinkDynamicSinkConfiguration( + boolean caseSensitive, boolean dropUnusedColumns) { + return new FlinkDynamicSinkConf( + ImmutableMap.of( + FlinkDynamicSinkOptions.CASE_SENSITIVE.key(), String.valueOf(caseSensitive), + FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key(), 
String.valueOf(dropUnusedColumns)), + new Configuration()); + } } diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java index c65f96b12cbb..9a485fafaf47 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.Set; import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.state.KeyGroupRangeAssignment; import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; @@ -34,6 +35,9 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; @@ -229,6 +233,38 @@ void testFailOnNonPositiveWriteParallelism() { }); } + @Test + void testNonPositiveWriteParallelismConfigFallback() throws Exception { + int maxWriteParallelism = 5; + HashKeyGenerator generator = new HashKeyGenerator(16, maxWriteParallelism); + PartitionSpec unpartitioned = PartitionSpec.unpartitioned(); + FlinkWriteConf flinkWriteConf = + new FlinkWriteConf( + ImmutableMap.of(FlinkWriteOptions.WRITE_PARALLELISM.key(), "2"), new Configuration()); + + Set writeKeys = Sets.newHashSet(); + for (int i = 0; i < 20; i++) { + GenericRowData row = GenericRowData.of(i, StringData.fromString("z")); + writeKeys.add( + getWriteKey( + generator, + unpartitioned, + 
DistributionMode.NONE, + i % 2 == 0 ? 0 : -1, + Collections.emptySet(), + row, + flinkWriteConf)); + } + + assertThat(writeKeys).hasSize(2); + assertThat( + writeKeys.stream() + .map(key -> getSubTaskId(key, 2, maxWriteParallelism)) + .distinct() + .count()) + .isEqualTo(2); + } + @Test void testCapAtMaxWriteParallelism() throws Exception { int writeParallelism = 10; @@ -477,10 +513,31 @@ private static int getWriteKey( Set equalityFields, GenericRowData row) throws Exception { - DynamicRecord record = + return getWriteKey( + generator, + spec, + mode, + writeParallelism, + equalityFields, + row, + new FlinkWriteConf(Collections.emptyMap(), new Configuration())); + } + + private static int getWriteKey( + HashKeyGenerator generator, + PartitionSpec spec, + DistributionMode mode, + int writeParallelism, + Set equalityFields, + GenericRowData row, + FlinkWriteConf flinkWriteConf) + throws Exception { + DynamicRecord inputRecord = new DynamicRecord(TABLE_IDENTIFIER, BRANCH, SCHEMA, row, spec, mode, writeParallelism); - record.setEqualityFields(equalityFields); - return generator.generateKey(record); + inputRecord.setEqualityFields(equalityFields); + + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(flinkWriteConf); + return generator.generateKey(dynamicRecordWithConfig.wrap(inputRecord)); } private static int getSubTaskId(int writeKey1, int writeParallelism, int maxWriteParallelism) { diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java index 70889f4f76aa..fff9b96b3b7c 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java @@ -225,6 +225,68 @@ public void apply( 3))); } + /** + * 
Integration test verifying that records with eventTime equal to the minimum timestamp of their + * split are correctly included in windows. The {@link + * org.apache.iceberg.flink.source.reader.WatermarkExtractorRecordEmitter} emits the watermark as + * {@code minSplitTs - 1}, so records at exactly {@code minSplitTs} are on-time rather than late. + * + *

    The test writes 3 records at epoch (t=0). The split's column-stats lower-bound is 0, so the + * extracted watermark is 0ms and the emitted watermark is -1ms. Records at t=0 are strictly after + * that watermark and therefore belong to the [0, 5min) window. A later split is then appended to + * advance the watermark past the window boundary and trigger its evaluation. + */ + @Test + public void testWindowingWithRecordsAtSplitMinTimestamp() throws Exception { + GenericAppenderHelper dataAppender = appender(); + + // File 1: 3 records at exactly t=0 (epoch). Extracted watermark = 0ms, emitted = -1ms. + List batch = + ImmutableList.of( + generateRecord(0, "file_1-recordTs_0_a"), + generateRecord(0, "file_1-recordTs_0_b"), + generateRecord(0, "file_1-recordTs_0_c")); + dataAppender.appendToTable(batch); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(1); + + DataStream stream = + env.fromSource( + source(), + WatermarkStrategy.noWatermarks() + .withTimestampAssigner(new RowDataTimestampAssigner()), + SOURCE_NAME, + TypeInformation.of(RowData.class)); + + stream + .windowAll(TumblingEventTimeWindows.of(Time.minutes(5))) + .apply( + new AllWindowFunction() { + @Override + public void apply( + TimeWindow window, Iterable values, Collector out) { + AtomicInteger count = new AtomicInteger(0); + values.forEach(a -> count.incrementAndGet()); + out.collect(row(window.getStart(), count.get())); + WINDOWS.put(window.getStart(), count.get()); + } + }); + + WINDOWS.clear(); + env.executeAsync("Iceberg Source Min Timestamp Windowing Test"); + + // Append a file with much later timestamps to advance the watermark past [0, 5min) + dataAppender.appendToTable( + dataAppender.writeFile(ImmutableList.of(generateRecord(1500, "last-record")))); + + // The [0, 5min) window should fire with all 3 records written at epoch + Awaitility.await() + .pollInterval(Duration.ofMillis(10)) + .atMost(30, TimeUnit.SECONDS) + .until(() -> 
Integer.valueOf(3).equals(WINDOWS.get(0L))); + } + /** * This is an integration test for watermark handling and throttling. Integration testing the * following: diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java index 5dd7de545e11..09639a8a9568 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java @@ -31,7 +31,7 @@ void testIncrementalFromSnapshotId() { .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) .build(); assertException( - context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + context, "Invalid starting snapshot id for INCREMENTAL_FROM_SNAPSHOT_ID strategy: null"); context = ScanContext.builder() @@ -42,7 +42,7 @@ void testIncrementalFromSnapshotId() { .build(); assertException( context, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + "Invalid starting snapshot timestamp for INCREMENTAL_FROM_SNAPSHOT_ID strategy: not null"); } @Test @@ -54,7 +54,7 @@ void testIncrementalFromSnapshotTimestamp() { .build(); assertException( context, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + "Invalid starting snapshot timestamp for INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP strategy: null"); context = ScanContext.builder() @@ -64,7 +64,8 @@ void testIncrementalFromSnapshotTimestamp() { .startSnapshotTimestamp(1L) .build(); assertException( - context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + context, + "Invalid starting snapshot id for INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP strategy: not null"); } @Test diff --git 
a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java new file mode 100644 index 000000000000..1cb7be03c6a7 --- /dev/null +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.common.eventtime.Watermark; +import org.apache.flink.api.connector.source.SourceOutput; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestWatermarkExtractorRecordEmitter { + @TempDir protected Path temporaryFolder; + + @Test + public void testWatermarkIsDecrementedByOne() throws IOException { + long extractedWatermark = 1000L; + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> extractedWatermark); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + emitter.emitRecord(new RecordAndPosition<>("record", 0, 0L), output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(extractedWatermark - 1); + } + + @Test + public void testWatermarkEmittedOnlyOncePerSplit() throws IOException { + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> 1000L); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split); + emitter.emitRecord(element, output, split); + emitter.emitRecord(element, output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.records).hasSize(3); + } + + 
@Test + public void testWatermarkNotEmittedWhenNewSplitHasLowerValue() throws IOException { + IcebergSourceSplit split1 = createSplit(1L); + IcebergSourceSplit split2 = createSplit(2L); + + Map watermarkMap = Maps.newHashMap(); + watermarkMap.put(split1.splitId(), 2000L); + watermarkMap.put(split2.splitId(), 1000L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> watermarkMap.get(s.splitId())); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split1); + emitter.emitRecord(element, output, split2); + + // Only split1's watermark is emitted; split2 has a lower value so it's skipped + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(1999L); + } + + @Test + public void testWatermarkEmittedForEachHigherSplit() throws IOException { + IcebergSourceSplit split1 = createSplit(1L); + IcebergSourceSplit split2 = createSplit(2L); + + Map watermarkMap = Maps.newHashMap(); + watermarkMap.put(split1.splitId(), 1000L); + watermarkMap.put(split2.splitId(), 2000L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> watermarkMap.get(s.splitId())); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split1); + emitter.emitRecord(element, output, split2); + + assertThat(output.watermarks).hasSize(2); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(999L); + assertThat(output.watermarks.get(1).getTimestamp()).isEqualTo(1999L); + } + + @Test + public void testWatermarkAtLongMinValueDoesNotOverflow() throws IOException { + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> Long.MIN_VALUE); + + 
CapturingSourceOutput output = new CapturingSourceOutput<>(); + emitter.emitRecord(new RecordAndPosition<>("record", 0, 0L), output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(Long.MIN_VALUE); + } + + private IcebergSourceSplit createSplit(long seed) throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(seed, TestFixtures.SCHEMA, 1, 1); + return IcebergSourceSplit.fromCombinedScanTask( + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, FileFormat.PARQUET, TestFixtures.SCHEMA)); + } + + private static class CapturingSourceOutput implements SourceOutput { + final List watermarks = Lists.newArrayList(); + final List records = Lists.newArrayList(); + + @Override + public void collect(T record) { + records.add(record); + } + + @Override + public void collect(T record, long timestamp) { + records.add(record); + } + + @Override + public void emitWatermark(Watermark watermark) { + watermarks.add(watermark); + } + + @Override + public void markIdle() {} + + @Override + public void markActive() {} + } +} diff --git a/flink/v2.0/build.gradle b/flink/v2.0/build.gradle index 5907f41b3544..7bc37b30e5a1 100644 --- a/flink/v2.0/build.gradle +++ b/flink/v2.0/build.gradle @@ -33,7 +33,8 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { implementation project(':iceberg-hive-metastore') compileOnly libs.flink20.avro - // for dropwizard histogram metrics implementation + compileOnly libs.joda.time + // dropwizard histogram metrics (optional in Flink) compileOnly libs.flink20.metrics.dropwizard compileOnly libs.flink20.streaming.java compileOnly "${libs.flink20.streaming.java.get().module}:${libs.flink20.streaming.java.get().getVersion()}:tests" @@ -83,6 +84,8 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: 
':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts') + testImplementation(testFixtures(project(':iceberg-parquet'))) // By default, hive-exec is a fat/uber jar and it exports a guava library // that's really old. We use the core classifier to be able to override our guava @@ -169,9 +172,6 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { exclude group: 'com.google.code.findbugs', module: 'jsr305' } - // for dropwizard histogram metrics implementation - implementation libs.flink20.metrics.dropwizard - // for integration testing with the flink-runtime-jar // all of those dependencies are required because the integration test extends FlinkTestBase integrationCompileOnly project(':iceberg-api') @@ -266,4 +266,6 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/flink/v2.0/flink-runtime/LICENSE b/flink/v2.0/flink-runtime/LICENSE index 36a03cb4fcf9..d73eda0104b9 100644 --- a/flink/v2.0/flink-runtime/LICENSE +++ b/flink/v2.0/flink-runtime/LICENSE @@ -219,6 +219,94 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). 
+ +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). 
+ +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. 
Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles Apache Parquet. Copyright: 2014-2020 The Apache Software Foundation. @@ -227,7 +315,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Thrift. +This product bundles Apache Thrift (bundled by Parquet). Copyright: 2006-2010 The Apache Software Foundation. Project URL: https://thrift.apache.org/ @@ -235,55 +323,57 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Fastutil. +This product includes code from Daniel Lemire's JavaFastPFOR project (bundled by Parquet). 
-Copyright: 2002-2014 Sebastiano Vigna -Project URL: http://fastutil.di.unimi.it/ +Copyright: 2013 Daniel Lemire +Project URL: https://github.com/lemire/JavaFastPFOR License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache ORC. +This product bundles fastutil (bundled by Parquet). -Copyright: 2013-2020 The Apache Software Foundation. -Project URL: https://orc.apache.org/ +Copyright: 2002-2014 Sebastiano Vigna +Project URL: http://fastutil.di.unimi.it/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache Datasketches. +This product bundles Zero-Allocation Hashing (bundled by Parquet). -Project URL: https://datasketches.apache.org/ +Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache Hive. +This product bundles Apache ORC. Copyright: 2013-2020 The Apache Software Foundation. -Project URL: https://hive.apache.org/ +Project URL: https://orc.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Airlift Aircompressor. +This product bundles Apache Hive's Storage API (bundled by ORC). -Copyright: 2011-2020 Aircompressor authors. -Project URL: https://github.com/airlift/aircompressor +Copyright: 2008-2020 The Apache Software Foundation. 
+Project URL: https://hive.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Google GAX. +This product bundles Google protobuf (bundled by ORC). -Project URL: https://github.com/googleapis/gax-java +Copyright: 2008 Google Inc. +Project URL: https://developers.google.com/protocol-buffers License: BSD 3-Clause -| Copyright 2016, Google Inc. All rights reserved. -| + +| Copyright 2008 Google Inc. All rights reserved. +| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are | met: -| +| | * Redistributions of source code must retain the above copyright | notice, this list of conditions and the following disclaimer. | * Redistributions in binary form must reproduce the above @@ -293,7 +383,7 @@ License: BSD 3-Clause | * Neither the name of Google Inc. nor the names of its | contributors may be used to endorse or promote products derived from | this software without specific prior written permission. -| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -305,40 +395,26 @@ License: BSD 3-Clause | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +| +| Code generated by the Protocol Buffer compiler is owned by the owner +| of the input file used when generating it. This code is not +| standalone and requires a support library to be linked with it. This +| support library is itself covered by the above license. 
-------------------------------------------------------------------------------- -This product bundles Google Auth Library. +This product bundles Apache Datasketches. -License: BSD 3-Clause -| Copyright 2014, Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Project URL: https://datasketches.apache.org/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles Airlift Aircompressor. 
+ +Copyright: 2011-2020 Aircompressor authors. +Project URL: https://github.com/airlift/aircompressor +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -363,6 +439,7 @@ This product bundles checkerframework checker-qual. Copyright: 2004-2020 the Checker Framework developers Project URL: https://github.com/typetools/checker-framework License: MIT + | The annotations are licensed under the MIT License. (The text of this | license appears below.) More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -401,87 +478,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google protobuf. - -Copyright: 2008 Google Inc. -Project URL: https://developers.google.com/protocol-buffers -License: BSD 3-Clause -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. 
-| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. - --------------------------------------------------------------------------------- - -This product bundles ThreeTen BP. - -Project URL: https://www.threeten.org/threetenbp -License: BSD 3-Clause -| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| -| All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. 
-| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. Project URL: https://www.threeten.org/threeten-extra/ License: BSD 3-Clause + +| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. | All rights reserved. | | * Redistribution and use in source and binary forms, with or without @@ -530,15 +533,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache HttpComponents (core and client). - -Copyright: 1999-2022 The Apache Software Foundation. 
-Project URL: https://hc.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Apache HttpComponents Client. +This product bundles and includes code from Apache HttpComponents (core/client). * retry and error handling logic in ExponentialHttpRequestRetryStrategy.java @@ -556,14 +551,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Codahale Metrics. - -Copyright: (c) 2010-2013 Coda Hale, Yammer.com, 2014-2021 Dropwizard Team -Project URL: https://github.com/dropwizard/metrics -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles RoaringBitmap. Copyright: (c) 2013-... the RoaringBitmap authors @@ -572,7 +559,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Eclipse Microprofile OpenAPI. +This product bundles Eclipse MicroProfile OpenAPI. Copyright: Copyright (c) 2017 Contributors to the Eclipse Foundation Project URL: https://github.com/microprofile/microprofile-open-api @@ -585,6 +572,7 @@ This product bundles Luben Zstd. Copyright: Copyright (c) 2015-present, Luben Karavelov/ All rights reserved. Project URL: https://github.com/luben/zstd-jni/ License: BSD 2-Clause + | Zstd-jni: JNI bindings to Zstd Library | | Copyright (c) 2015-present, Luben Karavelov/ All rights reserved. @@ -614,242 +602,423 @@ License: BSD 2-Clause -------------------------------------------------------------------------------- -This product bundles Google Cloud BigQuery Client for Java. 
- -Project URL: https://github.com/googleapis/java-bigquery -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Credentials for Java. - -Project URL: https://github.com/aliyun/credentials-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Tea for Java. - -Project URL: https://github.com/aliyun/tea-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Gson. - -Project URL: https://github.com/google/gson/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google API Common. - -License: BSD 3-Clause -| Copyright 2016, Google Inc. -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. 
-| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This product bundles Google Http Client. - -Project URL: https://www.google.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenCensus. - -Project URL: https://github.com/census-instrumentation/opencensus-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JaCoCo. - -Project URL: https://github.com/jacoco/jacoco/ -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles JAXB. - -Project URL: http://jaxb.java.net/ -License: CDDL 1.1 - https://glassfish.java.net/public/CDDL+GPL_1_1.html - --------------------------------------------------------------------------------- - -This product bundles Okio. 
- -Project URL: https://github.com/square/okio -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OkHttp. - -Project URL: https://github.com/square/okhttp -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Auto Value. - -Project URL: https://github.com/google/auto/tree/master/value -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles gRPC. - -Project URL: https://github.com/grpc/grpc-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Netty. - -Project URL: https://netty.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google APIs. - -Project URL: https://github.com/googleapis/googleapis -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud APIs for Java. - -Project URL: https://github.com/googleapis/google-cloud-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts -License: Eclipse Public License v. 
2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles javax.annotation-api. - -Project URL: https://javaee.github.io/glassfish -Project URL: http://jcp.org/en/jsr/detail?id=250 -License: CDDL - https://github.com/javaee/javax.annotation/blob/master/LICENSE - --------------------------------------------------------------------------------- - -This product bundles JSpecify. - -Project URL: https://github.com/jspecify/jspecify -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Animal Sniffer Annotations. - -License: MIT -| The MIT License -| -| Copyright (c) 2009 codehaus.org. -| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. 
- --------------------------------------------------------------------------------- - -This product bundles Android Annotations. - -Project URL: http://source.android.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php -This product bundles Conscrypt (openjdk-uber). - -Project URL: https://conscrypt.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Perfmark. - -Project URL: https://github.com/perfmark/perfmark -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles org.json. - -Project URL: https://github.com/douglascrockford/JSON-java -License: Public Domain - https://github.com/stleary/JSON-java/blob/master/LICENSE - --------------------------------------------------------------------------------- - -This product bundles Apache Arrow. - -Project URL: https://github.com/apache/arrow -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| - Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. 
+| +| - Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| - Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- -This product bundles Google flatbuffers. +This product bundles the Mozilla Public Suffix List, distributed by Apache HttpComponents. -Project URL: https://github.com/google/flatbuffers -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenTelemetry. 
+Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ -Project URL: https://opentelemetry.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. "Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. "Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. "Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. 
"Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. "Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. 
Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. 
Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. 
Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. 
Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. 
However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. 
The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. 
Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. 
Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. -------------------------------------------------------------------------------- -This product bundles Kotlin. 
- -Project URL: https://github.com/JetBrains/kotlin -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 diff --git a/flink/v2.0/flink-runtime/NOTICE b/flink/v2.0/flink-runtime/NOTICE index 72916788b5e4..61b02129d0e1 100644 --- a/flink/v2.0/flink-runtime/NOTICE +++ b/flink/v2.0/flink-runtime/NOTICE @@ -356,128 +356,6 @@ This product bundles Eclipse Microprofile OpenAPI with the following in its NOTI -------------------------------------------------------------------------------- -This product bundles gRPC with the following in its NOTICE file: -| Copyright 2014 The gRPC Authors -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. -| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. 
-| -| ----------------------------------------------------------------------- -| -| This product contains a modified portion of 'OkHttp', an open source -| HTTP & SPDY client for Android and Java applications, which can be obtained -| at: -| -| * LICENSE: -| * okhttp/third_party/okhttp/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/square/okhttp -| * LOCATION_IN_GRPC: -| * okhttp/third_party/okhttp -| -| This product contains a modified portion of 'Envoy', an open source -| cloud-native high-performance edge/middle/service proxy, which can be -| obtained at: -| -| * LICENSE: -| * xds/third_party/envoy/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/envoy/NOTICE -| * HOMEPAGE: -| * https://www.envoyproxy.io -| * LOCATION_IN_GRPC: -| * xds/third_party/envoy -| -| This product contains a modified portion of 'protoc-gen-validate (PGV)', -| an open source protoc plugin to generate polyglot message validators, -| which can be obtained at: -| -| * LICENSE: -| * xds/third_party/protoc-gen-validate/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/protoc-gen-validate/NOTICE -| * HOMEPAGE: -| * https://github.com/envoyproxy/protoc-gen-validate -| * LOCATION_IN_GRPC: -| * xds/third_party/protoc-gen-validate -| -| This product contains a modified portion of 'udpa', -| an open source universal data plane API, which can be obtained at: -| -| * LICENSE: -| * xds/third_party/udpa/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/cncf/udpa -| * LOCATION_IN_GRPC: -| * xds/third_party/udpa - --------------------------------------------------------------------------------- - -This product bundles Netty with the following in its NOTICE file: -| -| The Netty Project -| ================= -| -| Please visit the Netty web site for more information: -| -| * http://netty.io/ -| -| Copyright 2016 The Netty Project -| -| The Netty Project licenses this file to you under the Apache License, -| version 2.0 (the "License"); you 
may not use this file except in compliance -| with the License. You may obtain a copy of the License at: -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -| WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -| License for the specific language governing permissions and limitations -| under the License. -| -| ------------------------------------------------------------------------------- -| This product contains a forked and modified version of Tomcat Native -| -| * LICENSE: -| * license/LICENSE.tomcat-native.txt (Apache License 2.0) -| * HOMEPAGE: -| * http://tomcat.apache.org/native-doc/ -| * https://svn.apache.org/repos/asf/tomcat/native/ -| -| This product contains the Maven wrapper scripts from 'Maven Wrapper', that provides an easy way to ensure a user has everything necessary to run the Maven build. -| -| * LICENSE: -| * license/LICENSE.mvn-wrapper.txt (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/takari/maven-wrapper -| -| This product contains small piece of code to support AIX, taken from netbsd. -| -| * LICENSE: -| * license/LICENSE.aix-netbsd.txt (OpenSSL License) -| * HOMEPAGE: -| * https://ftp.netbsd.org/pub/NetBSD/NetBSD-current/src/crypto/external/bsd/openssl/dist -| -| -| This product contains code from boringssl. 
-| -| * LICENSE (Combination ISC and OpenSSL license) -| * license/LICENSE.boringssl.txt (Combination ISC and OpenSSL license) -| * HOMEPAGE: -| * https://boringssl.googlesource.com/boringssl/ - --------------------------------------------------------------------------------- - This product bundles Jackson JSON Processor with the following in its NOTICE file: | # Jackson JSON processor | diff --git a/flink/v2.0/flink-runtime/baseline-class-uniqueness.lock b/flink/v2.0/flink-runtime/baseline-class-uniqueness.lock deleted file mode 100644 index 7868296a79e8..000000000000 --- a/flink/v2.0/flink-runtime/baseline-class-uniqueness.lock +++ /dev/null @@ -1,60 +0,0 @@ -# Danger! Multiple jars contain identically named classes. This may cause different behaviour depending on classpath ordering. -# Run ./gradlew checkClassUniqueness --fix to update this file - -## runtimeClasspath -[com.google.protobuf:protobuf-java, dev.vortex:vortex-jni] - - com.google.protobuf.BoolValue - - com.google.protobuf.BoolValue$1 - - com.google.protobuf.BoolValue$Builder - - com.google.protobuf.BoolValueOrBuilder - - com.google.protobuf.BytesValue - - com.google.protobuf.BytesValue$1 - - com.google.protobuf.BytesValue$Builder - - com.google.protobuf.BytesValueOrBuilder - - com.google.protobuf.DoubleValue - - com.google.protobuf.DoubleValue$1 - - com.google.protobuf.DoubleValue$Builder - - com.google.protobuf.DoubleValueOrBuilder - - com.google.protobuf.FloatValue - - com.google.protobuf.FloatValue$1 - - com.google.protobuf.FloatValue$Builder - - com.google.protobuf.FloatValueOrBuilder - - com.google.protobuf.Int32Value - - com.google.protobuf.Int32Value$1 - - com.google.protobuf.Int32Value$Builder - - com.google.protobuf.Int32ValueOrBuilder - - com.google.protobuf.Int64Value - - com.google.protobuf.Int64Value$1 - - com.google.protobuf.Int64Value$Builder - - com.google.protobuf.Int64ValueOrBuilder - - com.google.protobuf.ListValue - - com.google.protobuf.ListValue$1 - - 
com.google.protobuf.ListValue$Builder - - com.google.protobuf.ListValueOrBuilder - - com.google.protobuf.NullValue - - com.google.protobuf.NullValue$1 - - com.google.protobuf.StringValue - - com.google.protobuf.StringValue$1 - - com.google.protobuf.StringValue$Builder - - com.google.protobuf.StringValueOrBuilder - - com.google.protobuf.Struct - - com.google.protobuf.Struct$1 - - com.google.protobuf.Struct$Builder - - com.google.protobuf.Struct$Builder$FieldsConverter - - com.google.protobuf.Struct$FieldsDefaultEntryHolder - - com.google.protobuf.StructOrBuilder - - com.google.protobuf.StructProto - - com.google.protobuf.UInt32Value - - com.google.protobuf.UInt32Value$1 - - com.google.protobuf.UInt32Value$Builder - - com.google.protobuf.UInt32ValueOrBuilder - - com.google.protobuf.UInt64Value - - com.google.protobuf.UInt64Value$1 - - com.google.protobuf.UInt64Value$Builder - - com.google.protobuf.UInt64ValueOrBuilder - - com.google.protobuf.Value - - com.google.protobuf.Value$1 - - com.google.protobuf.Value$Builder - - com.google.protobuf.Value$KindCase - - com.google.protobuf.ValueOrBuilder - - com.google.protobuf.WrappersProto diff --git a/flink/v2.0/flink-runtime/runtime-deps.txt b/flink/v2.0/flink-runtime/runtime-deps.txt new file mode 100644 index 000000000000..2eba0b183fb9 --- /dev/null +++ b/flink/v2.0/flink-runtime/runtime-deps.txt @@ -0,0 +1,47 @@ +com.fasterxml.jackson.core:jackson-annotations:2.21 +com.fasterxml.jackson.core:jackson-core:2.21.3 +com.fasterxml.jackson.core:jackson-databind:2.21.3 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.21.3 +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.github.luben:zstd-jni:1.5.7-3 +com.google.errorprone:error_prone_annotations:2.41.0 +com.google.flatbuffers:flatbuffers-java:25.2.10 +com.google.guava:failureaccess:1.0.3 +com.google.guava:guava:33.5.0-jre +com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava +com.google.j2objc:j2objc-annotations:3.1 +dev.failsafe:failsafe:3.3.2 
+dev.vortex:vortex-jni:0.67.0 +io.airlift:aircompressor:2.0.3 +io.netty:netty-buffer:4.2.7.Final +io.netty:netty-common:4.2.7.Final +org.apache.arrow:arrow-c-data:18.3.0 +org.apache.arrow:arrow-format:18.3.0 +org.apache.arrow:arrow-memory-core:18.3.0 +org.apache.arrow:arrow-memory-netty-buffer-patch:18.3.0 +org.apache.arrow:arrow-memory-netty:18.3.0 +org.apache.arrow:arrow-vector:18.3.0 +org.apache.avro:avro:1.12.1 +org.apache.datasketches:datasketches-java:6.2.0 +org.apache.datasketches:datasketches-memory:3.0.2 +org.apache.httpcomponents.client5:httpclient5:5.6.1 +org.apache.httpcomponents.core5:httpcore5-h2:5.4 +org.apache.httpcomponents.core5:httpcore5:5.4 +org.apache.orc:orc-core:1.9.8 +org.apache.orc:orc-shims:1.9.8 +org.apache.parquet:parquet-avro:1.17.0 +org.apache.parquet:parquet-column:1.17.0 +org.apache.parquet:parquet-common:1.17.0 +org.apache.parquet:parquet-encoding:1.17.0 +org.apache.parquet:parquet-format-structures:1.17.0 +org.apache.parquet:parquet-hadoop:1.17.0 +org.apache.parquet:parquet-jackson:1.17.0 +org.apache.parquet:parquet-variant:1.17.0 +org.checkerframework:checker-qual:3.19.0 +org.eclipse.microprofile.openapi:microprofile-openapi-api:4.1.1 +org.jspecify:jspecify:1.0.0 +org.locationtech.jts:jts-core:1.20.0 +org.projectnessie.nessie:nessie-client:0.107.5 +org.projectnessie.nessie:nessie-model:0.107.5 +org.roaringbitmap:RoaringBitmap:1.6.14 +org.threeten:threeten-extra:1.7.1 diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java index e0672811cf5f..7661372c88e8 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java @@ -44,7 +44,7 @@ public FlinkConfParser(Table table, Map options, ReadableConfig this.readableConfig = readableConfig; } - FlinkConfParser(Map options, ReadableConfig readableConfig) { + public 
FlinkConfParser(Map options, ReadableConfig readableConfig) { this.tableProperties = ImmutableMap.of(); this.options = options; this.readableConfig = readableConfig; diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java index 408065f06057..8f106da8d56b 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java @@ -137,11 +137,17 @@ public Type visit(TimeType timeType) { @Override public Type visit(TimestampType timestampType) { + if (timestampType.getPrecision() > 6) { + return Types.TimestampNanoType.withoutZone(); + } return Types.TimestampType.withoutZone(); } @Override public Type visit(LocalZonedTimestampType localZonedTimestampType) { + if (localZonedTimestampType.getPrecision() > 6) { + return Types.TimestampNanoType.withZone(); + } return Types.TimestampType.withZone(); } diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java index 3ef611f2ded5..920e44b24b31 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java @@ -114,19 +114,35 @@ private static PositionalGetter buildGetter(LogicalType logicalType, Type typ case TIMESTAMP_WITHOUT_TIME_ZONE: TimestampType timestampType = (TimestampType) logicalType; - return (row, pos) -> { - LocalDateTime localDateTime = - row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); - return DateTimeUtil.microsFromTimestamp(localDateTime); - }; + if (type.typeId() == Type.TypeID.TIMESTAMP_NANO) { + return (row, pos) -> { + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + return 
DateTimeUtil.nanosFromTimestamp(localDateTime); + }; + } else { + return (row, pos) -> { + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + return DateTimeUtil.microsFromTimestamp(localDateTime); + }; + } case TIMESTAMP_WITH_LOCAL_TIME_ZONE: LocalZonedTimestampType lzTs = (LocalZonedTimestampType) logicalType; - return (row, pos) -> { - TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); - return timestampData.getMillisecond() * 1000 - + timestampData.getNanoOfMillisecond() / 1000; - }; + if (type.typeId() == Type.TypeID.TIMESTAMP_NANO) { + return (row, pos) -> { + TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); + return timestampData.getMillisecond() * 1_000_000L + + timestampData.getNanoOfMillisecond(); + }; + } else { + return (row, pos) -> { + TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); + return timestampData.getMillisecond() * 1000L + + timestampData.getNanoOfMillisecond() / 1000; + }; + } case ROW: RowType rowType = (RowType) logicalType; diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java index 65b9d44ad4b8..77f16bfdb2ab 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java @@ -70,7 +70,7 @@ public OrcValueReader record( TypeDescription record, List names, List> fields) { - return FlinkOrcReaders.struct(fields, iStruct, idToConstant); + return FlinkOrcReaders.struct(record, fields, iStruct, idToConstant); } @Override @@ -112,6 +112,13 @@ public OrcValueReader primitive(Type.PrimitiveType iPrimitive, TypeDescriptio } else { return FlinkOrcReaders.timestamps(); } + case TIMESTAMP_NANO: + Types.TimestampNanoType timestampNanoType = (Types.TimestampNanoType) iPrimitive; + if 
(timestampNanoType.shouldAdjustToUTC()) { + return FlinkOrcReaders.timestampTzs(); + } else { + return FlinkOrcReaders.timestamps(); + } case STRING: return FlinkOrcReaders.strings(); case UUID: diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java index 7a4a15c7e600..c5c958fbdb04 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java @@ -39,6 +39,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; +import org.apache.orc.TypeDescription; import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; import org.apache.orc.storage.ql.exec.vector.ColumnVector; import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; @@ -91,8 +92,11 @@ public static OrcValueReader map( } public static OrcValueReader struct( - List> readers, Types.StructType struct, Map idToConstant) { - return new StructReader(readers, struct, idToConstant); + TypeDescription record, + List> readers, + Types.StructType struct, + Map idToConstant) { + return new StructReader(record, readers, struct, idToConstant); } private static class StringReader implements OrcValueReader { @@ -265,8 +269,11 @@ private static class StructReader extends OrcValueReaders.StructReader private final int numFields; StructReader( - List> readers, Types.StructType struct, Map idToConstant) { - super(readers, struct, idToConstant); + TypeDescription record, + List> readers, + Types.StructType struct, + Map idToConstant) { + super(record, readers, struct, idToConstant); this.numFields = struct.fields().size(); } diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java 
b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java index a467d848337d..c1b46252e18a 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java @@ -145,6 +145,13 @@ public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, LogicalType fl } else { return FlinkOrcWriters.timestamps(); } + case TIMESTAMP_NANO: + Types.TimestampNanoType timestampNanoType = (Types.TimestampNanoType) iPrimitive; + if (timestampNanoType.shouldAdjustToUTC()) { + return FlinkOrcWriters.timestampNanoTzs(); + } else { + return FlinkOrcWriters.timestampNanos(); + } case STRING: return FlinkOrcWriters.strings(); case UUID: diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java index 684842aa099c..bf19a46c05fb 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java @@ -70,6 +70,14 @@ static OrcValueWriter timestampTzs() { return TimestampTzWriter.INSTANCE; } + static OrcValueWriter timestampNanos() { + return TimestampNanoWriter.INSTANCE; + } + + static OrcValueWriter timestampNanoTzs() { + return TimestampNanoTzWriter.INSTANCE; + } + static OrcValueWriter decimals(int precision, int scale) { if (precision <= 18) { return new Decimal18Writer(precision, scale); @@ -170,6 +178,35 @@ public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { } } + private static class TimestampNanoWriter implements OrcValueWriter { + private static final TimestampNanoWriter INSTANCE = new TimestampNanoWriter(); + + @Override + public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { + TimestampColumnVector cv = (TimestampColumnVector) output; + cv.setIsUTC(true); + // 
millis + OffsetDateTime offsetDateTime = data.toInstant().atOffset(ZoneOffset.UTC); + cv.time[rowId] = + offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; + cv.nanos[rowId] = offsetDateTime.getNano(); + } + } + + private static class TimestampNanoTzWriter implements OrcValueWriter { + private static final TimestampNanoTzWriter INSTANCE = new TimestampNanoTzWriter(); + + @SuppressWarnings("JavaInstantGetSecondsGetNano") + @Override + public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { + TimestampColumnVector cv = (TimestampColumnVector) output; + // millis + Instant instant = data.toInstant(); + cv.time[rowId] = instant.toEpochMilli(); + cv.nanos[rowId] = instant.getNano(); + } + } + private static class Decimal18Writer implements OrcValueWriter { private final int precision; private final int scale; diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java index f23a7ee3d0d3..81bb55967992 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java @@ -69,6 +69,8 @@ public static Object convertConstant(Type type, Object value) { return (int) ((Long) value / 1000); case TIMESTAMP: // TimestampData return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromMicros((Long) value)); + case TIMESTAMP_NANO: + return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromNanos((Long) value)); case UUID: return UUIDUtil.convert((UUID) value); default: diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java index 34576a1e5c0b..b469f2310f42 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java +++ 
b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java @@ -48,6 +48,7 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; +import org.apache.iceberg.util.DateTimeUtil; @Internal public class StructRowData implements RowData { @@ -120,8 +121,8 @@ public int getInt(int pos) { if (integer instanceof Integer) { return (int) integer; - } else if (integer instanceof LocalDate) { - return (int) ((LocalDate) integer).toEpochDay(); + } else if (integer instanceof LocalDate localDate) { + return (int) localDate.toEpochDay(); } else if (integer instanceof LocalTime) { return (int) (((LocalTime) integer).toNanoOfDay() / 1000_000); } else { @@ -185,8 +186,27 @@ private BigDecimal getDecimalInternal(int pos) { @Override public TimestampData getTimestamp(int pos, int precision) { + if (precision > 6) { + Object timeVal = struct.get(pos, Object.class); + if (timeVal instanceof OffsetDateTime) { + OffsetDateTime odt = (OffsetDateTime) timeVal; + return TimestampData.fromEpochMillis( + odt.toInstant().toEpochMilli(), odt.getNano() % 1_000_000); + } else if (timeVal instanceof LocalDateTime) { + LocalDateTime ldt = (LocalDateTime) timeVal; + return TimestampData.fromEpochMillis( + ldt.toInstant(ZoneOffset.UTC).toEpochMilli(), ldt.getNano() % 1_000_000); + } else if (timeVal instanceof Long) { + long timeLong = (Long) timeVal; + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1_000_000L), (int) Math.floorMod(timeLong, 1_000_000L)); + } else { + throw new IllegalStateException("Unknown type for timestamp_ns: " + timeVal.getClass()); + } + } long timeLong = getLong(pos); - return TimestampData.fromEpochMillis(timeLong / 1000, (int) (timeLong % 1000) * 1000); + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1000L), (int) Math.floorMod(timeLong, 1000L) * 1000); } @Override @@ -257,9 +277,29 @@ private Object convertValue(Type elementType, Object value) { case 
DECIMAL: return value; case TIMESTAMP: - long millisecond = (long) value / 1000; - int nanoOfMillisecond = (int) ((Long) value % 1000) * 1000; - return TimestampData.fromEpochMillis(millisecond, nanoOfMillisecond); + long timeMillis; + if (value instanceof LocalDateTime localDateTime) { + timeMillis = DateTimeUtil.microsFromTimestamp(localDateTime) / 1000L; + } else if (value instanceof OffsetDateTime offsetDateTime) { + timeMillis = DateTimeUtil.microsFromTimestamptz(offsetDateTime) / 1000L; + } else { + timeMillis = Math.floorDiv((Long) value, 1000L); + } + return TimestampData.fromEpochMillis( + timeMillis, + (int) Math.floorMod(value instanceof Long ? (Long) value : timeMillis * 1000L, 1000L) + * 1000); + case TIMESTAMP_NANO: + long nanoLong; + if (value instanceof LocalDateTime localDateTime) { + nanoLong = DateTimeUtil.nanosFromTimestamp(localDateTime); + } else if (value instanceof OffsetDateTime offsetDateTime) { + nanoLong = DateTimeUtil.nanosFromTimestamptz(offsetDateTime); + } else { + nanoLong = (Long) value; + } + return TimestampData.fromEpochMillis( + Math.floorDiv(nanoLong, 1_000_000L), (int) Math.floorMod(nanoLong, 1_000_000L)); case STRING: return StringData.fromString(value.toString()); case FIXED: diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java new file mode 100644 index 000000000000..0f70e60a1b9f --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java @@ -0,0 +1,303 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro; + +import java.io.Serializable; +import java.lang.reflect.Array; +import java.nio.ByteBuffer; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.temporal.ChronoField; +import java.util.List; +import java.util.Map; +import org.apache.avro.generic.GenericFixed; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.utils.LogicalTypeUtils; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +/** + * Tool class used to 
convert from Avro {@link GenericRecord} to {@link RowData}. + * + * <p>

    This class is adapted in Iceberg to add support for nanosecond precision timestamps + * (FLINK-39251). Once that ticket is resolved in Flink, this custom converter may be removed. + */ +@Internal +public class AvroToRowDataConverters { + + private AvroToRowDataConverters() {} + + /** + * Runtime converter that converts Avro data structures into objects of Flink Table & SQL + * internal data structures. + */ + @FunctionalInterface + public interface AvroToRowDataConverter extends Serializable { + Object convert(Object object); + } + + // ------------------------------------------------------------------------------------- + // Runtime Converters + // ------------------------------------------------------------------------------------- + + public static AvroToRowDataConverter createRowConverter(RowType rowType) { + return createRowConverter(rowType, true); + } + + public static AvroToRowDataConverter createRowConverter( + RowType rowType, boolean legacyTimestampMapping) { + final AvroToRowDataConverter[] fieldConverters = + rowType.getFields().stream() + .map(RowType.RowField::getType) + .map(type -> createNullableConverter(type, legacyTimestampMapping)) + .toArray(AvroToRowDataConverter[]::new); + final int arity = rowType.getFieldCount(); + + return avroObject -> { + IndexedRecord record = (IndexedRecord) avroObject; + GenericRowData row = new GenericRowData(arity); + for (int i = 0; i < arity; ++i) { + // avro always deserialize successfully even though the type isn't matched + // so no need to throw exception about which field can't be deserialized + row.setField(i, fieldConverters[i].convert(record.get(i))); + } + return row; + }; + } + + /** Creates a runtime converter which is null safe. 
*/ + private static AvroToRowDataConverter createNullableConverter( + LogicalType type, boolean legacyTimestampMapping) { + final AvroToRowDataConverter converter = createConverter(type, legacyTimestampMapping); + return avroObject -> { + if (avroObject == null) { + return null; + } + return converter.convert(avroObject); + }; + } + + /** Creates a runtime converter which assuming input object is not null. */ + private static AvroToRowDataConverter createConverter( + LogicalType type, boolean legacyTimestampMapping) { + switch (type.getTypeRoot()) { + case NULL: + return avroObject -> null; + case TINYINT: + return avroObject -> ((Integer) avroObject).byteValue(); + case SMALLINT: + return avroObject -> ((Integer) avroObject).shortValue(); + case BOOLEAN: // boolean + case INTEGER: // int + case INTERVAL_YEAR_MONTH: // long + case BIGINT: // long + case INTERVAL_DAY_TIME: // long + case FLOAT: // float + case DOUBLE: // double + return avroObject -> avroObject; + case DATE: + return AvroToRowDataConverters::convertToDate; + case TIME_WITHOUT_TIME_ZONE: + return AvroToRowDataConverters::convertToTime; + case TIMESTAMP_WITHOUT_TIME_ZONE: + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return avroObject -> convertToTimestamp(avroObject, type); + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + if (legacyTimestampMapping) { + throw new UnsupportedOperationException("Unsupported type: " + type); + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return avroObject -> convertToTimestamp(avroObject, type); + } + case CHAR: + case VARCHAR: + return avroObject -> StringData.fromString(avroObject.toString()); + case BINARY: + case VARBINARY: + return AvroToRowDataConverters::convertToBytes; + case DECIMAL: + return createDecimalConverter((DecimalType) type); + case ARRAY: + return createArrayConverter((ArrayType) type, legacyTimestampMapping); + case ROW: + return createRowConverter((RowType) type, legacyTimestampMapping); + case MAP: + case MULTISET: +
return createMapConverter(type, legacyTimestampMapping); + case RAW: + default: + throw new UnsupportedOperationException("Unsupported type: " + type); + } + } + + private static AvroToRowDataConverter createDecimalConverter(DecimalType decimalType) { + final int precision = decimalType.getPrecision(); + final int scale = decimalType.getScale(); + return avroObject -> { + final byte[] bytes; + if (avroObject instanceof GenericFixed) { + bytes = ((GenericFixed) avroObject).bytes(); + } else if (avroObject instanceof ByteBuffer) { + ByteBuffer byteBuffer = (ByteBuffer) avroObject; + bytes = new byte[byteBuffer.remaining()]; + byteBuffer.get(bytes); + } else { + bytes = (byte[]) avroObject; + } + return DecimalData.fromUnscaledBytes(bytes, precision, scale); + }; + } + + private static AvroToRowDataConverter createArrayConverter( + ArrayType arrayType, boolean legacyTimestampMapping) { + final AvroToRowDataConverter elementConverter = + createNullableConverter(arrayType.getElementType(), legacyTimestampMapping); + final Class elementClass = + LogicalTypeUtils.toInternalConversionClass(arrayType.getElementType()); + + return avroObject -> { + final List list = (List) avroObject; + final int length = list.size(); + final Object[] array = (Object[]) Array.newInstance(elementClass, length); + for (int i = 0; i < length; ++i) { + array[i] = elementConverter.convert(list.get(i)); + } + return new GenericArrayData(array); + }; + } + + private static AvroToRowDataConverter createMapConverter( + LogicalType type, boolean legacyTimestampMapping) { + final AvroToRowDataConverter keyConverter = + createConverter(DataTypes.STRING().getLogicalType(), legacyTimestampMapping); + final AvroToRowDataConverter valueConverter = + createNullableConverter( + AvroSchemaConverter.extractValueTypeToAvroMap(type), legacyTimestampMapping); + + return avroObject -> { + final Map map = (Map) avroObject; + Map result = Maps.newHashMap(); + for (Map.Entry entry : map.entrySet()) { + Object key = 
keyConverter.convert(entry.getKey()); + Object value = valueConverter.convert(entry.getValue()); + result.put(key, value); + } + return new GenericMapData(result); + }; + } + + private static TimestampData convertToTimestamp(Object object, LogicalType type) { + int precision = 3; + if (type instanceof org.apache.flink.table.types.logical.TimestampType) { + precision = ((org.apache.flink.table.types.logical.TimestampType) type).getPrecision(); + } else if (type instanceof org.apache.flink.table.types.logical.LocalZonedTimestampType) { + precision = + ((org.apache.flink.table.types.logical.LocalZonedTimestampType) type).getPrecision(); + } + + if (object instanceof Long) { + long timeLong = (Long) object; + if (precision <= 3) { + return TimestampData.fromEpochMillis(timeLong); + } else if (precision <= 6) { + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1000L), (int) Math.floorMod(timeLong, 1000L) * 1000); + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1_000_000L), (int) Math.floorMod(timeLong, 1_000_000L)); + } + } else if (object instanceof Instant) { + return TimestampData.fromInstant((Instant) object); + } else if (object instanceof LocalDateTime) { + return TimestampData.fromLocalDateTime((LocalDateTime) object); + } else { + JodaConverter jodaConverter = JodaConverter.getConverter(); + if (jodaConverter != null) { + return TimestampData.fromEpochMillis(jodaConverter.convertTimestamp(object)); + } else { + throw new IllegalArgumentException( + "Unexpected object type for TIMESTAMP logical type.
Received: " + object); + } + } + } + + private static int convertToDate(Object object) { + if (object instanceof Integer) { + return (Integer) object; + } else if (object instanceof LocalDate) { + return (int) ((LocalDate) object).toEpochDay(); + } else { + JodaConverter jodaConverter = JodaConverter.getConverter(); + if (jodaConverter != null) { + return (int) jodaConverter.convertDate(object); + } else { + throw new IllegalArgumentException( + "Unexpected object type for DATE logical type. Received: " + object); + } + } + } + + private static int convertToTime(Object object) { + final int millis; + if (object instanceof Integer) { + millis = (Integer) object; + } else if (object instanceof LocalTime) { + millis = ((LocalTime) object).get(ChronoField.MILLI_OF_DAY); + } else { + JodaConverter jodaConverter = JodaConverter.getConverter(); + if (jodaConverter != null) { + millis = jodaConverter.convertTime(object); + } else { + throw new IllegalArgumentException( + "Unexpected object type for TIME logical type. Received: " + object); + } + } + return millis; + } + + private static byte[] convertToBytes(Object object) { + if (object instanceof GenericFixed) { + return ((GenericFixed) object).bytes(); + } else if (object instanceof ByteBuffer) { + ByteBuffer byteBuffer = (ByteBuffer) object; + byte[] bytes = new byte[byteBuffer.remaining()]; + byteBuffer.get(bytes); + return bytes; + } else { + return (byte[]) object; + } + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java new file mode 100644 index 000000000000..c30b78023345 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro; + +import org.joda.time.DateTime; +import org.joda.time.DateTimeFieldType; +import org.joda.time.LocalDate; +import org.joda.time.LocalTime; + +/** + * Encapsulates joda optional dependency. Instantiates this class only if joda is available on the + * classpath. 
+ */ +@SuppressWarnings("JavaUtilDate") +class JodaConverter { + + private static JodaConverter instance; + private static boolean instantiated = false; + + public static JodaConverter getConverter() { + if (instantiated) { + return instance; + } + + try { + Class.forName( + "org.joda.time.DateTime", false, Thread.currentThread().getContextClassLoader()); + instance = new JodaConverter(); + } catch (ClassNotFoundException e) { + instance = null; + } finally { + instantiated = true; + } + return instance; + } + + public long convertDate(Object object) { + final LocalDate value = (LocalDate) object; + return value.toDate().getTime(); + } + + public int convertTime(Object object) { + final LocalTime value = (LocalTime) object; + return value.get(DateTimeFieldType.millisOfDay()); + } + + public long convertTimestamp(Object object) { + final DateTime value = (DateTime) object; + return value.toDate().getTime(); + } + + private JodaConverter() {} +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java new file mode 100644 index 000000000000..d4c7e4282d6e --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java @@ -0,0 +1,394 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.time.ZoneOffset; +import java.util.List; +import java.util.Map; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.util.Utf8; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.util.CollectionUtil; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +/** + * Tool class used to convert from {@link RowData} to Avro {@link GenericRecord}. + * + *

    This class is adapted in Iceberg to add support for nanosecond precision timestamps + * (FLINK-39251). Once that ticket is resolved in Flink, this custom converter may be removed. + */ +@Internal +public class RowDataToAvroConverters { + + private RowDataToAvroConverters() {} + + // -------------------------------------------------------------------------------- + // Runtime Converters + // -------------------------------------------------------------------------------- + + /** + * Runtime converter that converts objects of Flink Table & SQL internal data structures to + * corresponding Avro data structures. + */ + @FunctionalInterface + public interface RowDataToAvroConverter extends Serializable { + Object convert(Schema schema, Object object); + } + + // -------------------------------------------------------------------------------- + // IMPORTANT! We use anonymous classes instead of lambdas for a reason here. It is + // necessary because the maven shade plugin cannot relocate classes in + // SerializedLambdas (MSHADE-260). On the other hand we want to relocate Avro for + // sql-client uber jars. + // -------------------------------------------------------------------------------- + + /** + * Creates a runtime converter according to the given logical type that converts objects of Flink + * Table & SQL internal data structures to corresponding Avro data structures. 
+ */ + public static RowDataToAvroConverter createConverter(LogicalType type) { + return createConverter(type, true); + } + + @SuppressWarnings("checkstyle:MethodLength") + public static RowDataToAvroConverter createConverter( + LogicalType type, boolean legacyTimestampMapping) { + final RowDataToAvroConverter converter; + switch (type.getTypeRoot()) { + case NULL: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return null; + } + }; + break; + case TINYINT: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ((Byte) object).intValue(); + } + }; + break; + case SMALLINT: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ((Short) object).intValue(); + } + }; + break; + case BOOLEAN: // boolean + case INTEGER: // int + case INTERVAL_YEAR_MONTH: // long + case BIGINT: // long + case INTERVAL_DAY_TIME: // long + case FLOAT: // float + case DOUBLE: // double + case TIME_WITHOUT_TIME_ZONE: // int + case DATE: // int + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return object; + } + }; + break; + case CHAR: + case VARCHAR: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return new Utf8(object.toString()); + } + }; + break; + case BINARY: + case VARBINARY: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ByteBuffer.wrap((byte[]) object); + 
} + }; + break; + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + case TIMESTAMP_WITHOUT_TIME_ZONE: + final int tzPrecision; + if (type instanceof org.apache.flink.table.types.logical.TimestampType) { + tzPrecision = ((org.apache.flink.table.types.logical.TimestampType) type).getPrecision(); + } else { + tzPrecision = 3; + } + if (legacyTimestampMapping) { + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + TimestampData timestampData = (TimestampData) object; + if (tzPrecision <= 3) { + return timestampData.getMillisecond(); + } else if (tzPrecision <= 6) { + return timestampData.getMillisecond() * 1000L + + timestampData.getNanoOfMillisecond() / 1000; + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return timestampData.getMillisecond() * 1_000_000L + + timestampData.getNanoOfMillisecond(); + } + } + }; + } else { + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + TimestampData timestampData = (TimestampData) object; + java.time.Instant instant = + timestampData.toLocalDateTime().toInstant(ZoneOffset.UTC); + if (tzPrecision <= 3) { + return instant.toEpochMilli(); + } else if (tzPrecision <= 6) { + return instant.getEpochSecond() * 1_000_000L + instant.getNano() / 1000; + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return instant.getEpochSecond() * 1_000_000_000L + instant.getNano(); + } + } + }; + } + break; + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + final int ltzPrecision; + if (type instanceof org.apache.flink.table.types.logical.LocalZonedTimestampType) { + ltzPrecision = + ((org.apache.flink.table.types.logical.LocalZonedTimestampType) type).getPrecision(); + } else { + 
ltzPrecision = 3; + } + if (legacyTimestampMapping) { + throw new UnsupportedOperationException("Unsupported type: " + type); + } else { + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + TimestampData timestampData = (TimestampData) object; + if (ltzPrecision <= 3) { + return timestampData.getMillisecond(); + } else if (ltzPrecision <= 6) { + return timestampData.getMillisecond() * 1000L + + timestampData.getNanoOfMillisecond() / 1000; + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return timestampData.getMillisecond() * 1_000_000L + + timestampData.getNanoOfMillisecond(); + } + } + }; + } + break; + case DECIMAL: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ByteBuffer.wrap(((DecimalData) object).toUnscaledBytes()); + } + }; + break; + case ARRAY: + converter = createArrayConverter((ArrayType) type, legacyTimestampMapping); + break; + case ROW: + converter = createRowConverter((RowType) type, legacyTimestampMapping); + break; + case MAP: + case MULTISET: + converter = createMapConverter(type, legacyTimestampMapping); + break; + case RAW: + default: + throw new UnsupportedOperationException("Unsupported type: " + type); + } + + // wrap into nullable converter + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + if (object == null) { + return null; + } + + // get actual schema if it is a nullable schema + Schema actualSchema; + if (schema.getType() == Schema.Type.UNION) { + List types = schema.getTypes(); + int size = types.size(); + if (size == 2 && types.get(1).getType() == Schema.Type.NULL) { + actualSchema = types.get(0); + } else if (size == 2 && types.get(0).getType() == 
Schema.Type.NULL) { + actualSchema = types.get(1); + } else { + throw new IllegalArgumentException( + "The Avro schema is not a nullable type: " + schema.toString()); + } + } else { + actualSchema = schema; + } + return converter.convert(actualSchema, object); + } + }; + } + + private static RowDataToAvroConverter createRowConverter( + RowType rowType, boolean legacyTimestampMapping) { + final RowDataToAvroConverter[] fieldConverters = + rowType.getChildren().stream() + .map(legacyType -> createConverter(legacyType, legacyTimestampMapping)) + .toArray(RowDataToAvroConverter[]::new); + final LogicalType[] fieldTypes = + rowType.getFields().stream().map(RowType.RowField::getType).toArray(LogicalType[]::new); + final RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[fieldTypes.length]; + for (int i = 0; i < fieldTypes.length; i++) { + fieldGetters[i] = RowData.createFieldGetter(fieldTypes[i], i); + } + final int length = rowType.getFieldCount(); + + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + final RowData row = (RowData) object; + final List fields = schema.getFields(); + final GenericRecord record = new GenericData.Record(schema); + for (int i = 0; i < length; ++i) { + final Schema.Field schemaField = fields.get(i); + try { + Object avroObject = + fieldConverters[i].convert( + schemaField.schema(), fieldGetters[i].getFieldOrNull(row)); + record.put(i, avroObject); + } catch (Throwable t) { + throw new RuntimeException( + String.format("Fail to serialize at field: %s.", schemaField.name()), t); + } + } + return record; + } + }; + } + + private static RowDataToAvroConverter createArrayConverter( + ArrayType arrayType, boolean legacyTimestampMapping) { + LogicalType elementType = arrayType.getElementType(); + final ArrayData.ElementGetter elementGetter = ArrayData.createElementGetter(elementType); + final RowDataToAvroConverter 
elementConverter = + createConverter(arrayType.getElementType(), legacyTimestampMapping); + + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + final Schema elementSchema = schema.getElementType(); + ArrayData arrayData = (ArrayData) object; + List list = Lists.newArrayList(); + for (int i = 0; i < arrayData.size(); ++i) { + list.add( + elementConverter.convert( + elementSchema, elementGetter.getElementOrNull(arrayData, i))); + } + return list; + } + }; + } + + private static RowDataToAvroConverter createMapConverter( + LogicalType type, boolean legacyTimestampMapping) { + LogicalType valueType = AvroSchemaConverter.extractValueTypeToAvroMap(type); + final ArrayData.ElementGetter valueGetter = ArrayData.createElementGetter(valueType); + final RowDataToAvroConverter valueConverter = + createConverter(valueType, legacyTimestampMapping); + + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + final Schema valueSchema = schema.getValueType(); + final MapData mapData = (MapData) object; + final ArrayData keyArray = mapData.keyArray(); + final ArrayData valueArray = mapData.valueArray(); + final Map map = CollectionUtil.newHashMapWithExpectedSize(mapData.size()); + for (int i = 0; i < mapData.size(); ++i) { + final String key = keyArray.getString(i).toString(); + final Object value = + valueConverter.convert(valueSchema, valueGetter.getElementOrNull(valueArray, i)); + map.put(key, value); + } + return map; + } + }; + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java new file mode 100644 index 000000000000..fb77c124e504 --- /dev/null +++ 
b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java @@ -0,0 +1,625 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro.typeutils; + +import java.util.List; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.apache.avro.SchemaParseException; +import org.apache.avro.specific.SpecificData; +import org.apache.avro.specific.SpecificRecord; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.formats.avro.AvroRowDataDeserializationSchema; +import org.apache.flink.formats.avro.AvroRowDataSerializationSchema; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.legacy.types.logical.TypeInformationRawType; +import org.apache.flink.table.types.AtomicDataType; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.IntType; +import 
org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeFamily; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.MultisetType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimeType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.types.Row; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * Converts an Avro schema into Flink's type information. It uses {@link RowTypeInfo} for + * representing objects and converts Avro types into types that are compatible with Flink's Table + * & SQL API. + * + *

    <p>Note: Changes in this class need to be kept in sync with the corresponding runtime classes + * {@link AvroRowDataDeserializationSchema} and {@link AvroRowDataSerializationSchema}. + * + * <p>

    This class is adapted in Iceberg to support custom 'timestamp-nanos' and + * 'local-timestamp-nanos' logical types (FLINK-39251). Once that ticket is resolved in Flink, these + * custom types may be removed. + */ +public class AvroSchemaConverter { + + private AvroSchemaConverter() { + // private + } + + /** + * Converts an Avro class into a nested row structure with deterministic field order and data + * types that are compatible with Flink's Table & SQL API. + * + * @param avroClass Avro specific record that contains schema information + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo( + Class avroClass) { + return convertToTypeInfo(avroClass, true); + } + + /** + * Converts an Avro class into a nested row structure with deterministic field order and data + * types that are compatible with Flink's Table & SQL API. + * + * @param avroClass Avro specific record that contains schema information + * @param legacyTimestampMapping legacy mapping of timestamp types + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo( + Class avroClass, boolean legacyTimestampMapping) { + Preconditions.checkNotNull(avroClass, "Avro specific record class must not be null."); + // determine schema to retrieve deterministic field order + final Schema schema = SpecificData.get().getSchema(avroClass); + return (TypeInformation) convertToTypeInfo(schema, legacyTimestampMapping); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API.
+ * + * @param avroSchemaString Avro schema definition string + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo(String avroSchemaString) { + return convertToTypeInfo(avroSchemaString, true); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. + * + * @param avroSchemaString Avro schema definition string + * @param legacyTimestampMapping legacy mapping of timestamp types + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo( + String avroSchemaString, boolean legacyTimestampMapping) { + Preconditions.checkNotNull(avroSchemaString, "Avro schema must not be null."); + final Schema schema; + try { + schema = new Schema.Parser().parse(avroSchemaString); + } catch (SchemaParseException e) { + throw new IllegalArgumentException("Could not parse Avro schema string.", e); + } + return (TypeInformation) convertToTypeInfo(schema, legacyTimestampMapping); + } + + private static TypeInformation convertToTypeInfo( + Schema schema, boolean legacyTimestampMapping) { + switch (schema.getType()) { + case RECORD: + final List fields = schema.getFields(); + + final TypeInformation[] types = new TypeInformation[fields.size()]; + final String[] names = new String[fields.size()]; + for (int i = 0; i < fields.size(); i++) { + final Schema.Field field = fields.get(i); + types[i] = convertToTypeInfo(field.schema(), legacyTimestampMapping); + names[i] = field.name(); + } + return Types.ROW_NAMED(names, types); + case ENUM: + return Types.STRING; + case ARRAY: + // result type might either be ObjectArrayTypeInfo or BasicArrayTypeInfo for Strings + return Types.OBJECT_ARRAY( + convertToTypeInfo(schema.getElementType(), legacyTimestampMapping)); + case MAP: + return Types.MAP( + Types.STRING, 
convertToTypeInfo(schema.getValueType(), legacyTimestampMapping)); + case UNION: + final Schema actualSchema; + if (schema.getTypes().size() == 2 + && schema.getTypes().get(0).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(1); + } else if (schema.getTypes().size() == 2 + && schema.getTypes().get(1).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(0); + } else if (schema.getTypes().size() == 1) { + actualSchema = schema.getTypes().get(0); + } else { + // use Kryo for serialization + return Types.GENERIC(Object.class); + } + return convertToTypeInfo(actualSchema, legacyTimestampMapping); + case FIXED: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + return Types.BIG_DEC; + } + // convert fixed size binary data to primitive byte arrays + return Types.PRIMITIVE_ARRAY(Types.BYTE); + case STRING: + // convert Avro's Utf8/CharSequence to String + return Types.STRING; + case BYTES: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + return Types.BIG_DEC; + } + return Types.PRIMITIVE_ARRAY(Types.BYTE); + case INT: + // logical date and time type + final org.apache.avro.LogicalType logicalType = schema.getLogicalType(); + if (logicalType == LogicalTypes.date()) { + return Types.SQL_DATE; + } else if (logicalType == LogicalTypes.timeMillis()) { + return Types.SQL_TIME; + } + return Types.INT; + case LONG: + if (legacyTimestampMapping) { + if (schema.getLogicalType() == LogicalTypes.timestampMillis() + || schema.getLogicalType() == LogicalTypes.timestampMicros() + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + || (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos"))) { + return Types.SQL_TIMESTAMP; + } else if (schema.getLogicalType() == LogicalTypes.timeMicros() + || schema.getLogicalType() == LogicalTypes.timeMillis()) { + return Types.SQL_TIME; + } + } else { + // Avro 
logical timestamp types to Flink DataStream timestamp types + if (schema.getLogicalType() == LogicalTypes.timestampMillis() + || schema.getLogicalType() == LogicalTypes.timestampMicros() + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + || (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos"))) { + return Types.INSTANT; + } else if (schema.getLogicalType() == LogicalTypes.localTimestampMillis() + || schema.getLogicalType() == LogicalTypes.localTimestampMicros() + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + || (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("local-timestamp-nanos"))) { + return Types.LOCAL_DATE_TIME; + } else if (schema.getLogicalType() == LogicalTypes.timeMicros() + || schema.getLogicalType() == LogicalTypes.timeMillis()) { + return Types.SQL_TIME; + } + } + return Types.LONG; + case FLOAT: + return Types.FLOAT; + case DOUBLE: + return Types.DOUBLE; + case BOOLEAN: + return Types.BOOLEAN; + case NULL: + return Types.VOID; + } + throw new IllegalArgumentException("Unsupported Avro type '" + schema.getType() + "'."); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. + * + * @param avroSchemaString Avro schema definition string + * @return data type matching the schema + */ + public static DataType convertToDataType(String avroSchemaString) { + return convertToDataType(avroSchemaString, true); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. 
+ * + * @param avroSchemaString Avro schema definition string + * @param legacyTimestampMapping legacy mapping of local timestamps + * @return data type matching the schema + */ + public static DataType convertToDataType( + String avroSchemaString, boolean legacyTimestampMapping) { + Preconditions.checkNotNull(avroSchemaString, "Avro schema must not be null."); + final Schema schema; + try { + schema = new Schema.Parser().parse(avroSchemaString); + } catch (SchemaParseException e) { + throw new IllegalArgumentException("Could not parse Avro schema string.", e); + } + return convertToDataType(schema, legacyTimestampMapping); + } + + @SuppressWarnings("deprecation") + private static DataType convertToDataType(Schema schema, boolean legacyMapping) { + switch (schema.getType()) { + case RECORD: + final List schemaFields = schema.getFields(); + + final DataTypes.Field[] fields = new DataTypes.Field[schemaFields.size()]; + for (int i = 0; i < schemaFields.size(); i++) { + final Schema.Field field = schemaFields.get(i); + fields[i] = + DataTypes.FIELD(field.name(), convertToDataType(field.schema(), legacyMapping)); + } + return DataTypes.ROW(fields).notNull(); + case ENUM: + return DataTypes.STRING().notNull(); + case ARRAY: + return DataTypes.ARRAY(convertToDataType(schema.getElementType(), legacyMapping)).notNull(); + case MAP: + return DataTypes.MAP( + DataTypes.STRING().notNull(), + convertToDataType(schema.getValueType(), legacyMapping)) + .notNull(); + case UNION: + final Schema actualSchema; + final boolean nullable; + if (schema.getTypes().size() == 2 + && schema.getTypes().get(0).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(1); + nullable = true; + } else if (schema.getTypes().size() == 2 + && schema.getTypes().get(1).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(0); + nullable = true; + } else if (schema.getTypes().size() == 1) { + actualSchema = schema.getTypes().get(0); + nullable = false; + } else { + // 
use Kryo for serialization + return new AtomicDataType( + new TypeInformationRawType<>(false, Types.GENERIC(Object.class))); + } + DataType converted = convertToDataType(actualSchema, legacyMapping); + return nullable ? converted.nullable() : converted; + case FIXED: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + final LogicalTypes.Decimal decimalType = (LogicalTypes.Decimal) schema.getLogicalType(); + return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale()).notNull(); + } + // convert fixed size binary data to primitive byte arrays + return DataTypes.VARBINARY(schema.getFixedSize()).notNull(); + case STRING: + // convert Avro's Utf8/CharSequence to String + return DataTypes.STRING().notNull(); + case BYTES: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + final LogicalTypes.Decimal decimalType = (LogicalTypes.Decimal) schema.getLogicalType(); + return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale()).notNull(); + } + return DataTypes.BYTES().notNull(); + case INT: + // logical date and time type + final org.apache.avro.LogicalType logicalType = schema.getLogicalType(); + if (logicalType == LogicalTypes.date()) { + return DataTypes.DATE().notNull(); + } else if (logicalType == LogicalTypes.timeMillis()) { + return DataTypes.TIME(3).notNull(); + } + return DataTypes.INT().notNull(); + case LONG: + if (legacyMapping) { + // Avro logical timestamp types to Flink SQL timestamp types + if (schema.getLogicalType() == LogicalTypes.timestampMillis()) { + return DataTypes.TIMESTAMP(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timestampMicros()) { + return DataTypes.TIMESTAMP(6).notNull(); + } else if (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos")) { + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + return DataTypes.TIMESTAMP(9).notNull(); + } 
else if (schema.getLogicalType() == LogicalTypes.timeMillis()) { + return DataTypes.TIME(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timeMicros()) { + return DataTypes.TIME(6).notNull(); + } + } else { + // Avro logical timestamp types to Flink SQL timestamp types + if (schema.getLogicalType() == LogicalTypes.timestampMillis()) { + return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timestampMicros()) { + return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(6).notNull(); + } else if (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos")) { + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(9).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timeMillis()) { + return DataTypes.TIME(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timeMicros()) { + return DataTypes.TIME(6).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.localTimestampMillis()) { + return DataTypes.TIMESTAMP(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.localTimestampMicros()) { + return DataTypes.TIMESTAMP(6).notNull(); + } else if (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("local-timestamp-nanos")) { + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + return DataTypes.TIMESTAMP(9).notNull(); + } + } + + return DataTypes.BIGINT().notNull(); + case FLOAT: + return DataTypes.FLOAT().notNull(); + case DOUBLE: + return DataTypes.DOUBLE().notNull(); + case BOOLEAN: + return DataTypes.BOOLEAN().notNull(); + case NULL: + return DataTypes.NULL(); + } + throw new IllegalArgumentException("Unsupported Avro type '" + schema.getType() + "'."); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *

    Use "org.apache.flink.avro.generated.record" as the type name. + * + * @param schema the schema type, usually it should be the top level record type, e.g. not a + * nested type + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema(LogicalType schema) { + return convertToSchema(schema, true); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *

    Use "org.apache.flink.avro.generated.record" as the type name. + * + * @param schema the schema type, usually it should be the top level record type, e.g. not a + * nested type + * @param legacyTimestampMapping whether to use the legacy timestamp mapping + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema(LogicalType schema, boolean legacyTimestampMapping) { + return convertToSchema( + schema, "org.apache.flink.avro.generated.record", legacyTimestampMapping); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *

    The "{rowName}_" is used as the nested row type name prefix in order to generate the right + * schema. Nested record type that only differs with type name is still compatible. + * + * @param logicalType logical type + * @param rowName the record name + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema(LogicalType logicalType, String rowName) { + return convertToSchema(logicalType, rowName, true); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *

    The "{rowName}_" is used as the nested row type name prefix in order to generate the right + * schema. Nested record type that only differs with type name is still compatible. + * + * @param logicalType logical type + * @param rowName the record name + * @param legacyTimestampMapping whether to use legacy timestamp mapping + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema( + LogicalType logicalType, String rowName, boolean legacyTimestampMapping) { + int precision; + boolean nullable = logicalType.isNullable(); + switch (logicalType.getTypeRoot()) { + case NULL: + return SchemaBuilder.builder().nullType(); + case BOOLEAN: + Schema bool = SchemaBuilder.builder().booleanType(); + return nullable ? nullableSchema(bool) : bool; + case TINYINT: + case SMALLINT: + case INTEGER: + Schema integer = SchemaBuilder.builder().intType(); + return nullable ? nullableSchema(integer) : integer; + case BIGINT: + Schema bigint = SchemaBuilder.builder().longType(); + return nullable ? nullableSchema(bigint) : bigint; + case FLOAT: + Schema floatSchema = SchemaBuilder.builder().floatType(); + return nullable ? nullableSchema(floatSchema) : floatSchema; + case DOUBLE: + Schema doubleSchema = SchemaBuilder.builder().doubleType(); + return nullable ? nullableSchema(doubleSchema) : doubleSchema; + case CHAR: + case VARCHAR: + Schema str = SchemaBuilder.builder().stringType(); + return nullable ? nullableSchema(str) : str; + case BINARY: + case VARBINARY: + Schema binary = SchemaBuilder.builder().bytesType(); + return nullable ? 
nullableSchema(binary) : binary; + case TIMESTAMP_WITHOUT_TIME_ZONE: + // use long to represents Timestamp + final TimestampType timestampType = (TimestampType) logicalType; + precision = timestampType.getPrecision(); + org.apache.avro.LogicalType avroLogicalType; + if (legacyTimestampMapping) { + if (precision <= 3) { + avroLogicalType = LogicalTypes.timestampMillis(); + } else { + throw new IllegalArgumentException( + "Avro does not support TIMESTAMP type " + + "with precision: " + + precision + + ", it only supports precision less than 3."); + } + } else { + if (precision <= 3) { + avroLogicalType = LogicalTypes.localTimestampMillis(); + } else if (precision <= 6) { + avroLogicalType = LogicalTypes.localTimestampMicros(); + } else { + throw new IllegalArgumentException( + "Avro does not support LOCAL TIMESTAMP type " + + "with precision: " + + precision + + ", it only supports precision less than 6."); + } + } + Schema timestamp = avroLogicalType.addToSchema(SchemaBuilder.builder().longType()); + return nullable ? nullableSchema(timestamp) : timestamp; + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + if (legacyTimestampMapping) { + throw new UnsupportedOperationException( + "Unsupported to derive Schema for type: " + logicalType); + } else { + final LocalZonedTimestampType localZonedTimestampType = + (LocalZonedTimestampType) logicalType; + precision = localZonedTimestampType.getPrecision(); + if (precision <= 3) { + avroLogicalType = LogicalTypes.timestampMillis(); + } else if (precision <= 6) { + avroLogicalType = LogicalTypes.timestampMicros(); + } else { + throw new IllegalArgumentException( + "Avro does not support TIMESTAMP type " + + "with precision: " + + precision + + ", it only supports precision less than 6."); + } + timestamp = avroLogicalType.addToSchema(SchemaBuilder.builder().longType()); + return nullable ? 
nullableSchema(timestamp) : timestamp; + } + case DATE: + // use int to represents Date + Schema date = LogicalTypes.date().addToSchema(SchemaBuilder.builder().intType()); + return nullable ? nullableSchema(date) : date; + case TIME_WITHOUT_TIME_ZONE: + precision = ((TimeType) logicalType).getPrecision(); + if (precision > 3) { + throw new IllegalArgumentException( + "Avro does not support TIME type with precision: " + + precision + + ", it only supports precision less than 3."); + } + // use int to represents Time, we only support millisecond when deserialization + Schema time = LogicalTypes.timeMillis().addToSchema(SchemaBuilder.builder().intType()); + return nullable ? nullableSchema(time) : time; + case DECIMAL: + DecimalType decimalType = (DecimalType) logicalType; + // store BigDecimal as byte[] + Schema decimal = + LogicalTypes.decimal(decimalType.getPrecision(), decimalType.getScale()) + .addToSchema(SchemaBuilder.builder().bytesType()); + return nullable ? nullableSchema(decimal) : decimal; + case ROW: + RowType rowType = (RowType) logicalType; + List fieldNames = rowType.getFieldNames(); + // we have to make sure the record name is different in a Schema + SchemaBuilder.FieldAssembler builder = + SchemaBuilder.builder().record(rowName).fields(); + for (int i = 0; i < rowType.getFieldCount(); i++) { + String fieldName = fieldNames.get(i); + LogicalType fieldType = rowType.getTypeAt(i); + SchemaBuilder.GenericDefault fieldBuilder = + builder + .name(fieldName) + .type( + convertToSchema( + fieldType, rowName + "_" + fieldName, legacyTimestampMapping)); + + if (fieldType.isNullable()) { + builder = fieldBuilder.withDefault(null); + } else { + builder = fieldBuilder.noDefault(); + } + } + Schema record = builder.endRecord(); + return nullable ? nullableSchema(record) : record; + case MULTISET: + case MAP: + Schema map = + SchemaBuilder.builder() + .map() + .values(convertToSchema(extractValueTypeToAvroMap(logicalType), rowName, legacyTimestampMapping)); + return nullable ? 
nullableSchema(map) : map; + case ARRAY: + ArrayType arrayType = (ArrayType) logicalType; + Schema array = + SchemaBuilder.builder() + .array() + .items(convertToSchema(arrayType.getElementType(), rowName, legacyTimestampMapping)); + return nullable ? nullableSchema(array) : array; + case RAW: + default: + throw new UnsupportedOperationException( + "Unsupported to derive Schema for type: " + logicalType); + } + } + + public static LogicalType extractValueTypeToAvroMap(LogicalType type) { + LogicalType keyType; + LogicalType valueType; + if (type instanceof MapType) { + MapType mapType = (MapType) type; + keyType = mapType.getKeyType(); + valueType = mapType.getValueType(); + } else { + MultisetType multisetType = (MultisetType) type; + keyType = multisetType.getElementType(); + valueType = new IntType(); + } + if (!keyType.is(LogicalTypeFamily.CHARACTER_STRING)) { + throw new UnsupportedOperationException( + "Avro format doesn't support non-string as key type of map. " + + "The key type is: " + + keyType.asSummaryString()); + } + return valueType; + } + + /** Returns schema with nullable true. */ + private static Schema nullableSchema(Schema schema) { + return schema.isNullable() + ? 
schema + : Schema.createUnion(SchemaBuilder.builder().nullType(), schema); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java index f68605accc57..30e95b1edba0 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java @@ -260,10 +260,6 @@ public void unlock() { this, instanceId, count); - } catch (SQLException e) { - // SQL exception happened when deleting lock information - throw new UncheckedSQLException( - e, "Failed to delete %s lock with instanceId %s", this, instanceId); } return null; @@ -298,9 +294,6 @@ private String instanceId() { return null; } } - } catch (SQLException e) { - // SQL exception happened when getting lock information - throw new UncheckedSQLException(e, "Failed to get lock information for %s", type); } }); } catch (InterruptedException e) { diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java index 9aeee75b1464..f03f33a3fd81 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java @@ -24,6 +24,7 @@ import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.util.function.SerializableSupplier; import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.actions.BinPackRewriteFilePlanner; import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; @@ -59,7 +60,7 @@ public static class 
Builder extends MaintenanceTaskBuilder rewriteOptions = Maps.newHashMapWithExpectedSize(6); private long maxRewriteBytes = Long.MAX_VALUE; - private Expression filter = Expressions.alwaysTrue(); + private SerializableSupplier filterSupplier = Expressions::alwaysTrue; private String branch = SnapshotRef.MAIN_BRANCH; @Override @@ -214,9 +215,32 @@ public Builder maxFilesToRewrite(int maxFilesToRewrite) { * * @param newFilter the filter expression to apply * @return this for method chaining + * @deprecated will be removed in 1.12.0. Use {@link #filter(SerializableSupplier)} instead */ + @Deprecated public Builder filter(Expression newFilter) { - this.filter = newFilter; + this.filterSupplier = () -> newFilter; + return this; + } + + /** + * A user-provided supplier of a filter expression that determines which files are considered by + * the rewrite strategy. + * + *

    The supplier is evaluated by the planner on every compaction trigger, allowing a fresh + * filter to be produced for each compaction run. + * + *

    This is particularly useful for time-relative filters. For example, a supplier such as + * {@code () -> Expressions.greaterThanOrEqual("ts", + * LocalDateTime.now(ZoneOffset.UTC).minus(Duration.ofDays(3)).toString())} ensures that each + * compaction rewrites files from the last 3 days relative to the time the compaction is + * planned, rather than relative to when the job was started. + * + * @param newFilterSupplier the supplier providing the filter expression to apply + * @return this for method chaining + */ + public Builder filter(SerializableSupplier newFilterSupplier) { + this.filterSupplier = newFilterSupplier; return this; } @@ -276,7 +300,7 @@ DataStream append(DataStream trigger) { partialProgressEnabled ? partialProgressMaxCommits : 1, maxRewriteBytes, rewriteOptions, - filter, + filterSupplier, branch)) .name(operatorName(PLANNER_TASK_NAME)) .uid(PLANNER_TASK_NAME + uidSuffix()) diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java index a9360374df28..b78c602c647f 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java @@ -26,6 +26,7 @@ import org.apache.flink.metrics.Counter; import org.apache.flink.streaming.api.functions.ProcessFunction; import org.apache.flink.util.Collector; +import org.apache.flink.util.function.SerializableSupplier; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.SerializableTable; @@ -62,8 +63,8 @@ public class DataFileRewritePlanner private final long maxRewriteBytes; private final Map rewriterOptions; private transient Counter errorCounter; - private final Expression filter; private final String branch; + private final 
SerializableSupplier filterSupplier; public DataFileRewritePlanner( String tableName, @@ -73,7 +74,7 @@ public DataFileRewritePlanner( int newPartialProgressMaxCommits, long maxRewriteBytes, Map rewriterOptions, - Expression filter, + SerializableSupplier filterSupplier, String branch) { Preconditions.checkNotNull(tableName, "Table name should no be null"); @@ -89,8 +90,8 @@ public DataFileRewritePlanner( this.partialProgressMaxCommits = newPartialProgressMaxCommits; this.maxRewriteBytes = maxRewriteBytes; this.rewriterOptions = rewriterOptions; - this.filter = filter; this.branch = branch; + this.filterSupplier = filterSupplier; } @Override @@ -125,7 +126,7 @@ public void processElement(Trigger value, Context ctx, Collector o } BinPackRewriteFilePlanner planner = - new BinPackRewriteFilePlanner(table, filter, snapshot.snapshotId(), false); + new BinPackRewriteFilePlanner(table, filterSupplier.get(), snapshot.snapshotId(), false); planner.init(rewriterOptions); FileRewritePlan diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java index f7e8e0c884cf..5f3494330cfc 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java @@ -21,14 +21,14 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.api.common.functions.MapFunction; -import org.apache.flink.formats.avro.AvroToRowDataConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.RowType; import 
org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.flink.formats.avro.AvroToRowDataConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; /** * This util class converts Avro GenericRecord to Flink RowData.
    diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java index 434f3969577f..6cf15ff713fb 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java @@ -18,23 +18,33 @@ */ package org.apache.iceberg.flink.sink; -import com.codahale.metrics.SlidingWindowReservoir; import java.util.Arrays; import java.util.concurrent.atomic.AtomicLong; import org.apache.flink.annotation.Internal; -import org.apache.flink.dropwizard.metrics.DropwizardHistogramWrapper; +import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.metrics.Counter; import org.apache.flink.metrics.Histogram; import org.apache.flink.metrics.MetricGroup; +import org.apache.iceberg.common.DynClasses; +import org.apache.iceberg.common.DynConstructors; import org.apache.iceberg.io.WriteResult; import org.apache.iceberg.util.ScanTaskUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; @Internal public class IcebergStreamWriterMetrics { + + private static final Logger LOG = LoggerFactory.getLogger(IcebergStreamWriterMetrics.class); + // 1,024 reservoir size should cost about 8KB, which is quite small. // It should also produce good accuracy for histogram distribution (like percentiles). private static final int HISTOGRAM_RESERVOIR_SIZE = 1024; + // Histogram metrics loaded through Flink's optional flink-metrics-dropwizard dependency. + // Will be null if not available. 
+ private static final DropwizardCtors DROPWIZARD = loadDropwizardCtors(); + private final Counter flushedDataFiles; private final Counter flushedDeleteFiles; private final Counter flushedReferencedDataFiles; @@ -51,18 +61,8 @@ public IcebergStreamWriterMetrics(MetricGroup metrics, String fullTableName) { this.lastFlushDurationMs = new AtomicLong(); writerMetrics.gauge("lastFlushDurationMs", lastFlushDurationMs::get); - com.codahale.metrics.Histogram dropwizardDataFilesSizeHistogram = - new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); - this.dataFilesSizeHistogram = - writerMetrics.histogram( - "dataFilesSizeHistogram", - new DropwizardHistogramWrapper(dropwizardDataFilesSizeHistogram)); - com.codahale.metrics.Histogram dropwizardDeleteFilesSizeHistogram = - new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); - this.deleteFilesSizeHistogram = - writerMetrics.histogram( - "deleteFilesSizeHistogram", - new DropwizardHistogramWrapper(dropwizardDeleteFilesSizeHistogram)); + this.dataFilesSizeHistogram = registerHistogram(writerMetrics, "dataFilesSizeHistogram"); + this.deleteFilesSizeHistogram = registerHistogram(writerMetrics, "deleteFilesSizeHistogram"); } public void updateFlushResult(WriteResult result) { @@ -74,16 +74,21 @@ public void updateFlushResult(WriteResult result) { // This should works equally well and we avoided the overhead of tracking the list of file sizes // in the {@link CommitSummary}, which currently stores simple stats for counters and gauges // metrics. 
- Arrays.stream(result.dataFiles()) - .forEach( - dataFile -> { - dataFilesSizeHistogram.update(dataFile.fileSizeInBytes()); - }); - Arrays.stream(result.deleteFiles()) - .forEach( - deleteFile -> { - deleteFilesSizeHistogram.update(ScanTaskUtil.contentSizeInBytes(deleteFile)); - }); + if (dataFilesSizeHistogram != null) { + Arrays.stream(result.dataFiles()) + .forEach( + dataFile -> { + dataFilesSizeHistogram.update(dataFile.fileSizeInBytes()); + }); + } + + if (deleteFilesSizeHistogram != null) { + Arrays.stream(result.deleteFiles()) + .forEach( + deleteFile -> { + deleteFilesSizeHistogram.update(ScanTaskUtil.contentSizeInBytes(deleteFile)); + }); + } } public void flushDuration(long flushDurationMs) { @@ -97,4 +102,60 @@ public Counter getFlushedDataFiles() { public Counter getFlushedDeleteFiles() { return flushedDeleteFiles; } + + @VisibleForTesting + Histogram dataFilesSizeHistogram() { + return dataFilesSizeHistogram; + } + + @VisibleForTesting + Histogram deleteFilesSizeHistogram() { + return deleteFilesSizeHistogram; + } + + private static Histogram registerHistogram(MetricGroup group, String name) { + Histogram histogram = newDropwizardHistogram(); + return histogram != null ? 
group.histogram(name, histogram) : null; + } + + private static Histogram newDropwizardHistogram() { + if (DROPWIZARD == null) { + return null; + } + + Object reservoir = DROPWIZARD.reservoirCtor.newInstance(HISTOGRAM_RESERVOIR_SIZE); + Object codahaleHistogram = DROPWIZARD.histogramCtor.newInstance(reservoir); + return DROPWIZARD.wrapperCtor.newInstance(codahaleHistogram); + } + + private static DropwizardCtors loadDropwizardCtors() { + try { + Class reservoirInterface = + DynClasses.builder().impl("com.codahale.metrics.Reservoir").buildChecked(); + Class codahaleHistogramClass = + DynClasses.builder().impl("com.codahale.metrics.Histogram").buildChecked(); + return new DropwizardCtors( + DynConstructors.builder() + .impl("com.codahale.metrics.SlidingWindowReservoir", int.class) + .buildChecked(), + DynConstructors.builder() + .impl("com.codahale.metrics.Histogram", reservoirInterface) + .buildChecked(), + DynConstructors.builder(Histogram.class) + .impl( + "org.apache.flink.dropwizard.metrics.DropwizardHistogramWrapper", + codahaleHistogramClass) + .buildChecked()); + } catch (ClassNotFoundException | NoSuchMethodException e) { + LOG.warn( + "Cannot load Dropwizard metrics; is org.apache.flink:flink-metrics-dropwizard on the classpath?", + e); + return null; + } + } + + private record DropwizardCtors( + DynConstructors.Ctor reservoirCtor, + DynConstructors.Ctor histogramCtor, + DynConstructors.Ctor wrapperCtor) {} } diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java index 4b5c9bef41e1..ad430cbf13f8 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java @@ -44,6 +44,7 @@ import org.apache.flink.streaming.api.datastream.DataStream; import 
org.apache.flink.streaming.api.datastream.DataStreamSink; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.runtime.operators.sink.SinkWriterOperatorFactory; import org.apache.flink.table.data.RowData; import org.apache.flink.util.OutputTag; import org.apache.iceberg.Table; @@ -79,13 +80,17 @@ public class DynamicIcebergSink private final Configuration flinkConfig; private final int cacheMaximumSize; + // Set by the builder before sinkTo() — forward writer results to union into pre-commit topology + private final transient DataStream> forwardWriteResults; + DynamicIcebergSink( CatalogLoader catalogLoader, Map snapshotProperties, String uidPrefix, Map writeProperties, Configuration flinkConfig, - int cacheMaximumSize) { + int cacheMaximumSize, + DataStream> forwardWriteResults) { this.catalogLoader = catalogLoader; this.snapshotProperties = snapshotProperties; this.uidPrefix = uidPrefix; @@ -96,6 +101,7 @@ public class DynamicIcebergSink // This is used to separate files generated by different sinks writing the same table. // Also used to generate the aggregator operator name this.sinkId = UUID.randomUUID().toString(); + this.forwardWriteResults = forwardWriteResults; } @Override @@ -144,7 +150,11 @@ public DataStream> addPreCommitTopology( TypeInformation> typeInformation = CommittableMessageTypeInfo.of(this::getCommittableSerializer); - return writeResults + // Union forward writer results with the shuffle writer results + DataStream> allResults = + writeResults.union(forwardWriteResults); + + return allResults .keyBy( committable -> { if (committable instanceof CommittableSummary) { @@ -167,6 +177,55 @@ public SimpleVersionedSerializer getWriteResultSerializer() return new DynamicWriteResultSerializer(); } + /** + * A lightweight Sink used with {@link SinkWriterOperatorFactory} for the forward write path. 
+ * Implements {@link SupportsCommitter} so that {@code SinkWriterOperator} emits committables + * downstream. The committer is never called — committing is handled by the main sink. + */ + @VisibleForTesting + static class ForwardWriterSink + implements Sink, SupportsCommitter { + + private final CatalogLoader catalogLoader; + private final Map writeProperties; + private final Configuration flinkConfig; + private final int cacheMaximumSize; + + ForwardWriterSink( + CatalogLoader catalogLoader, + Map writeProperties, + Configuration flinkConfig, + int cacheMaximumSize) { + this.catalogLoader = catalogLoader; + this.writeProperties = writeProperties; + this.flinkConfig = flinkConfig; + this.cacheMaximumSize = cacheMaximumSize; + } + + @Override + public SinkWriter createWriter(WriterInitContext context) { + return new DynamicWriter( + catalogLoader.loadCatalog(), + writeProperties, + flinkConfig, + cacheMaximumSize, + new DynamicWriterMetrics(context.metricGroup()), + context.getTaskInfo().getIndexOfThisSubtask(), + context.getTaskInfo().getAttemptNumber()); + } + + @Override + public Committer createCommitter(CommitterInitContext context) { + throw new UnsupportedOperationException( + "WriterSink is used only for writing; committing is handled by the main sink"); + } + + @Override + public SimpleVersionedSerializer getCommittableSerializer() { + return new DynamicWriteResultSerializer(); + } + } + public static class Builder { private DataStream input; private DynamicRecordGenerator generator; @@ -176,12 +235,6 @@ public static class Builder { private final Map snapshotSummary = Maps.newHashMap(); private ReadableConfig readableConfig = new Configuration(); private TableCreator tableCreator = TableCreator.DEFAULT; - private boolean immediateUpdate = false; - private boolean dropUnusedColumns = false; - private int cacheMaximumSize = 100; - private long cacheRefreshMs = 1_000; - private int inputSchemasPerTableCacheMaximumSize = 10; - private boolean caseSensitive = 
true; Builder() {} @@ -302,7 +355,9 @@ public Builder toBranch(String branch) { } public Builder immediateTableUpdate(boolean newImmediateUpdate) { - this.immediateUpdate = newImmediateUpdate; + writeOptions.put( + FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.key(), + Boolean.toString(newImmediateUpdate)); return this; } @@ -318,19 +373,21 @@ public Builder immediateTableUpdate(boolean newImmediateUpdate) { * will never return data of the old column. */ public Builder dropUnusedColumns(boolean newDropUnusedColumns) { - this.dropUnusedColumns = newDropUnusedColumns; + writeOptions.put( + FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key(), + Boolean.toString(newDropUnusedColumns)); return this; } /** Maximum size of the caches used in Dynamic Sink for table data and serializers. */ public Builder cacheMaxSize(int maxSize) { - this.cacheMaximumSize = maxSize; + writeOptions.put(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.key(), Integer.toString(maxSize)); return this; } /** Maximum interval for cache items renewals. */ public Builder cacheRefreshMs(long refreshMs) { - this.cacheRefreshMs = refreshMs; + writeOptions.put(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.key(), Long.toString(refreshMs)); return this; } @@ -340,7 +397,9 @@ public Builder cacheRefreshMs(long refreshMs) { * comparison results. */ public Builder inputSchemasPerTableCacheMaxSize(int inputSchemasPerTableCacheMaxSize) { - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaxSize; + writeOptions.put( + FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.key(), + Integer.toString(inputSchemasPerTableCacheMaxSize)); return this; } @@ -349,7 +408,8 @@ public Builder inputSchemasPerTableCacheMaxSize(int inputSchemasPerTableCache * field names case-sensitive. 
*/ public Builder caseSensitive(boolean newCaseSensitive) { - this.caseSensitive = newCaseSensitive; + writeOptions.put( + FlinkDynamicSinkOptions.CASE_SENSITIVE.key(), Boolean.toString(newCaseSensitive)); return this; } @@ -357,89 +417,134 @@ private String operatorName(String suffix) { return uidPrefix != null ? uidPrefix + "-" + suffix : suffix; } - private DynamicIcebergSink build() { + private DynamicIcebergSink build( + SingleOutputStreamOperator converted, + DynamicRecordInternalType sideOutputType) { Preconditions.checkArgument( generator != null, "Please use withGenerator() to convert the input DataStream."); Preconditions.checkNotNull(catalogLoader, "Catalog loader shouldn't be null"); - uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); + Configuration flinkConfig = fromReadableConfig(); + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeOptions, flinkConfig); - Configuration flinkConfig = - readableConfig instanceof Configuration - ? (Configuration) readableConfig - : Configuration.fromMap(readableConfig.toMap()); + // Forward writer: chained with generator via forward edge, no data shuffle + ForwardWriterSink forwardWriterSink = + new ForwardWriterSink( + catalogLoader, writeOptions, flinkConfig, flinkDynamicSinkConf.cacheMaxSize()); + TypeInformation> writeResultTypeInfo = + CommittableMessageTypeInfo.of(DynamicWriteResultSerializer::new); - return instantiateSink(writeOptions, flinkConfig); + DataStream> forwardWriteResults = + converted + .getSideOutput( + new OutputTag<>(DynamicRecordProcessor.DYNAMIC_FORWARD_STREAM, sideOutputType)) + .transform( + operatorName("Forward-Writer"), + writeResultTypeInfo, + new SinkWriterOperatorFactory<>(forwardWriterSink)) + .setParallelism(converted.getParallelism()) + .uid(prefixIfNotNull(uidPrefix, "-forward-writer")); + + // Inject forward write results into sink — they'll be unioned in addPreCommitTopology + return instantiateSink(writeOptions, flinkConfig, forwardWriteResults); 
} @VisibleForTesting DynamicIcebergSink instantiateSink( - Map writeProperties, Configuration flinkWriteConf) { + Map writeProperties, + Configuration flinkWriteConf, + DataStream> forwardWriteResults) { + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeProperties, flinkWriteConf); return new DynamicIcebergSink( catalogLoader, snapshotSummary, uidPrefix, writeProperties, flinkWriteConf, - cacheMaximumSize); + flinkDynamicSinkConf.cacheMaxSize(), + forwardWriteResults); } /** * Append the iceberg sink operators to write records to iceberg table. * + *

    The topology splits records by distribution mode: + * + *

      + *
    • Forward records ({@code null} distributionMode) go through a forward edge to a chained + * writer, avoiding any data shuffle. + *
    • Shuffle records (non-null distributionMode) go through the standard Sink2 pipeline with + * hash/round-robin distribution. + *
    + * + * Both writers feed into a single shared pre-commit aggregator and committer, ensuring atomic + * commits across both paths. + * * @return {@link DataStreamSink} for sink. */ public DataStreamSink append() { + uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); + + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeOptions, readableConfig); + Configuration flinkConfig = fromReadableConfig(); + DynamicRecordInternalType type = - new DynamicRecordInternalType(catalogLoader, false, cacheMaximumSize); - DynamicIcebergSink sink = build(); + new DynamicRecordInternalType(catalogLoader, false, flinkDynamicSinkConf.cacheMaxSize()); + DynamicRecordInternalType sideOutputType = + new DynamicRecordInternalType(catalogLoader, true, flinkDynamicSinkConf.cacheMaxSize()); + SingleOutputStreamOperator converted = input .process( new DynamicRecordProcessor<>( generator, catalogLoader, - immediateUpdate, - cacheMaximumSize, - cacheRefreshMs, - inputSchemasPerTableCacheMaximumSize, tableCreator, - caseSensitive, - dropUnusedColumns)) + flinkDynamicSinkConf, + writeOptions, + flinkConfig)) + .setParallelism(input.getParallelism()) .uid(prefixIfNotNull(uidPrefix, "-generator")) .name(operatorName("generator")) .returns(type); - DataStreamSink rowDataDataStreamSink = + DynamicIcebergSink sink = build(converted, sideOutputType); + + // Shuffle path: table update side output + main output → sinkTo() + DataStream shuffleInput = converted .getSideOutput( new OutputTag<>( - DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, - new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize))) + DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, sideOutputType)) .keyBy((KeySelector) DynamicRecordInternal::tableName) .map( - new DynamicTableUpdateOperator( - catalogLoader, - cacheMaximumSize, - cacheRefreshMs, - inputSchemasPerTableCacheMaximumSize, - tableCreator, - caseSensitive, - dropUnusedColumns)) + new DynamicTableUpdateOperator(catalogLoader, 
tableCreator, flinkDynamicSinkConf)) .uid(prefixIfNotNull(uidPrefix, "-updater")) .name(operatorName("Updater")) .returns(type) - .union(converted) - .sinkTo(sink) + .union(converted); + + DataStreamSink result = + shuffleInput + .sinkTo(sink) // Forward write results are implicitly injected here .uid(prefixIfNotNull(uidPrefix, "-sink")); FlinkWriteConf flinkWriteConf = new FlinkWriteConf(writeOptions, readableConfig); if (flinkWriteConf.writeParallelism() != null) { - rowDataDataStreamSink.setParallelism(flinkWriteConf.writeParallelism()); + result.setParallelism(flinkWriteConf.writeParallelism()); } - return rowDataDataStreamSink; + return result; + } + + private Configuration fromReadableConfig() { + return readableConfig instanceof Configuration + ? (Configuration) readableConfig + : Configuration.fromMap(readableConfig.toMap()); } } diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java index 9f445766083e..6507a575c2af 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java @@ -20,6 +20,7 @@ import java.util.Set; import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; import org.apache.flink.table.data.RowData; import org.apache.iceberg.DistributionMode; import org.apache.iceberg.PartitionSpec; @@ -34,20 +35,43 @@ public class DynamicRecord { private Schema schema; private RowData rowData; private PartitionSpec partitionSpec; - private DistributionMode distributionMode; + @Nullable private DistributionMode distributionMode; private int writeParallelism; private boolean upsertMode; @Nullable private Set equalityFields; + @Internal + DynamicRecord() {} + + /** + * Constructs a new DynamicRecord with forward (no shuffle) writes. 
+ * + * @param tableIdentifier The target table identifier. + * @param branch The target table branch. + * @param schema The target table schema. + * @param rowData The data matching the provided schema. + * @param partitionSpec The target table {@link PartitionSpec}. + */ + public DynamicRecord( + TableIdentifier tableIdentifier, + String branch, + Schema schema, + RowData rowData, + PartitionSpec partitionSpec) { + this(tableIdentifier, branch, schema, rowData, partitionSpec, null, -1); + } + /** - * Constructs a new DynamicRecord. + * Constructs a new DynamicRecord. This record will be shuffled as specified by {@code + * distributionMode}. * * @param tableIdentifier The target table identifier. * @param branch The target table branch. * @param schema The target table schema. * @param rowData The data matching the provided schema. * @param partitionSpec The target table {@link PartitionSpec}. - * @param distributionMode The {@link DistributionMode}. + * @param distributionMode The {@link DistributionMode}. {@code null} indicates forward (no + * shuffle) writes. * @param writeParallelism The number of parallel writers. Can be set to any value {@literal > 0}, * but will always be automatically capped by the maximum write parallelism, which is the * parallelism of the sink. 
Set to Integer.MAX_VALUE for always using the maximum available diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java index 07dfad2780f7..c752b8e9b8d9 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java @@ -18,10 +18,12 @@ */ package org.apache.iceberg.flink.sink.dynamic; +import java.util.Map; import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.functions.OpenContext; import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.configuration.Configuration; import org.apache.flink.streaming.api.functions.ProcessFunction; import org.apache.flink.table.data.RowData; import org.apache.flink.util.Collector; @@ -30,6 +32,7 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.FlinkWriteConf; @Internal class DynamicRecordProcessor extends ProcessFunction @@ -37,8 +40,12 @@ class DynamicRecordProcessor extends ProcessFunction generator; private final CatalogLoader catalogLoader; + private final Map writeProperties; + private final Configuration flinkConfig; private final boolean immediateUpdate; private final boolean dropUnusedColumns; private final int cacheMaximumSize; @@ -51,28 +58,29 @@ class DynamicRecordProcessor extends ProcessFunction updateStream; + private transient OutputTag forwardStream; private transient Collector collector; + private transient DynamicRecordWithConfig dynamicRecordWithConfig; private transient Context context; DynamicRecordProcessor( DynamicRecordGenerator generator, CatalogLoader catalogLoader, - boolean immediateUpdate, - int 
cacheMaximumSize, - long cacheRefreshMs, - int inputSchemasPerTableCacheMaximumSize, TableCreator tableCreator, - boolean caseSensitive, - boolean dropUnusedColumns) { + FlinkDynamicSinkConf sinkConfig, + Map writeProperties, + Configuration flinkConfig) { this.generator = generator; this.catalogLoader = catalogLoader; - this.immediateUpdate = immediateUpdate; - this.cacheMaximumSize = cacheMaximumSize; - this.cacheRefreshMs = cacheRefreshMs; - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; + this.flinkConfig = flinkConfig; + this.writeProperties = writeProperties; + this.immediateUpdate = sinkConfig.immediateTableUpdate(); + this.cacheMaximumSize = sinkConfig.cacheMaxSize(); + this.cacheRefreshMs = sinkConfig.cacheRefreshMs(); + this.inputSchemasPerTableCacheMaximumSize = sinkConfig.inputSchemasPerTableCacheMaxSize(); this.tableCreator = tableCreator; - this.caseSensitive = caseSensitive; - this.dropUnusedColumns = dropUnusedColumns; + this.caseSensitive = sinkConfig.caseSensitive(); + this.dropUnusedColumns = sinkConfig.dropUnusedColumns(); } @Override @@ -90,15 +98,22 @@ public void open(OpenContext openContext) throws Exception { this.hashKeyGenerator = new HashKeyGenerator( cacheMaximumSize, getRuntimeContext().getTaskInfo().getMaxNumberOfParallelSubtasks()); - if (immediateUpdate) { - updater = new TableUpdater(tableCache, catalog, caseSensitive, dropUnusedColumns); - } else { + // Always create updater — needed for forced immediate updates on forward records + this.updater = new TableUpdater(tableCache, catalog, caseSensitive, dropUnusedColumns); + // Always create forward stream tag for forward (distributionMode == null) records + this.forwardStream = + new OutputTag<>( + DYNAMIC_FORWARD_STREAM, + new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize)) {}; + if (!immediateUpdate) { updateStream = new OutputTag<>( DYNAMIC_TABLE_UPDATE_STREAM, new DynamicRecordInternalType(catalogLoader, true, 
cacheMaximumSize)) {}; } + this.dynamicRecordWithConfig = + new DynamicRecordWithConfig(new FlinkWriteConf(writeProperties, flinkConfig)); generator.open(openContext); } @@ -111,7 +126,10 @@ public void processElement(T element, Context ctx, Collector newData = updater.update( data.tableIdentifier(), data.branch(), data.schema(), data.spec(), tableCreator); emit( - collector, data, newData.f0.resolvedTableSchema(), newData.f0.recordConverter(), - newData.f1); + newData.f1, + isForward); } else { + // Shuffled records with immediateUpdate=false go to the update side output int writerKey = hashKeyGenerator.generateKey( data, @@ -159,33 +182,38 @@ public void collect(DynamicRecord data) { } } else { emit( - collector, data, foundSchema.resolvedTableSchema(), foundSchema.recordConverter(), - foundSpec); + foundSpec, + isForward); } } private void emit( - Collector out, DynamicRecord data, Schema schema, DataConverter recordConverter, - PartitionSpec spec) { + PartitionSpec spec, + boolean forward) { RowData rowData = (RowData) recordConverter.convert(data.rowData()); - int writerKey = hashKeyGenerator.generateKey(data, schema, spec, rowData); - String tableName = data.tableIdentifier().toString(); - out.collect( + // writerKey is unused in the forward path. + int writerKey = forward ? 
-1 : hashKeyGenerator.generateKey(data, schema, spec, rowData); + DynamicRecordInternal record = new DynamicRecordInternal( - tableName, + data.tableIdentifier().toString(), data.branch(), schema, rowData, spec, writerKey, data.upsertMode(), - DynamicSinkUtil.getEqualityFieldIds(data.equalityFields(), schema))); + DynamicSinkUtil.getEqualityFieldIds(data.equalityFields(), schema)); + if (forward) { + context.output(forwardStream, record); + } else { + collector.collect(record); + } } @Override diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordWithConfig.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordWithConfig.java new file mode 100644 index 000000000000..32716c3e4ac7 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordWithConfig.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.Set; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteConf; + +class DynamicRecordWithConfig extends DynamicRecord { + private final String defaultBranch; + private final Integer defaultWriteParallelism; + + private DynamicRecord wrapped; + + DynamicRecordWithConfig(FlinkWriteConf flinkWriteConf) { + this.defaultBranch = flinkWriteConf.branch(); + this.defaultWriteParallelism = flinkWriteConf.writeParallelism(); + } + + DynamicRecordWithConfig wrap(DynamicRecord newWrapped) { + this.wrapped = newWrapped; + return this; + } + + @Override + public String branch() { + return wrapped.branch() != null ? wrapped.branch() : defaultBranch; + } + + @Override + public DistributionMode distributionMode() { + return wrapped.distributionMode(); + } + + @Override + public int writeParallelism() { + int originalParallelism = wrapped.writeParallelism(); + if (originalParallelism > 0 || defaultWriteParallelism == null) { + return originalParallelism; + } + + return defaultWriteParallelism; + } + + @Override + public TableIdentifier tableIdentifier() { + return wrapped.tableIdentifier(); + } + + @Override + public Schema schema() { + return wrapped.schema(); + } + + @Override + public PartitionSpec spec() { + return wrapped.spec(); + } + + @Override + public RowData rowData() { + return wrapped.rowData(); + } + + @Override + public boolean upsertMode() { + return wrapped.upsertMode(); + } + + @Override + public Set equalityFields() { + return wrapped.equalityFields(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java index 
456f20adf59f..93c268ff86ad 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java @@ -48,20 +48,15 @@ class DynamicTableUpdateOperator private transient TableUpdater updater; DynamicTableUpdateOperator( - CatalogLoader catalogLoader, - int cacheMaximumSize, - long cacheRefreshMs, - int inputSchemasPerTableCacheMaximumSize, - TableCreator tableCreator, - boolean caseSensitive, - boolean dropUnusedColumns) { + CatalogLoader catalogLoader, TableCreator tableCreator, FlinkDynamicSinkConf configuration) { this.catalogLoader = catalogLoader; - this.cacheMaximumSize = cacheMaximumSize; - this.cacheRefreshMs = cacheRefreshMs; - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; this.tableCreator = tableCreator; - this.caseSensitive = caseSensitive; - this.dropUnusedColumns = dropUnusedColumns; + + this.cacheMaximumSize = configuration.cacheMaxSize(); + this.cacheRefreshMs = configuration.cacheRefreshMs(); + this.inputSchemasPerTableCacheMaximumSize = configuration.inputSchemasPerTableCacheMaxSize(); + this.caseSensitive = configuration.caseSensitive(); + this.dropUnusedColumns = configuration.dropUnusedColumns(); } @Override diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java new file mode 100644 index 000000000000..75b169c4b533 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.Map; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.iceberg.flink.FlinkConfParser; + +/** + * A class for common Dynamic Iceberg sink configs for Flink writes. + * + *

    If a config is set at multiple levels, the following order of precedence is used (top to + * bottom): + * + *

      + *
    1. Write options + *
    2. Flink ReadableConfig + *
    3. Default values + *
    + * + * The most specific value is set in write options and takes precedence over all other configs. If + * no write option is provided, this class checks the flink configuration for any overrides. If no + * applicable value is found in the write options, this class uses the default values. + */ +class FlinkDynamicSinkConf { + + private final FlinkConfParser confParser; + + FlinkDynamicSinkConf(Map writeOptions, ReadableConfig readableConfig) { + this.confParser = new FlinkConfParser(writeOptions, readableConfig); + } + + int cacheMaxSize() { + return confParser + .intConf() + .option(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.key()) + .flinkConfig(FlinkDynamicSinkOptions.CACHE_MAX_SIZE) + .defaultValue(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.defaultValue()) + .parse(); + } + + boolean immediateTableUpdate() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.key()) + .flinkConfig(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE) + .defaultValue(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.defaultValue()) + .parse(); + } + + boolean dropUnusedColumns() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key()) + .flinkConfig(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS) + .defaultValue(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.defaultValue()) + .parse(); + } + + long cacheRefreshMs() { + return confParser + .longConf() + .option(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.key()) + .flinkConfig(FlinkDynamicSinkOptions.CACHE_REFRESH_MS) + .defaultValue(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.defaultValue()) + .parse(); + } + + int inputSchemasPerTableCacheMaxSize() { + return confParser + .intConf() + .option(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.key()) + .flinkConfig(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE) + .defaultValue(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.defaultValue()) + .parse(); + } + + boolean 
caseSensitive() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.CASE_SENSITIVE.key()) + .flinkConfig(FlinkDynamicSinkOptions.CASE_SENSITIVE) + .defaultValue(FlinkDynamicSinkOptions.CASE_SENSITIVE.defaultValue()) + .parse(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java new file mode 100644 index 000000000000..7a4f038219d9 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import org.apache.flink.annotation.Experimental; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; + +@Experimental +public class FlinkDynamicSinkOptions { + + private FlinkDynamicSinkOptions() {} + + public static final ConfigOption CACHE_MAX_SIZE = + ConfigOptions.key("dynamic-sink.cache-max-size") + .intType() + .defaultValue(100) + .withDescription( + "Maximum size of the caches used in Dynamic Sink for table data and serializers."); + + public static final ConfigOption IMMEDIATE_TABLE_UPDATE = + ConfigOptions.key("dynamic-sink.immediate-table-update") + .booleanType() + .defaultValue(false) + .withDescription( + "Controls whether table schema and partition updates should be applied immediately in Dynamic Sink."); + + public static final ConfigOption DROP_UNUSED_COLUMNS = + ConfigOptions.key("dynamic-sink.drop-unused-columns") + .booleanType() + .defaultValue(false) + .withDescription( + "Allows dropping unused columns during schema evolution in Dynamic Sink."); + + public static final ConfigOption CACHE_REFRESH_MS = + ConfigOptions.key("dynamic-sink.cache-refresh-ms") + .longType() + .defaultValue(1_000L) + .withDescription( + "Cache refresh interval for dynamic table metadata in Dynamic Sink in milliseconds."); + + public static final ConfigOption INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE = + ConfigOptions.key("dynamic-sink.input-schemas-per-table-cache-max-size") + .intType() + .defaultValue(10) + .withDescription( + "Maximum input schema objects to cache per each table in Dynamic Sink for performance."); + + public static final ConfigOption CASE_SENSITIVE = + ConfigOptions.key("dynamic-sink.case-sensitive") + .booleanType() + .defaultValue(true) + .withDescription( + "Controls whether schema field name matching should be case-sensitive in Dynamic Sink."); +} diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java index fca45bf882e0..61a850212bf4 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java @@ -88,7 +88,7 @@ int generateKey( dynamicRecord.schema(), dynamicRecord.spec(), dynamicRecord.equalityFields(), - MoreObjects.firstNonNull(dynamicRecord.distributionMode(), DistributionMode.NONE), + dynamicRecord.distributionMode(), Math.min(dynamicRecord.writeParallelism(), maxWriteParallelism)); KeySelector keySelector = keySelectorCache.computeIfAbsent( @@ -98,8 +98,7 @@ int generateKey( tableIdent, MoreObjects.firstNonNull(tableSchema, dynamicRecord.schema()), MoreObjects.firstNonNull(tableSpec, dynamicRecord.spec()), - MoreObjects.firstNonNull( - dynamicRecord.distributionMode(), DistributionMode.NONE), + dynamicRecord.distributionMode(), MoreObjects.firstNonNull( dynamicRecord.equalityFields(), Collections.emptySet()), Math.min(dynamicRecord.writeParallelism(), maxWriteParallelism))); diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java index 8ef1f1fbb833..d74b8b9d620f 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java @@ -23,8 +23,6 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.annotation.Internal; -import org.apache.flink.formats.avro.RowDataToAvroConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import 
org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; @@ -32,6 +30,8 @@ import org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.formats.avro.RowDataToAvroConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; /** * This is not serializable because Avro {@link Schema} is not actually serializable, even though it diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java index bac7c05bdfef..1c6644238c3d 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java @@ -136,23 +136,26 @@ void validate() { if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) { Preconditions.checkArgument( startSnapshotId != null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + "Invalid starting snapshot id for %s strategy: null", + startingStrategy); Preconditions.checkArgument( startSnapshotTimestamp == null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + "Invalid starting snapshot timestamp for %s strategy: not null", + startingStrategy); } if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) { Preconditions.checkArgument( startSnapshotTimestamp != null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + "Invalid starting snapshot timestamp for %s strategy: null", + startingStrategy); Preconditions.checkArgument( startSnapshotId == null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + 
"Invalid starting snapshot id for %s strategy: not null", + startingStrategy); } Preconditions.checkArgument( - tag == null, - String.format("Cannot scan table using ref %s configured for streaming reader", tag)); + tag == null, "Cannot scan table using ref %s configured for streaming reader", tag); Preconditions.checkArgument( snapshotId == null, "Cannot set snapshot-id option for streaming reader"); Preconditions.checkArgument( diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java index b158b0871a53..cfef780a4daa 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java @@ -21,8 +21,6 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.formats.avro.RowDataToAvroConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; @@ -31,6 +29,8 @@ import org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.formats.avro.RowDataToAvroConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; public class AvroGenericRecordConverter implements RowDataConverter { private final Schema avroSchema; diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java 
b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java index 02ef57d344b1..3af9957875e8 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java @@ -34,7 +34,7 @@ class WatermarkExtractorRecordEmitter implements SerializableRecordEmitter private static final Logger LOG = LoggerFactory.getLogger(WatermarkExtractorRecordEmitter.class); private final SplitWatermarkExtractor timeExtractor; private String lastSplitId = null; - private long watermark; + private long watermark = Long.MIN_VALUE; WatermarkExtractorRecordEmitter(SplitWatermarkExtractor timeExtractor) { this.timeExtractor = timeExtractor; @@ -44,7 +44,10 @@ class WatermarkExtractorRecordEmitter implements SerializableRecordEmitter public void emitRecord( RecordAndPosition element, SourceOutput output, IcebergSourceSplit split) { if (!split.splitId().equals(lastSplitId)) { - long newWatermark = timeExtractor.extractWatermark(split); + long extracted = timeExtractor.extractWatermark(split); + // Subtract 1 because watermark W means all records with eventTime <= W have arrived; + // records in this split have eventTime == extracted, so watermark must be extracted - 1. + long newWatermark = extracted > Long.MIN_VALUE ? extracted - 1 : Long.MIN_VALUE; if (newWatermark < watermark) { LOG.info( "Received a new split with lower watermark. 
Previous watermark = {}, current watermark = {}, previous split = {}, current split = {}", diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java index e2cd411d7069..795c4fa5a766 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java @@ -75,6 +75,11 @@ public static class Primitives implements DataGenerator { OffsetDateTime.of(2022, 1, 10, 0, 0, 0, 0, ZoneOffset.UTC); private static final LocalDateTime JAVA_LOCAL_DATE_TIME_20220110 = LocalDateTime.of(2022, 1, 10, 0, 0, 0); + private static final OffsetDateTime JAVA_OFFSET_DATE_TIME_MAX_NANO = + OffsetDateTime.of(2262, 4, 11, 23, 47, 16, 854_775_807, ZoneOffset.UTC); + private static final LocalDateTime JAVA_LOCAL_DATE_TIME_MAX_NANO = + LocalDateTime.of(2262, 4, 11, 23, 47, 16, 854_775_807); + private static final long ICEBERG_MAX_NANOS_EPOCH = 9223372036854775807L; private static final BigDecimal BIG_DECIMAL_NEGATIVE = new BigDecimal("-1.50"); private static final byte[] FIXED_BYTES = "012345689012345".getBytes(StandardCharsets.UTF_8); @@ -96,7 +101,11 @@ public static class Primitives implements DataGenerator { Types.NestedField.required(12, "uuid_field", Types.UUIDType.get()), Types.NestedField.required(13, "binary_field", Types.BinaryType.get()), Types.NestedField.required(14, "decimal_field", Types.DecimalType.of(9, 2)), - Types.NestedField.required(15, "fixed_field", Types.FixedType.ofLength(16))); + Types.NestedField.required(15, "fixed_field", Types.FixedType.ofLength(16)), + Types.NestedField.required( + 16, "ts_ns_with_zone_field", Types.TimestampNanoType.withZone()), + Types.NestedField.required( + 17, "ts_ns_without_zone_field", Types.TimestampNanoType.withoutZone())); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -171,6 +180,8 @@ public 
GenericRecord generateIcebergGenericRecord() { genericRecord.setField("time_field", JAVA_LOCAL_TIME_HOUR8); genericRecord.setField("ts_with_zone_field", JAVA_OFFSET_DATE_TIME_20220110); genericRecord.setField("ts_without_zone_field", JAVA_LOCAL_DATE_TIME_20220110); + genericRecord.setField("ts_ns_with_zone_field", JAVA_OFFSET_DATE_TIME_MAX_NANO); + genericRecord.setField("ts_ns_without_zone_field", JAVA_LOCAL_DATE_TIME_MAX_NANO); byte[] uuidBytes = new byte[16]; for (int i = 0; i < 16; ++i) { @@ -220,7 +231,11 @@ public GenericRowData generateFlinkRowData() { uuidBytes, binaryBytes, DecimalData.fromBigDecimal(BIG_DECIMAL_NEGATIVE, 9, 2), - FIXED_BYTES); + FIXED_BYTES, + TimestampData.fromEpochMillis( + ICEBERG_MAX_NANOS_EPOCH / 1_000_000, (int) (ICEBERG_MAX_NANOS_EPOCH % 1_000_000)), + TimestampData.fromEpochMillis( + ICEBERG_MAX_NANOS_EPOCH / 1_000_000, (int) (ICEBERG_MAX_NANOS_EPOCH % 1_000_000))); } @Override @@ -236,10 +251,12 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { genericRecord.put("date_field", DAYS_BTW_EPOC_AND_20220110); genericRecord.put("time_field", HOUR_8_IN_MILLI); - // Although Avro logical type for timestamp fields are in micro seconds, - // AvroToRowDataConverters only looks for long value in milliseconds. - genericRecord.put("ts_with_zone_field", JODA_DATETIME_20220110.getMillis()); - genericRecord.put("ts_without_zone_field", JODA_DATETIME_20220110.getMillis()); + // Now that AvroToRowDataConverters correctly supports microseconds, + // we must inject correct microsecond scale values into the Avro data. 
+ genericRecord.put("ts_with_zone_field", JODA_DATETIME_20220110.getMillis() * 1000L); + genericRecord.put("ts_without_zone_field", JODA_DATETIME_20220110.getMillis() * 1000L); + genericRecord.put("ts_ns_with_zone_field", ICEBERG_MAX_NANOS_EPOCH); + genericRecord.put("ts_ns_without_zone_field", ICEBERG_MAX_NANOS_EPOCH); byte[] uuidBytes = new byte[16]; for (int i = 0; i < 16; ++i) { @@ -554,7 +571,11 @@ public static class ArrayOfPrimitive implements DataGenerator { new Schema( Types.NestedField.required(1, "row_id", Types.StringType.get()), Types.NestedField.required( - 2, "array_of_int", Types.ListType.ofOptional(101, Types.IntegerType.get()))); + 2, "array_of_int", Types.ListType.ofOptional(101, Types.IntegerType.get())), + Types.NestedField.optional( + 3, + "array_of_ts_ns", + Types.ListType.ofRequired(102, Types.TimestampNanoType.withoutZone()))); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -581,13 +602,33 @@ public GenericRecord generateIcebergGenericRecord() { GenericRecord genericRecord = GenericRecord.create(icebergSchema); genericRecord.setField("row_id", "row_id_value"); genericRecord.setField("array_of_int", Arrays.asList(1, 2, 3)); + + LocalDateTime posNanos = LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789); + LocalDateTime negNanos = LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321); + genericRecord.setField("array_of_ts_ns", Arrays.asList(posNanos, negNanos)); return genericRecord; } @Override public GenericRowData generateFlinkRowData() { Integer[] arr = {1, 2, 3}; - return GenericRowData.of(StringData.fromString("row_id_value"), new GenericArrayData(arr)); + + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + TimestampData[] tsArr = { + TimestampData.fromEpochMillis( + Math.floorDiv(posNanos, 
1_000_000L), (int) Math.floorMod(posNanos, 1_000_000L)), + TimestampData.fromEpochMillis( + Math.floorDiv(negNanos, 1_000_000L), (int) Math.floorMod(negNanos, 1_000_000L)) + }; + return GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericArrayData(arr), + new GenericArrayData(tsArr)); } @Override @@ -595,6 +636,14 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); genericRecord.put("row_id", "row_id_value"); genericRecord.put("array_of_int", Arrays.asList(1, 2, 3)); + + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + genericRecord.put("array_of_ts_ns", Arrays.asList(posNanos, negNanos)); return genericRecord; } } @@ -808,7 +857,12 @@ public static class MapOfPrimitives implements DataGenerator { 2, "map_of_primitives", Types.MapType.ofRequired( - 101, 102, Types.StringType.get(), Types.IntegerType.get()))); + 101, 102, Types.StringType.get(), Types.IntegerType.get())), + Types.NestedField.optional( + 3, + "map_of_ts_ns", + Types.MapType.ofRequired( + 103, 104, Types.StringType.get(), Types.TimestampNanoType.withoutZone()))); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -835,15 +889,37 @@ public GenericRecord generateIcebergGenericRecord() { GenericRecord genericRecord = GenericRecord.create(icebergSchema); genericRecord.setField("row_id", "row_id_value"); genericRecord.setField("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); + + LocalDateTime posNanos = LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789); + LocalDateTime negNanos = LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321); + genericRecord.setField( + "map_of_ts_ns", ImmutableMap.of("positive", posNanos, 
"negative", negNanos)); return genericRecord; } @Override public GenericRowData generateFlinkRowData() { + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + return GenericRowData.of( StringData.fromString("row_id_value"), new GenericMapData( - ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2))); + ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2)), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("positive"), + TimestampData.fromEpochMillis( + Math.floorDiv(posNanos, 1_000_000L), + (int) Math.floorMod(posNanos, 1_000_000L)), + StringData.fromString("negative"), + TimestampData.fromEpochMillis( + Math.floorDiv(negNanos, 1_000_000L), + (int) Math.floorMod(negNanos, 1_000_000L))))); } @Override @@ -851,6 +927,15 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); genericRecord.put("row_id", "row_id_value"); genericRecord.put("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); + + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + genericRecord.put( + "map_of_ts_ns", ImmutableMap.of("positive", posNanos, "negative", negNanos)); return genericRecord; } } diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java index cd6964b5ed0f..0e7635a33e87 100644 --- 
a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java @@ -30,7 +30,6 @@ import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.data.RandomRowData; import org.apache.iceberg.util.StructLikeWrapper; -import org.junit.jupiter.api.Disabled; public class TestRowDataWrapper extends RecordWrapperTestBase { @@ -60,18 +59,6 @@ public void testTime() { }); } - @Disabled - @Override - public void testTimestampNanoWithoutZone() { - // Flink does not support nanosecond timestamp without zone. - } - - @Disabled - @Override - public void testTimestampNanoWithZone() { - // Flink does not support nanosecond timestamp with zone. - } - @Override protected void generateAndValidate( Schema schema, RecordWrapperTestBase.AssertMethod assertMethod) { diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java index 8c99fdf52110..1f0fe70ac53b 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java @@ -19,13 +19,17 @@ package org.apache.iceberg.flink.data; import java.util.List; +import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; +import org.apache.iceberg.PartitionData; import org.apache.iceberg.Schema; import org.apache.iceberg.data.BaseFormatModelTests; import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.RowDataConverter; import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; public class TestFlinkFormatModel extends BaseFormatModelTests { @@ -48,4 +52,26 @@ protected RowData convertToEngine(Record record, Schema schema) 
{ protected void assertEquals(Schema schema, List expected, List actual) { TestHelpers.assertRows(actual, expected, FlinkSchemaUtil.convert(schema)); } + + @Override + protected Object convertConstantToEngine(Type type, Object value) { + if (value instanceof PartitionData partitionData) { + Types.StructType structType = type.asStructType(); + List fields = structType.fields(); + GenericRowData rowData = new GenericRowData(fields.size()); + int sourceSize = partitionData.size(); + for (int i = 0; i < fields.size(); i++) { + if (i < sourceSize) { + Object fieldValue = partitionData.get(i, Object.class); + rowData.setField(i, convertConstantToEngine(fields.get(i).type(), fieldValue)); + } else { + rowData.setField(i, null); + } + } + + return rowData; + } + + return RowDataUtil.convertConstant(type, value); + } } diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java index 4a70802f2a2e..b7b0a54156cc 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java @@ -49,6 +49,11 @@ protected boolean allowsWritingNullValuesForRequiredFields() { return true; } + @Override + protected boolean supportsTimestampNanos() { + return true; + } + @Override protected void writeAndValidate(Schema schema) throws IOException { List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1990L); diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java index 4e5b38ffb026..a2411da1e344 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java +++ 
b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java @@ -271,18 +271,19 @@ public void testMapOfPrimitivesProjection() { GenericRowData.of( StringData.fromString("row_id_value"), new GenericMapData( - ImmutableMap.of(StringData.fromString("foo"), 1, StringData.fromString("bar"), 2))); + ImmutableMap.of(StringData.fromString("foo"), 1, StringData.fromString("bar"), 2)), + null); testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData, true); testEqualsAndHashCode(schema, mapOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); GenericRowData rowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("row_id_value"), null); + GenericRowData.of(StringData.fromString("row_id_value"), null, null); GenericRowData copyRowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("row_id_value"), null); + GenericRowData.of(StringData.fromString("row_id_value"), null, null); // modify the map field value GenericRowData otherRowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("other_row_id_value"), null); + GenericRowData.of(StringData.fromString("other_row_id_value"), null, null); testEqualsAndHashCode( schema, idOnly, @@ -432,7 +433,8 @@ public void testArrayOfPrimitiveProjection() { GenericRowData otherRowData = GenericRowData.of( StringData.fromString("other_row_id_value"), - new GenericArrayData(new Integer[] {4, 5, 6})); + new GenericArrayData(new Integer[] {4, 5, 6}), + null); testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, arrayOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); @@ -440,16 +442,19 @@ public void testArrayOfPrimitiveProjection() { GenericRowData rowDataNullOptionalFields = GenericRowData.of( StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 
3})); + new GenericArrayData(new Integer[] {1, null, 3}), + null); GenericRowData copyRowDataNullOptionalFields = GenericRowData.of( StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); + new GenericArrayData(new Integer[] {1, null, 3}), + null); // modify the map field value GenericRowData otherRowDataNullOptionalFields = GenericRowData.of( StringData.fromString("other_row_id_value"), - new GenericArrayData(new Integer[] {4, null, 6})); + new GenericArrayData(new Integer[] {4, null, 6}), + null); testEqualsAndHashCode( schema, idOnly, diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java index 3cb18ffbb77e..4d35792e440e 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java @@ -19,11 +19,18 @@ package org.apache.iceberg.flink.maintenance.api; import static org.apache.iceberg.flink.maintenance.api.JdbcLockFactory.INIT_LOCK_TABLES_PROPERTY; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.sql.SQLTransientConnectionException; import java.util.Map; import java.util.UUID; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.iceberg.jdbc.JdbcCatalog; +import org.apache.iceberg.jdbc.JdbcClientPool; +import org.apache.iceberg.jdbc.UncheckedSQLException; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.Test; class TestJdbcLockFactory extends TestLockFactoryBase { @Override @@ -38,4 +45,62 @@ TriggerLockFactory lockFactory(String tableName) { tableName, properties); } + + @Test + void testSQLExceptionEnablesRetryInClientPool() throws Exception { + // Regression test for 
#15759: verify that removing the inner try-catch allows + // ClientPoolImpl to retry on transient connection failures. + // + // Before the fix: inner catch converted SQLException -> UncheckedSQLException + // (RuntimeException) inside the lambda. ClientPoolImpl only catches the declared + // exception type (SQLException), so RuntimeException bypasses retry entirely. + // After the fix: SQLException propagates naturally, ClientPoolImpl catches it, + // and retries on transient connection exceptions. + Map props = Maps.newHashMap(); + props.put("username", "user"); + props.put("password", "password"); + String uri = "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", ""); + + try (JdbcClientPool pool = new JdbcClientPool(1, uri, props)) { + AtomicInteger attempts = new AtomicInteger(0); + + String result = + pool.run( + conn -> { + if (attempts.incrementAndGet() == 1) { + throw new SQLTransientConnectionException("transient failure"); + } + + return "success"; + }); + + assertThat(result).isEqualTo("success"); + assertThat(attempts.get()).isGreaterThan(1); + } + } + + @Test + void testUncheckedSQLExceptionBypassesRetry() throws Exception { + // Companion test: demonstrates that wrapping SQLException as UncheckedSQLException + // (the OLD behavior before the fix) prevents ClientPoolImpl from retrying. 
+ Map props = Maps.newHashMap(); + props.put("username", "user"); + props.put("password", "password"); + String uri = "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", ""); + + try (JdbcClientPool pool = new JdbcClientPool(1, uri, props)) { + assertThatThrownBy( + () -> + pool.run( + conn -> { + try { + throw new SQLTransientConnectionException("transient failure"); + } catch (java.sql.SQLException e) { + throw new UncheckedSQLException(e, "wrapped"); + } + })) + .isInstanceOf(UncheckedSQLException.class) + .hasMessageContaining("wrapped"); + } + } } diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java index bb53b5265655..88b949a9a7f8 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java @@ -29,9 +29,11 @@ import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.REMOVED_DATA_FILE_SIZE_METRIC; import static org.assertj.core.api.Assertions.assertThat; +import java.time.Instant; import java.util.List; import java.util.stream.StreamSupport; import org.apache.flink.streaming.api.graph.StreamGraphGenerator; +import org.apache.iceberg.FileFormat; import org.apache.iceberg.ManifestFiles; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; @@ -43,8 +45,14 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.FieldSource; class TestRewriteDataFiles extends MaintenanceTaskTestBase { + + private static final FileFormat[] FILE_FORMATS = + new FileFormat[] 
{FileFormat.AVRO, FileFormat.PARQUET, FileFormat.ORC}; + @Test void testRewriteUnpartitioned() throws Exception { Table table = createTable(); @@ -82,13 +90,14 @@ void testRewriteUnpartitioned() throws Exception { createRecord(4, "d"))); } - @Test - void testRewriteUnpartitionedPreserveLineage() throws Exception { - Table table = createTable(3); - insert(table, 1, "a"); - insert(table, 2, "b"); - insert(table, 3, "c"); - insert(table, 4, "d"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewriteUnpartitionedPreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createTable(3, fileFormat); + insert(table, 1, "a", fileFormat); + insert(table, 2, "b", fileFormat); + insert(table, 3, "c", fileFormat); + insert(table, 4, "d", fileFormat); assertFileNum(table, 4, 0); @@ -122,15 +131,17 @@ void testRewriteUnpartitionedPreserveLineage() throws Exception { schema); } - @Test - void testRewriteTheSameFilePreserveLineage() throws Exception { - Table table = createTable(3); - insert(table, 1, "a"); - insert(table, 2, "b"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewriteTheSameFilePreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createTable(3, fileFormat); + insert(table, 1, "a", fileFormat); + insert(table, 2, "b", fileFormat); // Create a file with two lines of data to verify that the rowid is read correctly. 
insert( table, - ImmutableList.of(SimpleDataUtil.createRecord(3, "c"), SimpleDataUtil.createRecord(4, "d"))); + ImmutableList.of(SimpleDataUtil.createRecord(3, "c"), SimpleDataUtil.createRecord(4, "d")), + fileFormat); assertFileNum(table, 3, 0); @@ -166,13 +177,14 @@ void testRewriteTheSameFilePreserveLineage() throws Exception { schema); } - @Test - void testRewritePartitionedPreserveLineage() throws Exception { - Table table = createPartitionedTable(3); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - insertPartitioned(table, 3, "p2"); - insertPartitioned(table, 4, "p2"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewritePartitionedPreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createPartitionedTable(3, fileFormat); + insertPartitioned(table, 1, "p1", fileFormat); + insertPartitioned(table, 2, "p1", fileFormat); + insertPartitioned(table, 3, "p2", fileFormat); + insertPartitioned(table, 4, "p2", fileFormat); assertFileNum(table, 4, 0); @@ -529,6 +541,57 @@ void testRewriteWithFilter() throws Exception { createRecord(4, "d"))); } + /** + * By verifying that the creation time of the data content in the builder is later than the + * creation time of the filter condition — if the filter condition is actually created in the + * planner, then all files can be compacted; otherwise, not all files can be compacted — we can + * confirm whether the filter condition is actually created in the planner. 
+ */ + @Test + void testRewriteWithFilterSupplier() throws Exception { + Table table = createTable(); + + appendRewriteDataFiles( + RewriteDataFiles.builder() + .parallelism(2) + .deleteFileThreshold(10) + .targetFileSizeBytes(1_000_000L) + .maxFileGroupSizeBytes(10_000_000L) + .maxFileSizeBytes(2_000_000L) + .minFileSizeBytes(500_000L) + .minInputFiles(2) + // Rewrite data files where id is less than current timestamp in planner + .filter(() -> Expressions.lessThan("id", (int) Instant.now().getEpochSecond())) + .partialProgressEnabled(true) + .partialProgressMaxCommits(1) + .maxRewriteBytes(100_000L) + .rewriteAll(false)); + + insert(table, 1, "a"); + insert(table, 2, "b"); + insert(table, 3, "c"); + + int epochSecond = (int) Instant.now().getEpochSecond(); + insert(table, epochSecond, "d"); + + assertFileNum(table, 4, 0); + + Thread.sleep(1_000L); + runAndWaitForSuccess(infra.env(), infra.source(), infra.sink()); + + // There are four files. The filter is evaluated in the planner, so every id is less than the + // current timestamp and all four files are rewritten into a single data file.
+ assertFileNum(table, 1, 0); + + SimpleDataUtil.assertTableRecords( + table, + ImmutableList.of( + createRecord(1, "a"), + createRecord(2, "b"), + createRecord(3, "c"), + createRecord(epochSecond, "d"))); + } + @Test void testBranch() throws Exception { Table table = createTable(); diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java index b9422a63d646..d6563e782e43 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java @@ -24,7 +24,10 @@ import java.io.File; import java.io.IOException; import java.nio.file.Path; +import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.util.List; +import java.util.concurrent.TimeUnit; import javax.annotation.Nullable; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.MetricOptions; @@ -79,6 +82,12 @@ public class OperatorTestBase { ImmutableMap.of(), ImmutableSet.of(SimpleDataUtil.SCHEMA.columns().get(0).fieldId())); + private static final Schema SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "ts", Types.TimestampType.withoutZone())); + protected static final String UID_SUFFIX = "UID-Dummy"; protected static final String SLOT_SHARING_GROUP = "SlotSharingGroup"; protected static final TriggerLockFactory LOCK_FACTORY = new MemoryLockFactory(); @@ -124,10 +133,14 @@ void after() throws IOException { } protected static Table createTable() { - return createTable(2); + return createTable(2, FileFormat.PARQUET); } protected static Table createTable(int formatVersion) { + return 
createTable(formatVersion, FileFormat.PARQUET); + } + + protected static Table createTable(int formatVersion, FileFormat fileFormat) { return CATALOG_EXTENSION .catalog() .createTable( @@ -136,12 +149,29 @@ protected static Table createTable(int formatVersion) { PartitionSpec.unpartitioned(), null, ImmutableMap.of( + "write.format.default", + fileFormat.name(), TableProperties.FORMAT_VERSION, String.valueOf(formatVersion), "flink.max-continuous-empty-commits", "100000")); } + protected static Table createTableWithTimestampWithoutZone() { + return CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE, + PartitionSpec.builderFor(SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE).identity("ts").build(), + null, + ImmutableMap.of( + TableProperties.FORMAT_VERSION, + "2", + "flink.max-continuous-empty-commits", + "100000")); + } + protected static Table createTableWithDelete() { return createTableWithDelete(2); } @@ -158,7 +188,7 @@ protected static Table createTableWithDelete(int formatVersion) { "format-version", String.valueOf(formatVersion), "write.upsert.enabled", "true")); } - protected static Table createPartitionedTable(int formatVersion) { + protected static Table createPartitionedTable(int formatVersion, FileFormat fileFormat) { return CATALOG_EXTENSION .catalog() .createTable( @@ -167,6 +197,8 @@ protected static Table createPartitionedTable(int formatVersion) { PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build(), null, ImmutableMap.of( + "write.format.default", + fileFormat.name(), "format-version", String.valueOf(formatVersion), "flink.max-continuous-empty-commits", @@ -174,17 +206,27 @@ protected static Table createPartitionedTable(int formatVersion) { } protected static Table createPartitionedTable() { - return createPartitionedTable(2); + return createPartitionedTable(2, FileFormat.PARQUET); } protected void insert(Table table, Integer id, String data) throws IOException { -
new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + insert(table, id, data, FileFormat.PARQUET); + } + + protected void insert(Table table, Integer id, String data, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir) .appendToTable(Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); table.refresh(); } protected void insert(Table table, List records) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir).appendToTable(records); + insert(table, records, FileFormat.PARQUET); + } + + protected void insert(Table table, List records, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir).appendToTable(records); table.refresh(); } @@ -194,6 +236,20 @@ protected void insert(Table table, Integer id, String data, String extra) throws table.refresh(); } + protected void insertWithTimestampWithoutZone( + Table table, Integer id, String data, LocalDateTime ts) throws IOException { + GenericRecord record = GenericRecord.create(SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE); + record.setField("id", id); + record.setField("data", data); + record.setField("ts", ts); + long tsMicros = + TimeUnit.SECONDS.toMicros(ts.toEpochSecond(ZoneOffset.UTC)) + + TimeUnit.NANOSECONDS.toMicros(ts.getNano()); + new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + .appendToTable(TestHelpers.Row.of(tsMicros), Lists.newArrayList(record)); + table.refresh(); + } + /** * For the same identifier column id this methods simulate the following row operations: *
  • add an equality delete on oldData @@ -271,7 +327,12 @@ protected void update(Table table, Integer id, String oldData, String tempData, } protected void insertPartitioned(Table table, Integer id, String data) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + insertPartitioned(table, id, data, FileFormat.PARQUET); + } + + protected void insertPartitioned(Table table, Integer id, String data, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir) .appendToTable( TestHelpers.Row.of(data), Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); table.refresh(); diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java index 8a8a2fa194d4..7b8f638b7e2f 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java @@ -57,7 +57,7 @@ static List planDataFileRewrite( 11, 10_000_000L, rewriterOptions, - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java index 16d524f05cf7..8300df8c94eb 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java @@ -24,6 +24,9 @@ import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.planDataFileRewrite; import static org.assertj.core.api.Assertions.assertThat; +import java.time.Duration; 
+import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -107,7 +110,7 @@ void testError() throws Exception { 11, 1L, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); @@ -174,7 +177,7 @@ void testMaxRewriteBytes() throws Exception { 11, maxRewriteBytes, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); @@ -228,7 +231,7 @@ void testBranch() throws Exception { 11, 10_000_000L, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, branchName))) { testHarness.open(); @@ -243,6 +246,46 @@ void testBranch() throws Exception { } } + @Test + void testFilterSupplierWithTimestamp() throws Exception { + Table table = createTableWithTimestampWithoutZone(); + + LocalDateTime oldTs = LocalDateTime.now().minusDays(10); + insertWithTimestampWithoutZone(table, 1, "old_a", oldTs); + insertWithTimestampWithoutZone(table, 2, "old_b", oldTs); + + LocalDateTime recentTs = LocalDateTime.now().minusHours(1); + insertWithTimestampWithoutZone(table, 3, "new_a", recentTs); + insertWithTimestampWithoutZone(table, 4, "new_b", recentTs); + + try (OneInputStreamOperatorTestHarness + testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new DataFileRewritePlanner( + OperatorTestBase.DUMMY_TABLE_NAME, + OperatorTestBase.DUMMY_TABLE_NAME, + 0, + tableLoader(), + 11, + 10_000_000L, + ImmutableMap.of(MIN_INPUT_FILES, "2"), + () -> + Expressions.greaterThanOrEqual( + "ts", + LocalDateTime.now(ZoneOffset.UTC).minus(Duration.ofDays(3)).toString()), + SnapshotRef.MAIN_BRANCH))) { + testHarness.open(); + + trigger(testHarness); + + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + List planned = testHarness.extractOutputValues(); + + 
assertThat(planned).hasSize(1); + assertThat(planned.get(0).group().fileScanTasks()).hasSize(2); + } + } + void assertRewriteFileGroup( DataFileRewritePlanner.PlannedGroup plannedGroup, Table table, Set files) { assertThat(plannedGroup.table().currentSnapshot().snapshotId()) diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java index 9202a1df92af..62b29e7c017a 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java @@ -309,7 +309,7 @@ void testSplitSize() throws Exception { "2", TARGET_FILE_SIZE_BYTES, String.valueOf(targetFileSize)), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java new file mode 100644 index 000000000000..42bbfc0d3628 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatNoException; + +import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.iceberg.io.WriteResult; +import org.junit.jupiter.api.Test; + +public class TestIcebergStreamWriterMetrics { + + @Test + void histogramsCreatedWhenDropwizardAvailable() { + IcebergStreamWriterMetrics metrics = + new IcebergStreamWriterMetrics( + UnregisteredMetricsGroup.createSinkWriterMetricGroup(), "db.table"); + + assertThat(metrics.dataFilesSizeHistogram()).isNotNull(); + assertThat(metrics.deleteFilesSizeHistogram()).isNotNull(); + + assertThatNoException() + .isThrownBy(() -> metrics.updateFlushResult(WriteResult.builder().build())); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java index 27b1e3d84a8c..89befb9e8ea2 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java @@ -43,7 +43,10 @@ import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.RestartStrategyOptions; import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.flink.runtime.OperatorIDPair; import 
org.apache.flink.runtime.client.JobExecutionException; +import org.apache.flink.runtime.jobgraph.JobVertex; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.DataStreamSource; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; @@ -75,6 +78,7 @@ import org.apache.iceberg.flink.CatalogLoader; import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.TestHelpers; @@ -83,6 +87,7 @@ import org.apache.iceberg.flink.sink.dynamic.TestDynamicCommitter.FailBeforeAndAfterCommit; import org.apache.iceberg.inmemory.InMemoryInputFile; import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; @@ -119,6 +124,7 @@ private static class DynamicIcebergDataImpl implements Serializable { PartitionSpec partitionSpec; boolean upsertMode; Set equalityFields; + int writeParallelism; private DynamicIcebergDataImpl( Schema schemaProvided, String tableName, String branch, PartitionSpec partitionSpec) { @@ -130,7 +136,8 @@ private DynamicIcebergDataImpl( partitionSpec, false, Collections.emptySet(), - false); + false, + 10); } private DynamicIcebergDataImpl( @@ -147,7 +154,8 @@ private DynamicIcebergDataImpl( partitionSpec, false, Collections.emptySet(), - false); + false, + 10); } private DynamicIcebergDataImpl( @@ -166,7 +174,8 @@ private DynamicIcebergDataImpl( partitionSpec, upsertMode, equalityFields, - isDuplicate); + 
isDuplicate, + 10); } private DynamicIcebergDataImpl( @@ -177,7 +186,8 @@ private DynamicIcebergDataImpl( PartitionSpec partitionSpec, boolean upsertMode, Set equalityFields, - boolean isDuplicate) { + boolean isDuplicate, + int writeParallelism) { this.rowProvided = randomRow(schemaProvided, isDuplicate ? seed : ++seed); this.rowExpected = isDuplicate ? null : rowProvided; this.schemaProvided = schemaProvided; @@ -187,6 +197,7 @@ private DynamicIcebergDataImpl( this.partitionSpec = partitionSpec; this.upsertMode = upsertMode; this.equalityFields = equalityFields; + this.writeParallelism = writeParallelism; } } @@ -206,6 +217,56 @@ public void generate(DynamicIcebergDataImpl row, Collector out) { converter(schema).toInternal(row.rowProvided), spec, spec.isPartitioned() ? DistributionMode.HASH : DistributionMode.NONE, + row.writeParallelism); + dynamicRecord.setUpsertMode(row.upsertMode); + dynamicRecord.setEqualityFields(row.equalityFields); + out.collect(dynamicRecord); + } + } + + /** Generator that always emits forward (null distributionMode) records. */ + private static class ForwardGenerator implements DynamicRecordGenerator { + @Override + public void generate(DynamicIcebergDataImpl row, Collector out) { + TableIdentifier tableIdentifier = TableIdentifier.of(DATABASE, row.tableName); + Schema schema = row.schemaProvided; + PartitionSpec spec = row.partitionSpec; + DynamicRecord dynamicRecord = + new DynamicRecord( + tableIdentifier, + row.branch, + schema, + converter(schema).toInternal(row.rowProvided), + spec); + dynamicRecord.setUpsertMode(row.upsertMode); + dynamicRecord.setEqualityFields(row.equalityFields); + out.collect(dynamicRecord); + } + } + + /** + * Generator that alternates between forward (null distributionMode) and shuffle records. Even + * indices go forward, odd indices go through shuffle. 
+ */ + private static class MixedGenerator implements DynamicRecordGenerator { + private int count = 0; + + @Override + public void generate(DynamicIcebergDataImpl row, Collector out) { + TableIdentifier tableIdentifier = TableIdentifier.of(DATABASE, row.tableName); + Schema schema = row.schemaProvided; + PartitionSpec spec = row.partitionSpec; + boolean forward = (count++ % 2 == 0); + DistributionMode mode = + forward ? null : (spec.isPartitioned() ? DistributionMode.HASH : DistributionMode.NONE); + DynamicRecord dynamicRecord = + new DynamicRecord( + tableIdentifier, + row.branch, + schema, + converter(schema).toInternal(row.rowProvided), + spec, + mode, 10); dynamicRecord.setUpsertMode(row.upsertMode); dynamicRecord.setEqualityFields(row.equalityFields); @@ -238,6 +299,109 @@ void testWrite() throws Exception { runTest(rows); } + @Test + void testNoShuffleTopology() throws Exception { + DataStream dataStream = + env.fromData( + Collections.emptyList(), TypeInformation.of(new TypeHint() {})); + DynamicIcebergSink.forInput(dataStream) + .generator(new ForwardGenerator()) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .writeParallelism(2) + .immediateTableUpdate(false) + .overwrite(false) + .append(); + + boolean generatorAndSinkChained = false; + for (JobVertex vertex : env.getStreamGraph().getJobGraph().getVertices()) { + boolean generatorInThisVertex = false; + boolean sinkInThisVertex = false; + for (OperatorIDPair operatorID : vertex.getOperatorIDs()) { + String uid = operatorID.getUserDefinedOperatorUid(); + if (uid == null) { + continue; + } + + if (uid.endsWith("-forward-writer")) { + sinkInThisVertex = true; + } else if (uid.endsWith("-generator")) { + generatorInThisVertex = true; + } + } + + generatorAndSinkChained = generatorInThisVertex && sinkInThisVertex; + if (generatorAndSinkChained) { + break; + } + } + + assertThat(generatorAndSinkChained).isTrue(); + } + + @Test + void testForwardWrite() throws Exception { + runForwardWriteTest(new 
ForwardGenerator()); + } + + @Test + void testMixedForwardAndShuffleWrite() throws Exception { + runForwardWriteTest(new MixedGenerator()); + } + + private void runForwardWriteTest(DynamicRecordGenerator generator) + throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned())); + + DataStream dataStream = + env.fromData(rows, TypeInformation.of(new TypeHint<>() {})); + env.setParallelism(1); + + DynamicIcebergSink.forInput(dataStream) + .generator(generator) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .writeParallelism(1) + .immediateTableUpdate(true) + .append(); + + env.execute(); + + verifyResults(rows); + } + + @Test + void testWriteWithNullBranch() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", null, PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", null, PartitionSpec.unpartitioned())); + + runTest( + rows, this.env, false, 1, ImmutableMap.of(FlinkWriteOptions.BRANCH.key(), "test-branch")); + } + @Test void testWritePartitioned() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 10).build(); @@ -1170,8 +1334,9 @@ void testOperatorUidsFormat() { // pre commit topology was off, but since it is stateless, users will still be able to restore // state, but we must keep the stateful operators UUIds like the committer consistent. 
assertThat(sinkUids) - .contains( + .containsOnly( "test--sink", + "test--forward-writer", "test--generator", "test--updater", "test--sink: test--pre-commit-topology", @@ -1179,8 +1344,9 @@ void testOperatorUidsFormat() { sinkUids = createSinkAndReturnUIds(""); assertThat(sinkUids) - .contains( + .containsOnly( "--sink", + "--forward-writer", "--generator", "--updater", "--sink: --pre-commit-topology", @@ -1188,14 +1354,71 @@ void testOperatorUidsFormat() { sinkUids = createSinkAndReturnUIds(null); assertThat(sinkUids) - .contains( + .containsOnly( "--sink", + "--forward-writer", "--generator", "--updater", "--sink: --pre-commit-topology", "Sink Committer: --sink"); } + @Test + void testGeneratorDefaultParallelism() { + StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); + streamEnv.setParallelism(4); + + DataStreamSource source = + streamEnv.fromData(Collections.emptySet(), TypeInformation.of(new TypeHint<>() {})); + source.setParallelism(8); + + DynamicIcebergSink.forInput(source) + .generator(new Generator()) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .uidPrefix("test") + .append(); + + // Since the generator parallelism is not directly accessible via the returned DataStreamSink, + // inspect the stream graph to verify the generator inherits the input source parallelism. 
+ int generatorParallelism = + streamEnv.getStreamGraph().getStreamNodes().stream() + .filter(node -> "test--generator".equals(node.getTransformationUID())) + .findFirst() + .map(StreamNode::getParallelism) + .orElseThrow(() -> new AssertionError("Generator node not found")); + + assertThat(generatorParallelism).isEqualTo(source.getParallelism()); + } + + @Test + void testFallBackParallelismFromConfig() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned(), + false, + Collections.emptySet(), + false, + -1), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned(), + false, + Collections.emptySet(), + false, + 0)); + + runTest( + rows, this.env, true, 2, ImmutableMap.of(FlinkWriteOptions.WRITE_PARALLELISM.key(), "1")); + } + private Set createSinkAndReturnUIds(String uidPrefix) { StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); @@ -1304,6 +1527,18 @@ private void runTest( verifyResults(dynamicData); } + private void runTest( + List dynamicData, + StreamExecutionEnvironment env, + boolean immediateUpdate, + int parallelism, + Map writeProperties) + throws Exception { + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, null, false, writeProperties); + verifyResults(dynamicData, writeProperties); + } + private void executeDynamicSink( List dynamicData, StreamExecutionEnvironment env, @@ -1311,7 +1546,8 @@ private void executeDynamicSink( int parallelism, @Nullable CommitHook commitHook) throws Exception { - executeDynamicSink(dynamicData, env, immediateUpdate, parallelism, commitHook, false); + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, commitHook, false, Maps.newHashMap()); } private void executeDynamicSink( @@ -1322,6 +1558,19 @@ private void 
executeDynamicSink( @Nullable CommitHook commitHook, boolean overwrite) throws Exception { + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, commitHook, overwrite, Maps.newHashMap()); + } + + private void executeDynamicSink( + List dynamicData, + StreamExecutionEnvironment env, + boolean immediateUpdate, + int parallelism, + @Nullable CommitHook commitHook, + boolean overwrite, + Map writeProperties) + throws Exception { DataStream dataStream = env.fromData(dynamicData, TypeInformation.of(new TypeHint<>() {})); env.setParallelism(parallelism); @@ -1335,6 +1584,7 @@ private void executeDynamicSink( .immediateTableUpdate(immediateUpdate) .setSnapshotProperty("commit.retry.num-retries", "0") .overwrite(overwrite) + .setAll(writeProperties) .append(); } else { DynamicIcebergSink.forInput(dataStream) @@ -1343,6 +1593,7 @@ private void executeDynamicSink( .writeParallelism(parallelism) .immediateTableUpdate(immediateUpdate) .overwrite(overwrite) + .setAll(writeProperties) .append(); } @@ -1359,7 +1610,9 @@ static class CommitHookEnabledDynamicIcebergSink extends DynamicIcebergSink.B @Override DynamicIcebergSink instantiateSink( - Map writeProperties, Configuration flinkConfig) { + Map writeProperties, + Configuration flinkConfig, + DataStream> forwardWriteResults) { return new CommitHookDynamicIcebergSink( commitHook, CATALOG_EXTENSION.catalogLoader(), @@ -1367,7 +1620,7 @@ DynamicIcebergSink instantiateSink( "uidPrefix", writeProperties, flinkConfig, - 100); + forwardWriteResults); } } @@ -1383,14 +1636,15 @@ static class CommitHookDynamicIcebergSink extends DynamicIcebergSink { String uidPrefix, Map writeProperties, Configuration flinkConfig, - int cacheMaximumSize) { + DataStream> forwardWritten) { super( catalogLoader, snapshotProperties, uidPrefix, writeProperties, flinkConfig, - cacheMaximumSize); + 100, + forwardWritten); this.commitHook = commitHook; this.overwriteMode = new FlinkWriteConf(writeProperties, flinkConfig).overwriteMode(); } @@ 
-1409,6 +1663,12 @@ public Committer createCommitter(CommitterInitContext contex } private void verifyResults(List dynamicData) throws IOException { + verifyResults(dynamicData, Maps.newHashMap()); + } + + private void verifyResults( + List dynamicData, Map writeProperties) + throws IOException { // Calculate the expected result Map, List> expectedData = Maps.newHashMap(); Map expectedSchema = Maps.newHashMap(); @@ -1422,9 +1682,12 @@ private void verifyResults(List dynamicData) throws IOEx dynamicData.forEach( r -> { + String branch = + MoreObjects.firstNonNull( + r.branch, writeProperties.get(FlinkWriteOptions.BRANCH.key())); List data = expectedData.computeIfAbsent( - Tuple2.of(r.tableName, r.branch), unused -> Lists.newArrayList()); + Tuple2.of(r.tableName, branch), unused -> Lists.newArrayList()); data.addAll( convertToRowData(expectedSchema.get(r.tableName), ImmutableList.of(r.rowExpected))); }); diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java new file mode 100644 index 000000000000..de55621475ed --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Collections; +import java.util.Set; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +class TestDynamicRecordWithConfig { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + + private static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of("db", "table"); + private static final PartitionSpec UNPARTITIONED = PartitionSpec.unpartitioned(); + private static final RowData ROW_DATA = GenericRowData.of(1, StringData.fromString("test")); + + @Test + void testBranchFallBack() { + String defaultBranch = "default-branch"; + FlinkWriteConf conf = + new FlinkWriteConf( + 
ImmutableMap.of(FlinkWriteOptions.BRANCH.key(), defaultBranch), new Configuration()); + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(conf); + + DynamicRecord dynamicRecord = + new DynamicRecord(TABLE_IDENTIFIER, null, SCHEMA, ROW_DATA, UNPARTITIONED); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).branch()).isEqualTo(defaultBranch); + + String customBranch = "custom-branch"; + dynamicRecord.setBranch(customBranch); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).branch()).isEqualTo(customBranch); + } + + @Test + void testWriteParallelismFallBack() { + int defaultParallelism = 4; + FlinkWriteConf conf = + new FlinkWriteConf( + ImmutableMap.of( + FlinkWriteOptions.WRITE_PARALLELISM.key(), String.valueOf(defaultParallelism)), + new Configuration()); + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(conf); + + DynamicRecord dynamicRecord = + new DynamicRecord(TABLE_IDENTIFIER, null, SCHEMA, ROW_DATA, UNPARTITIONED, null, -1); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()) + .isEqualTo(defaultParallelism); + + dynamicRecord.writeParallelism(0); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()) + .isEqualTo(defaultParallelism); + + dynamicRecord.writeParallelism(8); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()).isEqualTo(8); + } + + @Test + void testDelegatesToWrappedRecord() { + FlinkWriteConf conf = new FlinkWriteConf(Collections.emptyMap(), new Configuration()); + PartitionSpec partitioned = PartitionSpec.builderFor(SCHEMA).identity("id").build(); + Set equalityFields = ImmutableSet.of("id", "data"); + + DynamicRecord dynamicRecord = + new DynamicRecord( + TABLE_IDENTIFIER, + SnapshotRef.MAIN_BRANCH, + SCHEMA, + ROW_DATA, + partitioned, + DistributionMode.HASH, + 2); + dynamicRecord.setUpsertMode(true); + dynamicRecord.setEqualityFields(equalityFields); + + DynamicRecordWithConfig record = new 
DynamicRecordWithConfig(conf).wrap(dynamicRecord); + + assertThat(record.tableIdentifier()).isEqualTo(TABLE_IDENTIFIER); + assertThat(record.schema()).isEqualTo(SCHEMA); + assertThat(record.spec()).isEqualTo(partitioned); + assertThat(record.rowData()).isSameAs(ROW_DATA); + assertThat(record.distributionMode()).isEqualTo(DistributionMode.HASH); + assertThat(record.upsertMode()).isTrue(); + assertThat(record.equalityFields()).isEqualTo(equalityFields); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java index 1c8e6df8591d..f6b2b368c2be 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java @@ -23,12 +23,14 @@ import static org.assertj.core.api.Assertions.assertThat; import java.util.Collections; +import org.apache.flink.configuration.Configuration; import org.apache.flink.table.data.GenericRowData; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.RegisterExtension; @@ -57,9 +59,6 @@ class TestDynamicTableUpdateOperator { @Test void testDynamicTableUpdateOperatorNewTable() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); @@ -67,12 +66,8 @@ void testDynamicTableUpdateOperatorNewTable() throws 
Exception { DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open(null); DynamicRecordInternal input = @@ -93,21 +88,14 @@ void testDynamicTableUpdateOperatorNewTable() throws Exception { @Test void testDynamicTableUpdateOperatorSchemaChange() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open(null); catalog.createTable(table, SCHEMA1); @@ -135,9 +123,6 @@ void testDynamicTableUpdateOperatorSchemaChange() throws Exception { @ParameterizedTest @ValueSource(booleans = {true, false}) void testCaseInSensitivity(boolean caseSensitive) throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); @@ -148,12 +133,8 @@ void testCaseInSensitivity(boolean caseSensitive) throws Exception { DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - caseSensitive, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(caseSensitive, PRESERVE_COLUMNS)); operator.open(null); catalog.createTable(table, initialSchema); @@ -187,21 +168,14 @@ void 
testCaseInSensitivity(boolean caseSensitive) throws Exception { @Test void testDynamicTableUpdateOperatorPreserveUnusedColumns() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open(null); catalog.createTable(table, SCHEMA2); @@ -228,21 +202,14 @@ void testDynamicTableUpdateOperatorPreserveUnusedColumns() throws Exception { @Test void testDynamicTableUpdateOperatorDropUnusedColumns() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_INSENSITIVE, - DROP_COLUMNS); + flinkDynamicSinkConfiguration(CASE_INSENSITIVE, DROP_COLUMNS)); operator.open(null); catalog.createTable(table, SCHEMA2); @@ -265,4 +232,13 @@ void testDynamicTableUpdateOperatorDropUnusedColumns() throws Exception { assertThat(tableSchema.findField("data")).isNull(); assertThat(input).isEqualTo(output); } + + private static FlinkDynamicSinkConf flinkDynamicSinkConfiguration( + boolean caseSensitive, boolean dropUnusedColumns) { + return new FlinkDynamicSinkConf( + ImmutableMap.of( + FlinkDynamicSinkOptions.CASE_SENSITIVE.key(), String.valueOf(caseSensitive), + FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key(), String.valueOf(dropUnusedColumns)), + new Configuration()); + } 
} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java index c65f96b12cbb..9a485fafaf47 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.Set; import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.state.KeyGroupRangeAssignment; import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; @@ -34,6 +35,9 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; @@ -229,6 +233,38 @@ void testFailOnNonPositiveWriteParallelism() { }); } + @Test + void testNonPositiveWriteParallelismConfigFallback() throws Exception { + int maxWriteParallelism = 5; + HashKeyGenerator generator = new HashKeyGenerator(16, maxWriteParallelism); + PartitionSpec unpartitioned = PartitionSpec.unpartitioned(); + FlinkWriteConf flinkWriteConf = + new FlinkWriteConf( + ImmutableMap.of(FlinkWriteOptions.WRITE_PARALLELISM.key(), "2"), new Configuration()); + + Set writeKeys = Sets.newHashSet(); + for (int i = 0; i < 20; i++) { + GenericRowData row = GenericRowData.of(i, StringData.fromString("z")); + writeKeys.add( + getWriteKey( + generator, + unpartitioned, + DistributionMode.NONE, + i % 2 == 0 ? 
0 : -1, + Collections.emptySet(), + row, + flinkWriteConf)); + } + + assertThat(writeKeys).hasSize(2); + assertThat( + writeKeys.stream() + .map(key -> getSubTaskId(key, 2, maxWriteParallelism)) + .distinct() + .count()) + .isEqualTo(2); + } + @Test void testCapAtMaxWriteParallelism() throws Exception { int writeParallelism = 10; @@ -477,10 +513,31 @@ private static int getWriteKey( Set equalityFields, GenericRowData row) throws Exception { - DynamicRecord record = + return getWriteKey( + generator, + spec, + mode, + writeParallelism, + equalityFields, + row, + new FlinkWriteConf(Collections.emptyMap(), new Configuration())); + } + + private static int getWriteKey( + HashKeyGenerator generator, + PartitionSpec spec, + DistributionMode mode, + int writeParallelism, + Set equalityFields, + GenericRowData row, + FlinkWriteConf flinkWriteConf) + throws Exception { + DynamicRecord inputRecord = new DynamicRecord(TABLE_IDENTIFIER, BRANCH, SCHEMA, row, spec, mode, writeParallelism); - record.setEqualityFields(equalityFields); - return generator.generateKey(record); + inputRecord.setEqualityFields(equalityFields); + + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(flinkWriteConf); + return generator.generateKey(dynamicRecordWithConfig.wrap(inputRecord)); } private static int getSubTaskId(int writeKey1, int writeParallelism, int maxWriteParallelism) { diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java index f84cf7fb1aae..ec9333674d03 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java @@ -225,6 +225,68 @@ public void apply( 3))); } + /** + * Integration test verifying that records with 
eventTime equal to the minimum timestamp of their + * split are correctly included in windows. The {@link + * org.apache.iceberg.flink.source.reader.WatermarkExtractorRecordEmitter} emits the watermark as + * {@code minSplitTs - 1}, so records at exactly {@code minSplitTs} are on-time rather than late. + * + *

    The test writes 3 records at epoch (t=0). The split's column-stats lower-bound is 0, so the + * extracted watermark is 0ms and the emitted watermark is -1ms. Records at t=0 are strictly after + * that watermark and therefore belong to the [0, 5min) window. A later split is then appended to + * advance the watermark past the window boundary and trigger its evaluation. + */ + @Test + public void testWindowingWithRecordsAtSplitMinTimestamp() throws Exception { + GenericAppenderHelper dataAppender = appender(); + + // File 1: 3 records at exactly t=0 (epoch). Extracted watermark = 0ms, emitted = -1ms. + List batch = + ImmutableList.of( + generateRecord(0, "file_1-recordTs_0_a"), + generateRecord(0, "file_1-recordTs_0_b"), + generateRecord(0, "file_1-recordTs_0_c")); + dataAppender.appendToTable(batch); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(1); + + DataStream stream = + env.fromSource( + source(), + WatermarkStrategy.noWatermarks() + .withTimestampAssigner(new RowDataTimestampAssigner()), + SOURCE_NAME, + TypeInformation.of(RowData.class)); + + stream + .windowAll(TumblingEventTimeWindows.of(Duration.ofMinutes(5))) + .apply( + new AllWindowFunction() { + @Override + public void apply( + TimeWindow window, Iterable values, Collector out) { + AtomicInteger count = new AtomicInteger(0); + values.forEach(a -> count.incrementAndGet()); + out.collect(row(window.getStart(), count.get())); + WINDOWS.put(window.getStart(), count.get()); + } + }); + + WINDOWS.clear(); + env.executeAsync("Iceberg Source Min Timestamp Windowing Test"); + + // Append a file with much later timestamps to advance the watermark past [0, 5min) + dataAppender.appendToTable( + dataAppender.writeFile(ImmutableList.of(generateRecord(1500, "last-record")))); + + // The [0, 5min) window should fire with all 3 records written at epoch + Awaitility.await() + .pollInterval(Duration.ofMillis(10)) + .atMost(30, TimeUnit.SECONDS) + 
.until(() -> Integer.valueOf(3).equals(WINDOWS.get(0L))); + } + /** * This is an integration test for watermark handling and throttling. Integration testing the * following: diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java index 5dd7de545e11..09639a8a9568 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java @@ -31,7 +31,7 @@ void testIncrementalFromSnapshotId() { .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) .build(); assertException( - context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + context, "Invalid starting snapshot id for INCREMENTAL_FROM_SNAPSHOT_ID strategy: null"); context = ScanContext.builder() @@ -42,7 +42,7 @@ void testIncrementalFromSnapshotId() { .build(); assertException( context, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + "Invalid starting snapshot timestamp for INCREMENTAL_FROM_SNAPSHOT_ID strategy: not null"); } @Test @@ -54,7 +54,7 @@ void testIncrementalFromSnapshotTimestamp() { .build(); assertException( context, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + "Invalid starting snapshot timestamp for INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP strategy: null"); context = ScanContext.builder() @@ -64,7 +64,8 @@ void testIncrementalFromSnapshotTimestamp() { .startSnapshotTimestamp(1L) .build(); assertException( - context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + context, + "Invalid starting snapshot id for INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP strategy: not null"); } @Test diff --git 
a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java new file mode 100644 index 000000000000..1cb7be03c6a7 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.common.eventtime.Watermark; +import org.apache.flink.api.connector.source.SourceOutput; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestWatermarkExtractorRecordEmitter { + @TempDir protected Path temporaryFolder; + + @Test + public void testWatermarkIsDecrementedByOne() throws IOException { + long extractedWatermark = 1000L; + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> extractedWatermark); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + emitter.emitRecord(new RecordAndPosition<>("record", 0, 0L), output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(extractedWatermark - 1); + } + + @Test + public void testWatermarkEmittedOnlyOncePerSplit() throws IOException { + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> 1000L); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split); + emitter.emitRecord(element, output, split); + emitter.emitRecord(element, output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.records).hasSize(3); + } + + 
@Test + public void testWatermarkNotEmittedWhenNewSplitHasLowerValue() throws IOException { + IcebergSourceSplit split1 = createSplit(1L); + IcebergSourceSplit split2 = createSplit(2L); + + Map watermarkMap = Maps.newHashMap(); + watermarkMap.put(split1.splitId(), 2000L); + watermarkMap.put(split2.splitId(), 1000L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> watermarkMap.get(s.splitId())); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split1); + emitter.emitRecord(element, output, split2); + + // Only split1's watermark is emitted; split2 has a lower value so it's skipped + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(1999L); + } + + @Test + public void testWatermarkEmittedForEachHigherSplit() throws IOException { + IcebergSourceSplit split1 = createSplit(1L); + IcebergSourceSplit split2 = createSplit(2L); + + Map watermarkMap = Maps.newHashMap(); + watermarkMap.put(split1.splitId(), 1000L); + watermarkMap.put(split2.splitId(), 2000L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> watermarkMap.get(s.splitId())); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split1); + emitter.emitRecord(element, output, split2); + + assertThat(output.watermarks).hasSize(2); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(999L); + assertThat(output.watermarks.get(1).getTimestamp()).isEqualTo(1999L); + } + + @Test + public void testWatermarkAtLongMinValueDoesNotOverflow() throws IOException { + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> Long.MIN_VALUE); + + 
CapturingSourceOutput output = new CapturingSourceOutput<>(); + emitter.emitRecord(new RecordAndPosition<>("record", 0, 0L), output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(Long.MIN_VALUE); + } + + private IcebergSourceSplit createSplit(long seed) throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(seed, TestFixtures.SCHEMA, 1, 1); + return IcebergSourceSplit.fromCombinedScanTask( + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, FileFormat.PARQUET, TestFixtures.SCHEMA)); + } + + private static class CapturingSourceOutput implements SourceOutput { + final List watermarks = Lists.newArrayList(); + final List records = Lists.newArrayList(); + + @Override + public void collect(T record) { + records.add(record); + } + + @Override + public void collect(T record, long timestamp) { + records.add(record); + } + + @Override + public void emitWatermark(Watermark watermark) { + watermarks.add(watermark); + } + + @Override + public void markIdle() {} + + @Override + public void markActive() {} + } +} diff --git a/flink/v2.1/build.gradle b/flink/v2.1/build.gradle index 91081bdc2e42..f93b61646e7c 100644 --- a/flink/v2.1/build.gradle +++ b/flink/v2.1/build.gradle @@ -33,7 +33,8 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { implementation project(':iceberg-hive-metastore') compileOnly libs.flink21.avro - // for dropwizard histogram metrics implementation + compileOnly libs.joda.time + // dropwizard histogram metrics (optional in Flink) compileOnly libs.flink21.metrics.dropwizard compileOnly libs.flink21.streaming.java compileOnly "${libs.flink21.streaming.java.get().module}:${libs.flink21.streaming.java.get().getVersion()}:tests" @@ -83,6 +84,8 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: 
':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts') + testImplementation(testFixtures(project(':iceberg-parquet'))) // By default, hive-exec is a fat/uber jar and it exports a guava library // that's really old. We use the core classifier to be able to override our guava @@ -169,9 +172,6 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { exclude group: 'com.google.code.findbugs', module: 'jsr305' } - // for dropwizard histogram metrics implementation - implementation libs.flink21.metrics.dropwizard - // for integration testing with the flink-runtime-jar // all of those dependencies are required because the integration test extends FlinkTestBase integrationCompileOnly project(':iceberg-api') @@ -266,4 +266,6 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/flink/v2.1/flink-runtime/LICENSE b/flink/v2.1/flink-runtime/LICENSE index 36a03cb4fcf9..d73eda0104b9 100644 --- a/flink/v2.1/flink-runtime/LICENSE +++ b/flink/v2.1/flink-runtime/LICENSE @@ -219,6 +219,94 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). 
+ +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). 
+ +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. 
Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles Apache Parquet. Copyright: 2014-2020 The Apache Software Foundation. @@ -227,7 +315,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Thrift. +This product bundles Apache Thrift (bundled by Parquet). Copyright: 2006-2010 The Apache Software Foundation. Project URL: https://thrift.apache.org/ @@ -235,55 +323,57 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Fastutil. +This product includes code from Daniel Lemire's JavaFastPFOR project (bundled by Parquet). 
-Copyright: 2002-2014 Sebastiano Vigna -Project URL: http://fastutil.di.unimi.it/ +Copyright: 2013 Daniel Lemire +Project URL: https://github.com/lemire/JavaFastPFOR License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache ORC. +This product bundles fastutil (bundled by Parquet). -Copyright: 2013-2020 The Apache Software Foundation. -Project URL: https://orc.apache.org/ +Copyright: 2002-2014 Sebastiano Vigna +Project URL: http://fastutil.di.unimi.it/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache Datasketches. +This product bundles Zero-Allocation Hashing (bundled by Parquet). -Project URL: https://datasketches.apache.org/ +Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache Hive. +This product bundles Apache ORC. Copyright: 2013-2020 The Apache Software Foundation. -Project URL: https://hive.apache.org/ +Project URL: https://orc.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Airlift Aircompressor. +This product bundles Apache Hive's Storage API (bundled by ORC). -Copyright: 2011-2020 Aircompressor authors. -Project URL: https://github.com/airlift/aircompressor +Copyright: 2008-2020 The Apache Software Foundation. 
+Project URL: https://hive.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Google GAX. +This product bundles Google protobuf (bundled by ORC). -Project URL: https://github.com/googleapis/gax-java +Copyright: 2008 Google Inc. +Project URL: https://developers.google.com/protocol-buffers License: BSD 3-Clause -| Copyright 2016, Google Inc. All rights reserved. -| + +| Copyright 2008 Google Inc. All rights reserved. +| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are | met: -| +| | * Redistributions of source code must retain the above copyright | notice, this list of conditions and the following disclaimer. | * Redistributions in binary form must reproduce the above @@ -293,7 +383,7 @@ License: BSD 3-Clause | * Neither the name of Google Inc. nor the names of its | contributors may be used to endorse or promote products derived from | this software without specific prior written permission. -| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -305,40 +395,26 @@ License: BSD 3-Clause | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +| +| Code generated by the Protocol Buffer compiler is owned by the owner +| of the input file used when generating it. This code is not +| standalone and requires a support library to be linked with it. This +| support library is itself covered by the above license. 
-------------------------------------------------------------------------------- -This product bundles Google Auth Library. +This product bundles Apache Datasketches. -License: BSD 3-Clause -| Copyright 2014, Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Project URL: https://datasketches.apache.org/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles Airlift Aircompressor. 
+ +Copyright: 2011-2020 Aircompressor authors. +Project URL: https://github.com/airlift/aircompressor +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -363,6 +439,7 @@ This product bundles checkerframework checker-qual. Copyright: 2004-2020 the Checker Framework developers Project URL: https://github.com/typetools/checker-framework License: MIT + | The annotations are licensed under the MIT License. (The text of this | license appears below.) More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -401,87 +478,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google protobuf. - -Copyright: 2008 Google Inc. -Project URL: https://developers.google.com/protocol-buffers -License: BSD 3-Clause -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. 
-| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. - --------------------------------------------------------------------------------- - -This product bundles ThreeTen BP. - -Project URL: https://www.threeten.org/threetenbp -License: BSD 3-Clause -| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| -| All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. 
-| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. Project URL: https://www.threeten.org/threeten-extra/ License: BSD 3-Clause + +| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. | All rights reserved. | | * Redistribution and use in source and binary forms, with or without @@ -530,15 +533,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache HttpComponents (core and client). - -Copyright: 1999-2022 The Apache Software Foundation. 
-Project URL: https://hc.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Apache HttpComponents Client. +This product bundles and includes code from Apache HttpComponents (core/client). * retry and error handling logic in ExponentialHttpRequestRetryStrategy.java @@ -556,14 +551,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Codahale Metrics. - -Copyright: (c) 2010-2013 Coda Hale, Yammer.com, 2014-2021 Dropwizard Team -Project URL: https://github.com/dropwizard/metrics -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles RoaringBitmap. Copyright: (c) 2013-... the RoaringBitmap authors @@ -572,7 +559,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Eclipse Microprofile OpenAPI. +This product bundles Eclipse MicroProfile OpenAPI. Copyright: Copyright (c) 2017 Contributors to the Eclipse Foundation Project URL: https://github.com/microprofile/microprofile-open-api @@ -585,6 +572,7 @@ This product bundles Luben Zstd. Copyright: Copyright (c) 2015-present, Luben Karavelov/ All rights reserved. Project URL: https://github.com/luben/zstd-jni/ License: BSD 2-Clause + | Zstd-jni: JNI bindings to Zstd Library | | Copyright (c) 2015-present, Luben Karavelov/ All rights reserved. @@ -614,242 +602,423 @@ License: BSD 2-Clause -------------------------------------------------------------------------------- -This product bundles Google Cloud BigQuery Client for Java. 
- -Project URL: https://github.com/googleapis/java-bigquery -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Credentials for Java. - -Project URL: https://github.com/aliyun/credentials-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Tea for Java. - -Project URL: https://github.com/aliyun/tea-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Gson. - -Project URL: https://github.com/google/gson/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google API Common. - -License: BSD 3-Clause -| Copyright 2016, Google Inc. -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. 
-| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This product bundles Google Http Client. - -Project URL: https://www.google.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenCensus. - -Project URL: https://github.com/census-instrumentation/opencensus-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JaCoCo. - -Project URL: https://github.com/jacoco/jacoco/ -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles JAXB. - -Project URL: http://jaxb.java.net/ -License: CDDL 1.1 - https://glassfish.java.net/public/CDDL+GPL_1_1.html - --------------------------------------------------------------------------------- - -This product bundles Okio. 
- -Project URL: https://github.com/square/okio -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OkHttp. - -Project URL: https://github.com/square/okhttp -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Auto Value. - -Project URL: https://github.com/google/auto/tree/master/value -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles gRPC. - -Project URL: https://github.com/grpc/grpc-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Netty. - -Project URL: https://netty.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google APIs. - -Project URL: https://github.com/googleapis/googleapis -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud APIs for Java. - -Project URL: https://github.com/googleapis/google-cloud-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts -License: Eclipse Public License v. 
2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles javax.annotation-api. - -Project URL: https://javaee.github.io/glassfish -Project URL: http://jcp.org/en/jsr/detail?id=250 -License: CDDL - https://github.com/javaee/javax.annotation/blob/master/LICENSE - --------------------------------------------------------------------------------- - -This product bundles JSpecify. - -Project URL: https://github.com/jspecify/jspecify -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Animal Sniffer Annotations. - -License: MIT -| The MIT License -| -| Copyright (c) 2009 codehaus.org. -| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. 
- --------------------------------------------------------------------------------- - -This product bundles Android Annotations. - -Project URL: http://source.android.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php -This product bundles Conscrypt (openjdk-uber). - -Project URL: https://conscrypt.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Perfmark. - -Project URL: https://github.com/perfmark/perfmark -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles org.json. - -Project URL: https://github.com/douglascrockford/JSON-java -License: Public Domain - https://github.com/stleary/JSON-java/blob/master/LICENSE - --------------------------------------------------------------------------------- - -This product bundles Apache Arrow. - -Project URL: https://github.com/apache/arrow -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| - Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. 
+| +| - Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| - Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- -This product bundles Google flatbuffers. +This product bundles the Mozilla Public Suffix List, distributed by Apache HttpComponents. -Project URL: https://github.com/google/flatbuffers -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenTelemetry. 
+Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ -Project URL: https://opentelemetry.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. "Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. "Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. "Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. 
"Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. "Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. 
Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. 
Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. 
Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. 
Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. 
However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. 
The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. 
Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. 
Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. -------------------------------------------------------------------------------- -This product bundles Kotlin. 
- -Project URL: https://github.com/JetBrains/kotlin -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 diff --git a/flink/v2.1/flink-runtime/NOTICE b/flink/v2.1/flink-runtime/NOTICE index 72916788b5e4..61b02129d0e1 100644 --- a/flink/v2.1/flink-runtime/NOTICE +++ b/flink/v2.1/flink-runtime/NOTICE @@ -356,128 +356,6 @@ This product bundles Eclipse Microprofile OpenAPI with the following in its NOTI -------------------------------------------------------------------------------- -This product bundles gRPC with the following in its NOTICE file: -| Copyright 2014 The gRPC Authors -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. -| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. 
-| -| ----------------------------------------------------------------------- -| -| This product contains a modified portion of 'OkHttp', an open source -| HTTP & SPDY client for Android and Java applications, which can be obtained -| at: -| -| * LICENSE: -| * okhttp/third_party/okhttp/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/square/okhttp -| * LOCATION_IN_GRPC: -| * okhttp/third_party/okhttp -| -| This product contains a modified portion of 'Envoy', an open source -| cloud-native high-performance edge/middle/service proxy, which can be -| obtained at: -| -| * LICENSE: -| * xds/third_party/envoy/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/envoy/NOTICE -| * HOMEPAGE: -| * https://www.envoyproxy.io -| * LOCATION_IN_GRPC: -| * xds/third_party/envoy -| -| This product contains a modified portion of 'protoc-gen-validate (PGV)', -| an open source protoc plugin to generate polyglot message validators, -| which can be obtained at: -| -| * LICENSE: -| * xds/third_party/protoc-gen-validate/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/protoc-gen-validate/NOTICE -| * HOMEPAGE: -| * https://github.com/envoyproxy/protoc-gen-validate -| * LOCATION_IN_GRPC: -| * xds/third_party/protoc-gen-validate -| -| This product contains a modified portion of 'udpa', -| an open source universal data plane API, which can be obtained at: -| -| * LICENSE: -| * xds/third_party/udpa/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/cncf/udpa -| * LOCATION_IN_GRPC: -| * xds/third_party/udpa - --------------------------------------------------------------------------------- - -This product bundles Netty with the following in its NOTICE file: -| -| The Netty Project -| ================= -| -| Please visit the Netty web site for more information: -| -| * http://netty.io/ -| -| Copyright 2016 The Netty Project -| -| The Netty Project licenses this file to you under the Apache License, -| version 2.0 (the "License"); you 
may not use this file except in compliance -| with the License. You may obtain a copy of the License at: -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -| WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -| License for the specific language governing permissions and limitations -| under the License. -| -| ------------------------------------------------------------------------------- -| This product contains a forked and modified version of Tomcat Native -| -| * LICENSE: -| * license/LICENSE.tomcat-native.txt (Apache License 2.0) -| * HOMEPAGE: -| * http://tomcat.apache.org/native-doc/ -| * https://svn.apache.org/repos/asf/tomcat/native/ -| -| This product contains the Maven wrapper scripts from 'Maven Wrapper', that provides an easy way to ensure a user has everything necessary to run the Maven build. -| -| * LICENSE: -| * license/LICENSE.mvn-wrapper.txt (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/takari/maven-wrapper -| -| This product contains small piece of code to support AIX, taken from netbsd. -| -| * LICENSE: -| * license/LICENSE.aix-netbsd.txt (OpenSSL License) -| * HOMEPAGE: -| * https://ftp.netbsd.org/pub/NetBSD/NetBSD-current/src/crypto/external/bsd/openssl/dist -| -| -| This product contains code from boringssl. 
-| -| * LICENSE (Combination ISC and OpenSSL license) -| * license/LICENSE.boringssl.txt (Combination ISC and OpenSSL license) -| * HOMEPAGE: -| * https://boringssl.googlesource.com/boringssl/ - --------------------------------------------------------------------------------- - This product bundles Jackson JSON Processor with the following in its NOTICE file: | # Jackson JSON processor | diff --git a/flink/v2.1/flink-runtime/baseline-class-uniqueness.lock b/flink/v2.1/flink-runtime/baseline-class-uniqueness.lock deleted file mode 100644 index 7868296a79e8..000000000000 --- a/flink/v2.1/flink-runtime/baseline-class-uniqueness.lock +++ /dev/null @@ -1,60 +0,0 @@ -# Danger! Multiple jars contain identically named classes. This may cause different behaviour depending on classpath ordering. -# Run ./gradlew checkClassUniqueness --fix to update this file - -## runtimeClasspath -[com.google.protobuf:protobuf-java, dev.vortex:vortex-jni] - - com.google.protobuf.BoolValue - - com.google.protobuf.BoolValue$1 - - com.google.protobuf.BoolValue$Builder - - com.google.protobuf.BoolValueOrBuilder - - com.google.protobuf.BytesValue - - com.google.protobuf.BytesValue$1 - - com.google.protobuf.BytesValue$Builder - - com.google.protobuf.BytesValueOrBuilder - - com.google.protobuf.DoubleValue - - com.google.protobuf.DoubleValue$1 - - com.google.protobuf.DoubleValue$Builder - - com.google.protobuf.DoubleValueOrBuilder - - com.google.protobuf.FloatValue - - com.google.protobuf.FloatValue$1 - - com.google.protobuf.FloatValue$Builder - - com.google.protobuf.FloatValueOrBuilder - - com.google.protobuf.Int32Value - - com.google.protobuf.Int32Value$1 - - com.google.protobuf.Int32Value$Builder - - com.google.protobuf.Int32ValueOrBuilder - - com.google.protobuf.Int64Value - - com.google.protobuf.Int64Value$1 - - com.google.protobuf.Int64Value$Builder - - com.google.protobuf.Int64ValueOrBuilder - - com.google.protobuf.ListValue - - com.google.protobuf.ListValue$1 - - 
com.google.protobuf.ListValue$Builder - - com.google.protobuf.ListValueOrBuilder - - com.google.protobuf.NullValue - - com.google.protobuf.NullValue$1 - - com.google.protobuf.StringValue - - com.google.protobuf.StringValue$1 - - com.google.protobuf.StringValue$Builder - - com.google.protobuf.StringValueOrBuilder - - com.google.protobuf.Struct - - com.google.protobuf.Struct$1 - - com.google.protobuf.Struct$Builder - - com.google.protobuf.Struct$Builder$FieldsConverter - - com.google.protobuf.Struct$FieldsDefaultEntryHolder - - com.google.protobuf.StructOrBuilder - - com.google.protobuf.StructProto - - com.google.protobuf.UInt32Value - - com.google.protobuf.UInt32Value$1 - - com.google.protobuf.UInt32Value$Builder - - com.google.protobuf.UInt32ValueOrBuilder - - com.google.protobuf.UInt64Value - - com.google.protobuf.UInt64Value$1 - - com.google.protobuf.UInt64Value$Builder - - com.google.protobuf.UInt64ValueOrBuilder - - com.google.protobuf.Value - - com.google.protobuf.Value$1 - - com.google.protobuf.Value$Builder - - com.google.protobuf.Value$KindCase - - com.google.protobuf.ValueOrBuilder - - com.google.protobuf.WrappersProto diff --git a/flink/v2.1/flink-runtime/runtime-deps.txt b/flink/v2.1/flink-runtime/runtime-deps.txt new file mode 100644 index 000000000000..2eba0b183fb9 --- /dev/null +++ b/flink/v2.1/flink-runtime/runtime-deps.txt @@ -0,0 +1,47 @@ +com.fasterxml.jackson.core:jackson-annotations:2.21 +com.fasterxml.jackson.core:jackson-core:2.21.3 +com.fasterxml.jackson.core:jackson-databind:2.21.3 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.21.3 +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.github.luben:zstd-jni:1.5.7-3 +com.google.errorprone:error_prone_annotations:2.41.0 +com.google.flatbuffers:flatbuffers-java:25.2.10 +com.google.guava:failureaccess:1.0.3 +com.google.guava:guava:33.5.0-jre +com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava +com.google.j2objc:j2objc-annotations:3.1 +dev.failsafe:failsafe:3.3.2 
+dev.vortex:vortex-jni:0.67.0 +io.airlift:aircompressor:2.0.3 +io.netty:netty-buffer:4.2.7.Final +io.netty:netty-common:4.2.7.Final +org.apache.arrow:arrow-c-data:18.3.0 +org.apache.arrow:arrow-format:18.3.0 +org.apache.arrow:arrow-memory-core:18.3.0 +org.apache.arrow:arrow-memory-netty-buffer-patch:18.3.0 +org.apache.arrow:arrow-memory-netty:18.3.0 +org.apache.arrow:arrow-vector:18.3.0 +org.apache.avro:avro:1.12.1 +org.apache.datasketches:datasketches-java:6.2.0 +org.apache.datasketches:datasketches-memory:3.0.2 +org.apache.httpcomponents.client5:httpclient5:5.6.1 +org.apache.httpcomponents.core5:httpcore5-h2:5.4 +org.apache.httpcomponents.core5:httpcore5:5.4 +org.apache.orc:orc-core:1.9.8 +org.apache.orc:orc-shims:1.9.8 +org.apache.parquet:parquet-avro:1.17.0 +org.apache.parquet:parquet-column:1.17.0 +org.apache.parquet:parquet-common:1.17.0 +org.apache.parquet:parquet-encoding:1.17.0 +org.apache.parquet:parquet-format-structures:1.17.0 +org.apache.parquet:parquet-hadoop:1.17.0 +org.apache.parquet:parquet-jackson:1.17.0 +org.apache.parquet:parquet-variant:1.17.0 +org.checkerframework:checker-qual:3.19.0 +org.eclipse.microprofile.openapi:microprofile-openapi-api:4.1.1 +org.jspecify:jspecify:1.0.0 +org.locationtech.jts:jts-core:1.20.0 +org.projectnessie.nessie:nessie-client:0.107.5 +org.projectnessie.nessie:nessie-model:0.107.5 +org.roaringbitmap:RoaringBitmap:1.6.14 +org.threeten:threeten-extra:1.7.1 diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java index e0672811cf5f..7661372c88e8 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java @@ -44,7 +44,7 @@ public FlinkConfParser(Table table, Map options, ReadableConfig this.readableConfig = readableConfig; } - FlinkConfParser(Map options, ReadableConfig readableConfig) { + public 
FlinkConfParser(Map options, ReadableConfig readableConfig) { this.tableProperties = ImmutableMap.of(); this.options = options; this.readableConfig = readableConfig; diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java index 377811247233..bfcd34d0b999 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java @@ -138,11 +138,17 @@ public Type visit(TimeType timeType) { @Override public Type visit(TimestampType timestampType) { + if (timestampType.getPrecision() > 6) { + return Types.TimestampNanoType.withoutZone(); + } return Types.TimestampType.withoutZone(); } @Override public Type visit(LocalZonedTimestampType localZonedTimestampType) { + if (localZonedTimestampType.getPrecision() > 6) { + return Types.TimestampNanoType.withZone(); + } return Types.TimestampType.withZone(); } diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java index 3ef611f2ded5..920e44b24b31 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java @@ -114,19 +114,35 @@ private static PositionalGetter buildGetter(LogicalType logicalType, Type typ case TIMESTAMP_WITHOUT_TIME_ZONE: TimestampType timestampType = (TimestampType) logicalType; - return (row, pos) -> { - LocalDateTime localDateTime = - row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); - return DateTimeUtil.microsFromTimestamp(localDateTime); - }; + if (type.typeId() == Type.TypeID.TIMESTAMP_NANO) { + return (row, pos) -> { + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + return 
DateTimeUtil.nanosFromTimestamp(localDateTime); + }; + } else { + return (row, pos) -> { + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + return DateTimeUtil.microsFromTimestamp(localDateTime); + }; + } case TIMESTAMP_WITH_LOCAL_TIME_ZONE: LocalZonedTimestampType lzTs = (LocalZonedTimestampType) logicalType; - return (row, pos) -> { - TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); - return timestampData.getMillisecond() * 1000 - + timestampData.getNanoOfMillisecond() / 1000; - }; + if (type.typeId() == Type.TypeID.TIMESTAMP_NANO) { + return (row, pos) -> { + TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); + return timestampData.getMillisecond() * 1_000_000L + + timestampData.getNanoOfMillisecond(); + }; + } else { + return (row, pos) -> { + TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); + return timestampData.getMillisecond() * 1000L + + timestampData.getNanoOfMillisecond() / 1000; + }; + } case ROW: RowType rowType = (RowType) logicalType; diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java index 65b9d44ad4b8..77f16bfdb2ab 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java @@ -70,7 +70,7 @@ public OrcValueReader record( TypeDescription record, List names, List> fields) { - return FlinkOrcReaders.struct(fields, iStruct, idToConstant); + return FlinkOrcReaders.struct(record, fields, iStruct, idToConstant); } @Override @@ -112,6 +112,13 @@ public OrcValueReader primitive(Type.PrimitiveType iPrimitive, TypeDescriptio } else { return FlinkOrcReaders.timestamps(); } + case TIMESTAMP_NANO: + Types.TimestampNanoType timestampNanoType = (Types.TimestampNanoType) iPrimitive; + if 
(timestampNanoType.shouldAdjustToUTC()) { + return FlinkOrcReaders.timestampTzs(); + } else { + return FlinkOrcReaders.timestamps(); + } case STRING: return FlinkOrcReaders.strings(); case UUID: diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java index 7a4a15c7e600..c5c958fbdb04 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java @@ -39,6 +39,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; +import org.apache.orc.TypeDescription; import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; import org.apache.orc.storage.ql.exec.vector.ColumnVector; import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; @@ -91,8 +92,11 @@ public static OrcValueReader map( } public static OrcValueReader struct( - List> readers, Types.StructType struct, Map idToConstant) { - return new StructReader(readers, struct, idToConstant); + TypeDescription record, + List> readers, + Types.StructType struct, + Map idToConstant) { + return new StructReader(record, readers, struct, idToConstant); } private static class StringReader implements OrcValueReader { @@ -265,8 +269,11 @@ private static class StructReader extends OrcValueReaders.StructReader private final int numFields; StructReader( - List> readers, Types.StructType struct, Map idToConstant) { - super(readers, struct, idToConstant); + TypeDescription record, + List> readers, + Types.StructType struct, + Map idToConstant) { + super(record, readers, struct, idToConstant); this.numFields = struct.fields().size(); } diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java 
b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java index a467d848337d..c1b46252e18a 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java @@ -145,6 +145,13 @@ public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, LogicalType fl } else { return FlinkOrcWriters.timestamps(); } + case TIMESTAMP_NANO: + Types.TimestampNanoType timestampNanoType = (Types.TimestampNanoType) iPrimitive; + if (timestampNanoType.shouldAdjustToUTC()) { + return FlinkOrcWriters.timestampNanoTzs(); + } else { + return FlinkOrcWriters.timestampNanos(); + } case STRING: return FlinkOrcWriters.strings(); case UUID: diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java index 684842aa099c..bf19a46c05fb 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java @@ -70,6 +70,14 @@ static OrcValueWriter timestampTzs() { return TimestampTzWriter.INSTANCE; } + static OrcValueWriter timestampNanos() { + return TimestampNanoWriter.INSTANCE; + } + + static OrcValueWriter timestampNanoTzs() { + return TimestampNanoTzWriter.INSTANCE; + } + static OrcValueWriter decimals(int precision, int scale) { if (precision <= 18) { return new Decimal18Writer(precision, scale); @@ -170,6 +178,35 @@ public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { } } + private static class TimestampNanoWriter implements OrcValueWriter { + private static final TimestampNanoWriter INSTANCE = new TimestampNanoWriter(); + + @Override + public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { + TimestampColumnVector cv = (TimestampColumnVector) output; + cv.setIsUTC(true); + // 
millis + OffsetDateTime offsetDateTime = data.toInstant().atOffset(ZoneOffset.UTC); + cv.time[rowId] = + offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; + cv.nanos[rowId] = offsetDateTime.getNano(); + } + } + + private static class TimestampNanoTzWriter implements OrcValueWriter { + private static final TimestampNanoTzWriter INSTANCE = new TimestampNanoTzWriter(); + + @SuppressWarnings("JavaInstantGetSecondsGetNano") + @Override + public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { + TimestampColumnVector cv = (TimestampColumnVector) output; + // millis + Instant instant = data.toInstant(); + cv.time[rowId] = instant.toEpochMilli(); + cv.nanos[rowId] = instant.getNano(); + } + } + private static class Decimal18Writer implements OrcValueWriter { private final int precision; private final int scale; diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java index f23a7ee3d0d3..81bb55967992 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java @@ -69,6 +69,8 @@ public static Object convertConstant(Type type, Object value) { return (int) ((Long) value / 1000); case TIMESTAMP: // TimestampData return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromMicros((Long) value)); + case TIMESTAMP_NANO: + return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromNanos((Long) value)); case UUID: return UUIDUtil.convert((UUID) value); default: diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java index b93e4346a47a..40a2d91f87f8 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java +++ 
b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java @@ -49,6 +49,7 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; +import org.apache.iceberg.util.DateTimeUtil; @Internal public class StructRowData implements RowData { @@ -121,8 +122,8 @@ public int getInt(int pos) { if (integer instanceof Integer) { return (int) integer; - } else if (integer instanceof LocalDate) { - return (int) ((LocalDate) integer).toEpochDay(); + } else if (integer instanceof LocalDate localDate) { + return (int) localDate.toEpochDay(); } else if (integer instanceof LocalTime) { return (int) (((LocalTime) integer).toNanoOfDay() / 1000_000); } else { @@ -186,8 +187,27 @@ private BigDecimal getDecimalInternal(int pos) { @Override public TimestampData getTimestamp(int pos, int precision) { + if (precision > 6) { + Object timeVal = struct.get(pos, Object.class); + if (timeVal instanceof OffsetDateTime) { + OffsetDateTime odt = (OffsetDateTime) timeVal; + return TimestampData.fromEpochMillis( + odt.toInstant().toEpochMilli(), odt.getNano() % 1_000_000); + } else if (timeVal instanceof LocalDateTime) { + LocalDateTime ldt = (LocalDateTime) timeVal; + return TimestampData.fromEpochMillis( + ldt.toInstant(ZoneOffset.UTC).toEpochMilli(), ldt.getNano() % 1_000_000); + } else if (timeVal instanceof Long) { + long timeLong = (Long) timeVal; + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1_000_000L), (int) Math.floorMod(timeLong, 1_000_000L)); + } else { + throw new IllegalStateException("Unknown type for timestamp_ns: " + timeVal.getClass()); + } + } long timeLong = getLong(pos); - return TimestampData.fromEpochMillis(timeLong / 1000, (int) (timeLong % 1000) * 1000); + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1000L), (int) Math.floorMod(timeLong, 1000L) * 1000); } @Override @@ -263,9 +283,29 @@ private Object convertValue(Type elementType, Object value) { case 
DECIMAL: return value; case TIMESTAMP: - long millisecond = (long) value / 1000; - int nanoOfMillisecond = (int) ((Long) value % 1000) * 1000; - return TimestampData.fromEpochMillis(millisecond, nanoOfMillisecond); + long timeMicros; + if (value instanceof LocalDateTime localDateTime) { + timeMicros = DateTimeUtil.microsFromTimestamp(localDateTime); + } else if (value instanceof OffsetDateTime offsetDateTime) { + timeMicros = DateTimeUtil.microsFromTimestamptz(offsetDateTime); + } else { + timeMicros = (Long) value; + } + // Split with floorDiv/floorMod so sub-millisecond digits and pre-epoch values survive. + return TimestampData.fromEpochMillis( + Math.floorDiv(timeMicros, 1000L), + (int) Math.floorMod(timeMicros, 1000L) * 1000); case TIMESTAMP_NANO: + long nanoLong; + if (value instanceof LocalDateTime localDateTime) { + nanoLong = DateTimeUtil.nanosFromTimestamp(localDateTime); + } else if (value instanceof OffsetDateTime offsetDateTime) { + nanoLong = DateTimeUtil.nanosFromTimestamptz(offsetDateTime); + } else { + nanoLong = (Long) value; + } + return TimestampData.fromEpochMillis( + Math.floorDiv(nanoLong, 1_000_000L), (int) Math.floorMod(nanoLong, 1_000_000L)); case STRING: return StringData.fromString(value.toString()); case FIXED: diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java new file mode 100644 index 000000000000..0f70e60a1b9f --- /dev/null +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java @@ -0,0 +1,303 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro; + +import java.io.Serializable; +import java.lang.reflect.Array; +import java.nio.ByteBuffer; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.temporal.ChronoField; +import java.util.List; +import java.util.Map; +import org.apache.avro.generic.GenericFixed; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.utils.LogicalTypeUtils; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +/** + * Tool class used to 
convert from Avro {@link GenericRecord} to {@link RowData}. + * + *

    This class is adapted in Iceberg to add support for nanosecond precision timestamps + * (FLINK-39251). Once that ticket is resolved in Flink, this custom converter may be removed. + */ +@Internal +public class AvroToRowDataConverters { + + private AvroToRowDataConverters() {} + + /** + * Runtime converter that converts Avro data structures into objects of Flink Table & SQL + * internal data structures. + */ + @FunctionalInterface + public interface AvroToRowDataConverter extends Serializable { + Object convert(Object object); + } + + // ------------------------------------------------------------------------------------- + // Runtime Converters + // ------------------------------------------------------------------------------------- + + public static AvroToRowDataConverter createRowConverter(RowType rowType) { + return createRowConverter(rowType, true); + } + + public static AvroToRowDataConverter createRowConverter( + RowType rowType, boolean legacyTimestampMapping) { + final AvroToRowDataConverter[] fieldConverters = + rowType.getFields().stream() + .map(RowType.RowField::getType) + .map(type -> createNullableConverter(type, legacyTimestampMapping)) + .toArray(AvroToRowDataConverter[]::new); + final int arity = rowType.getFieldCount(); + + return avroObject -> { + IndexedRecord record = (IndexedRecord) avroObject; + GenericRowData row = new GenericRowData(arity); + for (int i = 0; i < arity; ++i) { + // avro always deserialize successfully even though the type isn't matched + // so no need to throw exception about which field can't be deserialized + row.setField(i, fieldConverters[i].convert(record.get(i))); + } + return row; + }; + } + + /** Creates a runtime converter which is null safe. 
*/ +private static AvroToRowDataConverter createNullableConverter( + LogicalType type, boolean legacyTimestampMapping) { + final AvroToRowDataConverter converter = createConverter(type, legacyTimestampMapping); + return avroObject -> { + if (avroObject == null) { + return null; + } + return converter.convert(avroObject); + }; + } + + /** Creates a runtime converter which assumes the input object is not null. */ + private static AvroToRowDataConverter createConverter( + LogicalType type, boolean legacyTimestampMapping) { + switch (type.getTypeRoot()) { + case NULL: + return avroObject -> null; + case TINYINT: + return avroObject -> ((Integer) avroObject).byteValue(); + case SMALLINT: + return avroObject -> ((Integer) avroObject).shortValue(); + case BOOLEAN: // boolean + case INTEGER: // int + case INTERVAL_YEAR_MONTH: // long + case BIGINT: // long + case INTERVAL_DAY_TIME: // long + case FLOAT: // float + case DOUBLE: // double + return avroObject -> avroObject; + case DATE: + return AvroToRowDataConverters::convertToDate; + case TIME_WITHOUT_TIME_ZONE: + return AvroToRowDataConverters::convertToTime; + case TIMESTAMP_WITHOUT_TIME_ZONE: + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return avroObject -> convertToTimestamp(avroObject, type); + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + if (legacyTimestampMapping) { + throw new UnsupportedOperationException("Unsupported type: " + type); + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return avroObject -> convertToTimestamp(avroObject, type); + } + case CHAR: + case VARCHAR: + return avroObject -> StringData.fromString(avroObject.toString()); + case BINARY: + case VARBINARY: + return AvroToRowDataConverters::convertToBytes; + case DECIMAL: + return createDecimalConverter((DecimalType) type); + case ARRAY: + return createArrayConverter((ArrayType) type, legacyTimestampMapping); + case ROW: + return createRowConverter((RowType) type, legacyTimestampMapping); + case MAP: + case MULTISET: +
return createMapConverter(type, legacyTimestampMapping); + case RAW: + default: + throw new UnsupportedOperationException("Unsupported type: " + type); + } + } + + private static AvroToRowDataConverter createDecimalConverter(DecimalType decimalType) { + final int precision = decimalType.getPrecision(); + final int scale = decimalType.getScale(); + return avroObject -> { + final byte[] bytes; + if (avroObject instanceof GenericFixed) { + bytes = ((GenericFixed) avroObject).bytes(); + } else if (avroObject instanceof ByteBuffer) { + ByteBuffer byteBuffer = (ByteBuffer) avroObject; + bytes = new byte[byteBuffer.remaining()]; + byteBuffer.get(bytes); + } else { + bytes = (byte[]) avroObject; + } + return DecimalData.fromUnscaledBytes(bytes, precision, scale); + }; + } + + private static AvroToRowDataConverter createArrayConverter( + ArrayType arrayType, boolean legacyTimestampMapping) { + final AvroToRowDataConverter elementConverter = + createNullableConverter(arrayType.getElementType(), legacyTimestampMapping); + final Class elementClass = + LogicalTypeUtils.toInternalConversionClass(arrayType.getElementType()); + + return avroObject -> { + final List list = (List) avroObject; + final int length = list.size(); + final Object[] array = (Object[]) Array.newInstance(elementClass, length); + for (int i = 0; i < length; ++i) { + array[i] = elementConverter.convert(list.get(i)); + } + return new GenericArrayData(array); + }; + } + + private static AvroToRowDataConverter createMapConverter( + LogicalType type, boolean legacyTimestampMapping) { + final AvroToRowDataConverter keyConverter = + createConverter(DataTypes.STRING().getLogicalType(), legacyTimestampMapping); + final AvroToRowDataConverter valueConverter = + createNullableConverter( + AvroSchemaConverter.extractValueTypeToAvroMap(type), legacyTimestampMapping); + + return avroObject -> { + final Map map = (Map) avroObject; + Map result = Maps.newHashMap(); + for (Map.Entry entry : map.entrySet()) { + Object key = 
keyConverter.convert(entry.getKey()); + Object value = valueConverter.convert(entry.getValue()); + result.put(key, value); + } + return new GenericMapData(result); + }; + } + + private static TimestampData convertToTimestamp(Object object, LogicalType type) { + int precision = 3; + if (type instanceof org.apache.flink.table.types.logical.TimestampType) { + precision = ((org.apache.flink.table.types.logical.TimestampType) type).getPrecision(); + } else if (type instanceof org.apache.flink.table.types.logical.LocalZonedTimestampType) { + precision = + ((org.apache.flink.table.types.logical.LocalZonedTimestampType) type).getPrecision(); + } + + if (object instanceof Long) { + long timeLong = (Long) object; + if (precision <= 3) { + return TimestampData.fromEpochMillis(timeLong); + } else if (precision <= 6) { + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1000L), (int) Math.floorMod(timeLong, 1000L) * 1000); + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1_000_000L), (int) Math.floorMod(timeLong, 1_000_000L)); + } + } else if (object instanceof Instant) { + return TimestampData.fromInstant((Instant) object); + } else if (object instanceof LocalDateTime) { + return TimestampData.fromLocalDateTime((LocalDateTime) object); + } else { + JodaConverter jodaConverter = JodaConverter.getConverter(); + if (jodaConverter != null) { + return TimestampData.fromEpochMillis(jodaConverter.convertTimestamp(object)); + } else { + throw new IllegalArgumentException( + "Unexpected object type for TIMESTAMP logical type.
Received: " + object); + } + } + } + + private static int convertToDate(Object object) { + if (object instanceof Integer) { + return (Integer) object; // Integer input is passed through unchanged + } else if (object instanceof LocalDate) { + return (int) ((LocalDate) object).toEpochDay(); + } else { + JodaConverter jodaConverter = JodaConverter.getConverter(); + if (jodaConverter != null) { + return (int) jodaConverter.convertDate(object); // NOTE(review): long narrowed to int; confirm unit is days + } else { + throw new IllegalArgumentException( + "Unexpected object type for DATE logical type. Received: " + object); + } + } + } + + private static int convertToTime(Object object) { + final int millis; + if (object instanceof Integer) { + millis = (Integer) object; // Integer input is passed through unchanged + } else if (object instanceof LocalTime) { + millis = ((LocalTime) object).get(ChronoField.MILLI_OF_DAY); + } else { + JodaConverter jodaConverter = JodaConverter.getConverter(); + if (jodaConverter != null) { + millis = jodaConverter.convertTime(object); + } else { + throw new IllegalArgumentException( + "Unexpected object type for TIME logical type. Received: " + object); + } + } + return millis; + } + + private static byte[] convertToBytes(Object object) { + if (object instanceof GenericFixed) { + return ((GenericFixed) object).bytes(); + } else if (object instanceof ByteBuffer) { + ByteBuffer byteBuffer = (ByteBuffer) object; + byte[] bytes = new byte[byteBuffer.remaining()]; // sized to the unread portion only + byteBuffer.get(bytes); // copies the remaining bytes and advances the buffer position + return bytes; + } else { + return (byte[]) object; // any other input must already be a byte[] + } + } +} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java new file mode 100644 index 000000000000..c30b78023345 --- /dev/null +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro; + +import org.joda.time.DateTime; +import org.joda.time.DateTimeFieldType; +import org.joda.time.LocalDate; +import org.joda.time.LocalTime; + +/** + * Encapsulates joda optional dependency. Instantiates this class only if joda is available on the + * classpath. 
+ */ +@SuppressWarnings("JavaUtilDate") +class JodaConverter { + + private static JodaConverter instance; // stays null when the Joda probe below fails + private static boolean instantiated = false; // true once the classpath probe has run + + public static JodaConverter getConverter() { + if (instantiated) { // probe already ran; reuse its result (which may be null) + return instance; + } + + try { + Class.forName( + "org.joda.time.DateTime", false, Thread.currentThread().getContextClassLoader()); + instance = new JodaConverter(); + } catch (ClassNotFoundException e) { + instance = null; // Joda is absent: callers receive null + } finally { + instantiated = true; + } + return instance; + } + + public long convertDate(Object object) { + final LocalDate value = (LocalDate) object; + return value.toDate().getTime(); // java.util.Date#getTime, i.e. epoch milliseconds + } + + public int convertTime(Object object) { + final LocalTime value = (LocalTime) object; + return value.get(DateTimeFieldType.millisOfDay()); + } + + public long convertTimestamp(Object object) { + final DateTime value = (DateTime) object; + return value.toDate().getTime(); // java.util.Date#getTime, i.e. epoch milliseconds + } + + private JodaConverter() {} +} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java new file mode 100644 index 000000000000..d4c7e4282d6e --- /dev/null +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java @@ -0,0 +1,394 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.time.ZoneOffset; +import java.util.List; +import java.util.Map; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.util.Utf8; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.util.CollectionUtil; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +/** + * Tool class used to convert from {@link RowData} to Avro {@link GenericRecord}. + * + *

    This class is adapted in Iceberg to add support for nanosecond precision timestamps + * (FLINK-39251). Once that ticket is resolved in Flink, this custom converter may be removed. + */ +@Internal +public class RowDataToAvroConverters { + + private RowDataToAvroConverters() {} + + // -------------------------------------------------------------------------------- + // Runtime Converters + // -------------------------------------------------------------------------------- + + /** + * Runtime converter that converts objects of Flink Table & SQL internal data structures to + * corresponding Avro data structures. + */ + @FunctionalInterface + public interface RowDataToAvroConverter extends Serializable { + Object convert(Schema schema, Object object); + } + + // -------------------------------------------------------------------------------- + // IMPORTANT! We use anonymous classes instead of lambdas for a reason here. It is + // necessary because the maven shade plugin cannot relocate classes in + // SerializedLambdas (MSHADE-260). On the other hand we want to relocate Avro for + // sql-client uber jars. + // -------------------------------------------------------------------------------- + + /** + * Creates a runtime converter according to the given logical type that converts objects of Flink + * Table & SQL internal data structures to corresponding Avro data structures. 
+ */ + public static RowDataToAvroConverter createConverter(LogicalType type) { + return createConverter(type, true); + } + + @SuppressWarnings("checkstyle:MethodLength") + public static RowDataToAvroConverter createConverter( + LogicalType type, boolean legacyTimestampMapping) { + final RowDataToAvroConverter converter; + switch (type.getTypeRoot()) { + case NULL: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return null; + } + }; + break; + case TINYINT: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ((Byte) object).intValue(); + } + }; + break; + case SMALLINT: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ((Short) object).intValue(); + } + }; + break; + case BOOLEAN: // boolean + case INTEGER: // int + case INTERVAL_YEAR_MONTH: // long + case BIGINT: // long + case INTERVAL_DAY_TIME: // long + case FLOAT: // float + case DOUBLE: // double + case TIME_WITHOUT_TIME_ZONE: // int + case DATE: // int + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return object; + } + }; + break; + case CHAR: + case VARCHAR: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return new Utf8(object.toString()); + } + }; + break; + case BINARY: + case VARBINARY: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ByteBuffer.wrap((byte[]) object); + 
} + }; + break; + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + case TIMESTAMP_WITHOUT_TIME_ZONE: + final int tzPrecision; + if (type instanceof org.apache.flink.table.types.logical.TimestampType) { + tzPrecision = ((org.apache.flink.table.types.logical.TimestampType) type).getPrecision(); + } else { + tzPrecision = 3; + } + if (legacyTimestampMapping) { + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + TimestampData timestampData = (TimestampData) object; + if (tzPrecision <= 3) { + return timestampData.getMillisecond(); + } else if (tzPrecision <= 6) { + return timestampData.getMillisecond() * 1000L + + timestampData.getNanoOfMillisecond() / 1000; + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return timestampData.getMillisecond() * 1_000_000L + + timestampData.getNanoOfMillisecond(); + } + } + }; + } else { + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + TimestampData timestampData = (TimestampData) object; + java.time.Instant instant = + timestampData.toLocalDateTime().toInstant(ZoneOffset.UTC); + if (tzPrecision <= 3) { + return instant.toEpochMilli(); + } else if (tzPrecision <= 6) { + return instant.getEpochSecond() * 1_000_000L + instant.getNano() / 1000; + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return instant.getEpochSecond() * 1_000_000_000L + instant.getNano(); + } + } + }; + } + break; + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + final int ltzPrecision; + if (type instanceof org.apache.flink.table.types.logical.LocalZonedTimestampType) { + ltzPrecision = + ((org.apache.flink.table.types.logical.LocalZonedTimestampType) type).getPrecision(); + } else { + 
ltzPrecision = 3; + } + if (legacyTimestampMapping) { + throw new UnsupportedOperationException("Unsupported type: " + type); + } else { + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + TimestampData timestampData = (TimestampData) object; + if (ltzPrecision <= 3) { + return timestampData.getMillisecond(); + } else if (ltzPrecision <= 6) { + return timestampData.getMillisecond() * 1000L + + timestampData.getNanoOfMillisecond() / 1000; + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return timestampData.getMillisecond() * 1_000_000L + + timestampData.getNanoOfMillisecond(); + } + } + }; + } + break; + case DECIMAL: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ByteBuffer.wrap(((DecimalData) object).toUnscaledBytes()); + } + }; + break; + case ARRAY: + converter = createArrayConverter((ArrayType) type, legacyTimestampMapping); + break; + case ROW: + converter = createRowConverter((RowType) type, legacyTimestampMapping); + break; + case MAP: + case MULTISET: + converter = createMapConverter(type, legacyTimestampMapping); + break; + case RAW: + default: + throw new UnsupportedOperationException("Unsupported type: " + type); + } + + // wrap into nullable converter + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + if (object == null) { + return null; + } + + // get actual schema if it is a nullable schema + Schema actualSchema; + if (schema.getType() == Schema.Type.UNION) { + List types = schema.getTypes(); + int size = types.size(); + if (size == 2 && types.get(1).getType() == Schema.Type.NULL) { + actualSchema = types.get(0); + } else if (size == 2 && types.get(0).getType() == 
Schema.Type.NULL) { + actualSchema = types.get(1); + } else { + throw new IllegalArgumentException( + "The Avro schema is not a nullable type: " + schema.toString()); + } + } else { + actualSchema = schema; + } + return converter.convert(actualSchema, object); + } + }; + } + + private static RowDataToAvroConverter createRowConverter( + RowType rowType, boolean legacyTimestampMapping) { + final RowDataToAvroConverter[] fieldConverters = + rowType.getChildren().stream() + .map(legacyType -> createConverter(legacyType, legacyTimestampMapping)) + .toArray(RowDataToAvroConverter[]::new); + final LogicalType[] fieldTypes = + rowType.getFields().stream().map(RowType.RowField::getType).toArray(LogicalType[]::new); + final RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[fieldTypes.length]; + for (int i = 0; i < fieldTypes.length; i++) { + fieldGetters[i] = RowData.createFieldGetter(fieldTypes[i], i); + } + final int length = rowType.getFieldCount(); + + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + final RowData row = (RowData) object; + final List fields = schema.getFields(); + final GenericRecord record = new GenericData.Record(schema); + for (int i = 0; i < length; ++i) { + final Schema.Field schemaField = fields.get(i); + try { + Object avroObject = + fieldConverters[i].convert( + schemaField.schema(), fieldGetters[i].getFieldOrNull(row)); + record.put(i, avroObject); + } catch (Throwable t) { + throw new RuntimeException( + String.format("Fail to serialize at field: %s.", schemaField.name()), t); + } + } + return record; + } + }; + } + + private static RowDataToAvroConverter createArrayConverter( + ArrayType arrayType, boolean legacyTimestampMapping) { + LogicalType elementType = arrayType.getElementType(); + final ArrayData.ElementGetter elementGetter = ArrayData.createElementGetter(elementType); + final RowDataToAvroConverter 
elementConverter = + createConverter(arrayType.getElementType(), legacyTimestampMapping); + + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + final Schema elementSchema = schema.getElementType(); + ArrayData arrayData = (ArrayData) object; + List list = Lists.newArrayList(); + for (int i = 0; i < arrayData.size(); ++i) { + list.add( + elementConverter.convert( + elementSchema, elementGetter.getElementOrNull(arrayData, i))); + } + return list; + } + }; + } + + private static RowDataToAvroConverter createMapConverter( + LogicalType type, boolean legacyTimestampMapping) { + LogicalType valueType = AvroSchemaConverter.extractValueTypeToAvroMap(type); + final ArrayData.ElementGetter valueGetter = ArrayData.createElementGetter(valueType); + final RowDataToAvroConverter valueConverter = + createConverter(valueType, legacyTimestampMapping); + + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + final Schema valueSchema = schema.getValueType(); + final MapData mapData = (MapData) object; + final ArrayData keyArray = mapData.keyArray(); + final ArrayData valueArray = mapData.valueArray(); + final Map map = CollectionUtil.newHashMapWithExpectedSize(mapData.size()); + for (int i = 0; i < mapData.size(); ++i) { + final String key = keyArray.getString(i).toString(); + final Object value = + valueConverter.convert(valueSchema, valueGetter.getElementOrNull(valueArray, i)); + map.put(key, value); + } + return map; + } + }; + } +} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java new file mode 100644 index 000000000000..fb77c124e504 --- /dev/null +++ 
b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java @@ -0,0 +1,625 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro.typeutils; + +import java.util.List; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.apache.avro.SchemaParseException; +import org.apache.avro.specific.SpecificData; +import org.apache.avro.specific.SpecificRecord; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.formats.avro.AvroRowDataDeserializationSchema; +import org.apache.flink.formats.avro.AvroRowDataSerializationSchema; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.legacy.types.logical.TypeInformationRawType; +import org.apache.flink.table.types.AtomicDataType; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.IntType; +import 
org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeFamily; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.MultisetType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimeType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.types.Row; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * Converts an Avro schema into Flink's type information. It uses {@link RowTypeInfo} for + * representing objects and converts Avro types into types that are compatible with Flink's Table + * & SQL API. + * + *

    Note: Changes in this class need to be kept in sync with the corresponding runtime classes + * {@link AvroRowDataDeserializationSchema} and {@link AvroRowDataSerializationSchema}. + * + *

    This class is adapted in Iceberg to support custom 'timestamp-nanos' and + * 'local-timestamp-nanos' logical types (FLINK-39251). Once that ticket is resolved in Flink, these + * custom types may be removed. + */ +public class AvroSchemaConverter { + + private AvroSchemaConverter() { + // private + } + + /** + * Converts an Avro class into a nested row structure with deterministic field order and data + * types that are compatible with Flink's Table & SQL API. + * + * @param avroClass Avro specific record that contains schema information + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo( + Class avroClass) { + return convertToTypeInfo(avroClass, true); + } + + /** + * Converts an Avro class into a nested row structure with deterministic field order and data + * types that are compatible with Flink's Table & SQL API. + * + * @param avroClass Avro specific record that contains schema information + * @param legacyTimestampMapping legacy mapping of timestamp types + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo( + Class avroClass, boolean legacyTimestampMapping) { + Preconditions.checkNotNull(avroClass, "Avro specific record class must not be null."); + // determine schema to retrieve deterministic field order + final Schema schema = SpecificData.get().getSchema(avroClass); + return (TypeInformation) convertToTypeInfo(schema, legacyTimestampMapping); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. 
+ * + * @param avroSchemaString Avro schema definition string + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo(String avroSchemaString) { + return convertToTypeInfo(avroSchemaString, true); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. + * + * @param avroSchemaString Avro schema definition string + * @param legacyTimestampMapping legacy mapping of timestamp types + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo( + String avroSchemaString, boolean legacyTimestampMapping) { + Preconditions.checkNotNull(avroSchemaString, "Avro schema must not be null."); + final Schema schema; + try { + schema = new Schema.Parser().parse(avroSchemaString); + } catch (SchemaParseException e) { + throw new IllegalArgumentException("Could not parse Avro schema string.", e); + } + return (TypeInformation) convertToTypeInfo(schema, legacyTimestampMapping); + } + + private static TypeInformation convertToTypeInfo( + Schema schema, boolean legacyTimestampMapping) { + switch (schema.getType()) { + case RECORD: + final List fields = schema.getFields(); + + final TypeInformation[] types = new TypeInformation[fields.size()]; + final String[] names = new String[fields.size()]; + for (int i = 0; i < fields.size(); i++) { + final Schema.Field field = fields.get(i); + types[i] = convertToTypeInfo(field.schema(), legacyTimestampMapping); + names[i] = field.name(); + } + return Types.ROW_NAMED(names, types); + case ENUM: + return Types.STRING; + case ARRAY: + // result type might either be ObjectArrayTypeInfo or BasicArrayTypeInfo for Strings + return Types.OBJECT_ARRAY( + convertToTypeInfo(schema.getElementType(), legacyTimestampMapping)); + case MAP: + return Types.MAP( + Types.STRING, 
convertToTypeInfo(schema.getValueType(), legacyTimestampMapping)); + case UNION: + final Schema actualSchema; + if (schema.getTypes().size() == 2 + && schema.getTypes().get(0).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(1); + } else if (schema.getTypes().size() == 2 + && schema.getTypes().get(1).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(0); + } else if (schema.getTypes().size() == 1) { + actualSchema = schema.getTypes().get(0); + } else { + // use Kryo for serialization + return Types.GENERIC(Object.class); + } + return convertToTypeInfo(actualSchema, legacyTimestampMapping); + case FIXED: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + return Types.BIG_DEC; + } + // convert fixed size binary data to primitive byte arrays + return Types.PRIMITIVE_ARRAY(Types.BYTE); + case STRING: + // convert Avro's Utf8/CharSequence to String + return Types.STRING; + case BYTES: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + return Types.BIG_DEC; + } + return Types.PRIMITIVE_ARRAY(Types.BYTE); + case INT: + // logical date and time type + final org.apache.avro.LogicalType logicalType = schema.getLogicalType(); + if (logicalType == LogicalTypes.date()) { + return Types.SQL_DATE; + } else if (logicalType == LogicalTypes.timeMillis()) { + return Types.SQL_TIME; + } + return Types.INT; + case LONG: + if (legacyTimestampMapping) { + if (schema.getLogicalType() == LogicalTypes.timestampMillis() + || schema.getLogicalType() == LogicalTypes.timestampMicros() + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + || (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos"))) { + return Types.SQL_TIMESTAMP; + } else if (schema.getLogicalType() == LogicalTypes.timeMicros() + || schema.getLogicalType() == LogicalTypes.timeMillis()) { + return Types.SQL_TIME; + } + } else { + // Avro 
logical timestamp types to Flink DataStream timestamp types + if (schema.getLogicalType() == LogicalTypes.timestampMillis() + || schema.getLogicalType() == LogicalTypes.timestampMicros() + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + || (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos"))) { + return Types.INSTANT; + } else if (schema.getLogicalType() == LogicalTypes.localTimestampMillis() + || schema.getLogicalType() == LogicalTypes.localTimestampMicros() + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + || (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("local-timestamp-nanos"))) { + return Types.LOCAL_DATE_TIME; + } else if (schema.getLogicalType() == LogicalTypes.timeMicros() + || schema.getLogicalType() == LogicalTypes.timeMillis()) { + return Types.SQL_TIME; + } + } + return Types.LONG; + case FLOAT: + return Types.FLOAT; + case DOUBLE: + return Types.DOUBLE; + case BOOLEAN: + return Types.BOOLEAN; + case NULL: + return Types.VOID; + } + throw new IllegalArgumentException("Unsupported Avro type '" + schema.getType() + "'."); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. + * + * @param avroSchemaString Avro schema definition string + * @return data type matching the schema + */ + public static DataType convertToDataType(String avroSchemaString) { + return convertToDataType(avroSchemaString, true); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. 
+   *
+   * @param avroSchemaString Avro schema definition string
+   * @param legacyTimestampMapping when {@code true}, Avro {@code timestamp-*} types map to Flink
+   *     TIMESTAMP and {@code local-timestamp-*} types are not recognized (they stay BIGINT); when
+   *     {@code false}, {@code timestamp-*} maps to TIMESTAMP_LTZ and {@code local-timestamp-*}
+   *     maps to TIMESTAMP
+   * @return data type matching the schema
+   * @throws IllegalArgumentException if the string cannot be parsed as an Avro schema
+   */
+  public static DataType convertToDataType(
+      String avroSchemaString, boolean legacyTimestampMapping) {
+    Preconditions.checkNotNull(avroSchemaString, "Avro schema must not be null.");
+    final Schema schema;
+    try {
+      schema = new Schema.Parser().parse(avroSchemaString);
+    } catch (SchemaParseException e) {
+      // keep the parser failure as the cause so the offending part of the schema is visible
+      throw new IllegalArgumentException("Could not parse Avro schema string.", e);
+    }
+    return convertToDataType(schema, legacyTimestampMapping);
+  }
+
+  /**
+   * Recursively maps an Avro {@link Schema} to a Flink {@link DataType}.
+   *
+   * <p>Every Avro type is mapped to a NOT NULL Flink type; nullability is only introduced by a
+   * two-branch union in which one branch is {@code null}. Unions that cannot be reduced to a
+   * single branch fall back to a raw, Kryo-serialized type.
+   *
+   * @param schema Avro schema to convert
+   * @param legacyMapping see {@code legacyTimestampMapping} on the public overload; the flag is
+   *     passed unchanged to record fields, array elements, map values and union branches
+   * @return the matching Flink data type
+   * @throws IllegalArgumentException if the Avro type is not handled by the switch below
+   */
+  @SuppressWarnings("deprecation")
+  private static DataType convertToDataType(Schema schema, boolean legacyMapping) {
+    // Read once; Avro's LogicalType is fully qualified because LogicalType is Flink's in this file.
+    final org.apache.avro.LogicalType logicalType = schema.getLogicalType();
+    switch (schema.getType()) {
+      case RECORD:
+        final List<Schema.Field> schemaFields = schema.getFields();
+
+        final DataTypes.Field[] fields = new DataTypes.Field[schemaFields.size()];
+        for (int i = 0; i < schemaFields.size(); i++) {
+          final Schema.Field field = schemaFields.get(i);
+          fields[i] =
+              DataTypes.FIELD(field.name(), convertToDataType(field.schema(), legacyMapping));
+        }
+        return DataTypes.ROW(fields).notNull();
+      case ENUM:
+        return DataTypes.STRING().notNull();
+      case ARRAY:
+        return DataTypes.ARRAY(convertToDataType(schema.getElementType(), legacyMapping)).notNull();
+      case MAP:
+        return DataTypes.MAP(
+                DataTypes.STRING().notNull(),
+                convertToDataType(schema.getValueType(), legacyMapping))
+            .notNull();
+      case UNION:
+        final List<Schema> unionTypes = schema.getTypes();
+        final Schema actualSchema;
+        final boolean nullable;
+        if (unionTypes.size() == 2 && unionTypes.get(0).getType() == Schema.Type.NULL) {
+          // [null, T] -> nullable T
+          actualSchema = unionTypes.get(1);
+          nullable = true;
+        } else if (unionTypes.size() == 2 && unionTypes.get(1).getType() == Schema.Type.NULL) {
+          // [T, null] -> nullable T
+          actualSchema = unionTypes.get(0);
+          nullable = true;
+        } else if (unionTypes.size() == 1) {
+          // [T] -> T, nullability left as produced by the branch itself
+          actualSchema = unionTypes.get(0);
+          nullable = false;
+        } else {
+          // use Kryo for serialization
+          return new AtomicDataType(
+              new TypeInformationRawType<>(false, Types.GENERIC(Object.class)));
+        }
+        DataType converted = convertToDataType(actualSchema, legacyMapping);
+        return nullable ? converted.nullable() : converted;
+      case FIXED:
+        // logical decimal type
+        if (logicalType instanceof LogicalTypes.Decimal) {
+          final LogicalTypes.Decimal decimalType = (LogicalTypes.Decimal) logicalType;
+          return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale()).notNull();
+        }
+        // convert fixed size binary data to primitive byte arrays
+        return DataTypes.VARBINARY(schema.getFixedSize()).notNull();
+      case STRING:
+        // convert Avro's Utf8/CharSequence to String
+        return DataTypes.STRING().notNull();
+      case BYTES:
+        // logical decimal type
+        if (logicalType instanceof LogicalTypes.Decimal) {
+          final LogicalTypes.Decimal decimalType = (LogicalTypes.Decimal) logicalType;
+          return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale()).notNull();
+        }
+        return DataTypes.BYTES().notNull();
+      case INT:
+        // logical date and time type
+        if (logicalType == LogicalTypes.date()) {
+          return DataTypes.DATE().notNull();
+        } else if (logicalType == LogicalTypes.timeMillis()) {
+          return DataTypes.TIME(3).notNull();
+        }
+        return DataTypes.INT().notNull();
+      case LONG:
+        // Time-of-day logical types are mapped the same way in both timestamp mapping modes.
+        if (logicalType == LogicalTypes.timeMillis()) {
+          return DataTypes.TIME(3).notNull();
+        } else if (logicalType == LogicalTypes.timeMicros()) {
+          return DataTypes.TIME(6).notNull();
+        }
+
+        if (legacyMapping) {
+          // Avro logical timestamp types to Flink SQL timestamp types
+          if (logicalType == LogicalTypes.timestampMillis()) {
+            return DataTypes.TIMESTAMP(3).notNull();
+          } else if (logicalType == LogicalTypes.timestampMicros()) {
+            return DataTypes.TIMESTAMP(6).notNull();
+          } else if (logicalType != null && "timestamp-nanos".equals(logicalType.getName())) {
+            // Iceberg: Added support for custom nanosecond logical type (FLINK-39251)
+            return DataTypes.TIMESTAMP(9).notNull();
+          }
+        } else {
+          // Avro logical timestamp types to Flink SQL timestamp types
+          if (logicalType == LogicalTypes.timestampMillis()) {
+            return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3).notNull();
+          } else if (logicalType == LogicalTypes.timestampMicros()) {
+            return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(6).notNull();
+          } else if (logicalType != null && "timestamp-nanos".equals(logicalType.getName())) {
+            // Iceberg: Added support for custom nanosecond logical type (FLINK-39251)
+            return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(9).notNull();
+          } else if (logicalType == LogicalTypes.localTimestampMillis()) {
+            return DataTypes.TIMESTAMP(3).notNull();
+          } else if (logicalType == LogicalTypes.localTimestampMicros()) {
+            return DataTypes.TIMESTAMP(6).notNull();
+          } else if (logicalType != null
+              && "local-timestamp-nanos".equals(logicalType.getName())) {
+            // Iceberg: Added support for custom nanosecond logical type (FLINK-39251)
+            return DataTypes.TIMESTAMP(9).notNull();
+          }
+        }
+
+        // plain long, or a logical type this converter does not recognize in the current mode
+        return DataTypes.BIGINT().notNull();
+      case FLOAT:
+        return DataTypes.FLOAT().notNull();
+      case DOUBLE:
+        return DataTypes.DOUBLE().notNull();
+      case BOOLEAN:
+        return DataTypes.BOOLEAN().notNull();
+      case NULL:
+        return DataTypes.NULL();
+    }
+    throw new IllegalArgumentException("Unsupported Avro type '" + schema.getType() + "'.");
+  }
+
+  /**
+   * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema.
+   *
+   *

    <p>Use "org.apache.flink.avro.generated.record" as the type name.
+   *
+   * <p>This overload delegates with the legacy timestamp mapping enabled ({@code true}).
+   *
+   * @param schema the schema type, usually it should be the top level record type, e.g. not a
+   *     nested type
+   * @return Avro's {@link Schema} matching this logical type.
+   */
+  public static Schema convertToSchema(LogicalType schema) {
+    return convertToSchema(schema, true);
+  }
+
+  /**
+   * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema.
+   *
+   * <p>Use "org.apache.flink.avro.generated.record" as the type name.
+   *
+   * @param schema the schema type, usually it should be the top level record type, e.g. not a
+   *     nested type
+   * @param legacyTimestampMapping whether to use the legacy timestamp mapping
+   * @return Avro's {@link Schema} matching this logical type.
+   */
+  public static Schema convertToSchema(LogicalType schema, boolean legacyTimestampMapping) {
+    return convertToSchema(
+        schema, "org.apache.flink.avro.generated.record", legacyTimestampMapping);
+  }
+
+  /**
+   * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema.
+   *
+   * <p>The "{rowName}_" is used as the nested row type name prefix in order to generate the right
+   * schema. Nested record type that only differs with type name is still compatible.
+   *
+   * <p>This overload delegates with the legacy timestamp mapping enabled ({@code true}).
+   *
+   * @param logicalType logical type
+   * @param rowName the record name
+   * @return Avro's {@link Schema} matching this logical type.
+   */
+  public static Schema convertToSchema(LogicalType logicalType, String rowName) {
+    return convertToSchema(logicalType, rowName, true);
+  }
+
+  /**
+   * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema.
+   *
+   *

    <p>The "{rowName}_" is used as the nested row type name prefix in order to generate the right
+   * schema. Nested record type that only differs with type name is still compatible.
+   *
+   * <p>{@code legacyTimestampMapping} is applied at every nesting level: row fields, array
+   * elements and map/multiset values are all converted with the same flag.
+   *
+   * @param logicalType logical type
+   * @param rowName the record name
+   * @param legacyTimestampMapping whether to use the legacy timestamp mapping
+   * @return Avro's {@link Schema} matching this logical type.
+   * @throws IllegalArgumentException if a TIMESTAMP, TIMESTAMP_LTZ or TIME precision is larger
+   *     than the precision accepted for the selected mapping
+   * @throws UnsupportedOperationException if the type has no Avro mapping here (for example RAW,
+   *     or TIMESTAMP_LTZ under the legacy mapping), or if a map/multiset key is not a character
+   *     string
+   */
+  public static Schema convertToSchema(
+      LogicalType logicalType, String rowName, boolean legacyTimestampMapping) {
+    int precision;
+    // shared by the two timestamp cases below
+    org.apache.avro.LogicalType avroLogicalType;
+    Schema timestamp;
+    boolean nullable = logicalType.isNullable();
+    switch (logicalType.getTypeRoot()) {
+      case NULL:
+        return SchemaBuilder.builder().nullType();
+      case BOOLEAN:
+        Schema bool = SchemaBuilder.builder().booleanType();
+        return nullable ? nullableSchema(bool) : bool;
+      case TINYINT:
+      case SMALLINT:
+      case INTEGER:
+        Schema integer = SchemaBuilder.builder().intType();
+        return nullable ? nullableSchema(integer) : integer;
+      case BIGINT:
+        Schema bigint = SchemaBuilder.builder().longType();
+        return nullable ? nullableSchema(bigint) : bigint;
+      case FLOAT:
+        Schema floatSchema = SchemaBuilder.builder().floatType();
+        return nullable ? nullableSchema(floatSchema) : floatSchema;
+      case DOUBLE:
+        Schema doubleSchema = SchemaBuilder.builder().doubleType();
+        return nullable ? nullableSchema(doubleSchema) : doubleSchema;
+      case CHAR:
+      case VARCHAR:
+        Schema str = SchemaBuilder.builder().stringType();
+        return nullable ? nullableSchema(str) : str;
+      case BINARY:
+      case VARBINARY:
+        Schema binary = SchemaBuilder.builder().bytesType();
+        return nullable ? nullableSchema(binary) : binary;
+      case TIMESTAMP_WITHOUT_TIME_ZONE:
+        // use long to represent Timestamp
+        final TimestampType timestampType = (TimestampType) logicalType;
+        precision = timestampType.getPrecision();
+        if (legacyTimestampMapping) {
+          // legacy mapping: TIMESTAMP is written as timestamp-millis, precision 0..3 only
+          if (precision <= 3) {
+            avroLogicalType = LogicalTypes.timestampMillis();
+          } else {
+            throw new IllegalArgumentException(
+                "Avro does not support TIMESTAMP type "
+                    + "with precision: "
+                    + precision
+                    + ", it only supports precision less than 3.");
+          }
+        } else {
+          // non-legacy mapping: TIMESTAMP is written as local-timestamp-millis/micros
+          if (precision <= 3) {
+            avroLogicalType = LogicalTypes.localTimestampMillis();
+          } else if (precision <= 6) {
+            avroLogicalType = LogicalTypes.localTimestampMicros();
+          } else {
+            throw new IllegalArgumentException(
+                "Avro does not support LOCAL TIMESTAMP type "
+                    + "with precision: "
+                    + precision
+                    + ", it only supports precision less than 6.");
+          }
+        }
+        timestamp = avroLogicalType.addToSchema(SchemaBuilder.builder().longType());
+        return nullable ? nullableSchema(timestamp) : timestamp;
+      case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+        if (legacyTimestampMapping) {
+          throw new UnsupportedOperationException(
+              "Unsupported to derive Schema for type: " + logicalType);
+        } else {
+          final LocalZonedTimestampType localZonedTimestampType =
+              (LocalZonedTimestampType) logicalType;
+          precision = localZonedTimestampType.getPrecision();
+          if (precision <= 3) {
+            avroLogicalType = LogicalTypes.timestampMillis();
+          } else if (precision <= 6) {
+            avroLogicalType = LogicalTypes.timestampMicros();
+          } else {
+            throw new IllegalArgumentException(
+                "Avro does not support TIMESTAMP type "
+                    + "with precision: "
+                    + precision
+                    + ", it only supports precision less than 6.");
+          }
+          timestamp = avroLogicalType.addToSchema(SchemaBuilder.builder().longType());
+          return nullable ? nullableSchema(timestamp) : timestamp;
+        }
+      case DATE:
+        // use int to represent Date
+        Schema date = LogicalTypes.date().addToSchema(SchemaBuilder.builder().intType());
+        return nullable ? nullableSchema(date) : date;
+      case TIME_WITHOUT_TIME_ZONE:
+        precision = ((TimeType) logicalType).getPrecision();
+        if (precision > 3) {
+          throw new IllegalArgumentException(
+              "Avro does not support TIME type with precision: "
+                  + precision
+                  + ", it only supports precision less than 3.");
+        }
+        // use int to represent Time; only millisecond precision is supported on deserialization
+        Schema time = LogicalTypes.timeMillis().addToSchema(SchemaBuilder.builder().intType());
+        return nullable ? nullableSchema(time) : time;
+      case DECIMAL:
+        DecimalType decimalType = (DecimalType) logicalType;
+        // store BigDecimal as byte[]
+        Schema decimal =
+            LogicalTypes.decimal(decimalType.getPrecision(), decimalType.getScale())
+                .addToSchema(SchemaBuilder.builder().bytesType());
+        return nullable ? nullableSchema(decimal) : decimal;
+      case ROW:
+        RowType rowType = (RowType) logicalType;
+        List<String> fieldNames = rowType.getFieldNames();
+        // we have to make sure the record name is different in a Schema
+        SchemaBuilder.FieldAssembler<Schema> builder =
+            SchemaBuilder.builder().record(rowName).fields();
+        for (int i = 0; i < rowType.getFieldCount(); i++) {
+          String fieldName = fieldNames.get(i);
+          LogicalType fieldType = rowType.getTypeAt(i);
+          SchemaBuilder.GenericDefault<Schema> fieldBuilder =
+              builder
+                  .name(fieldName)
+                  .type(
+                      convertToSchema(
+                          fieldType, rowName + "_" + fieldName, legacyTimestampMapping));
+
+          // nullable fields get an explicit null default, all others get no default
+          if (fieldType.isNullable()) {
+            builder = fieldBuilder.withDefault(null);
+          } else {
+            builder = fieldBuilder.noDefault();
+          }
+        }
+        Schema record = builder.endRecord();
+        return nullable ? nullableSchema(record) : record;
+      case MULTISET:
+      case MAP:
+        // Pass the mapping flag on: the two-argument overload would reset it to the legacy
+        // mapping for timestamps nested inside the map values.
+        Schema map =
+            SchemaBuilder.builder()
+                .map()
+                .values(
+                    convertToSchema(
+                        extractValueTypeToAvroMap(logicalType), rowName, legacyTimestampMapping));
+        return nullable ? nullableSchema(map) : map;
+      case ARRAY:
+        ArrayType arrayType = (ArrayType) logicalType;
+        // Same as for maps: element types must see the caller's timestamp mapping.
+        Schema array =
+            SchemaBuilder.builder()
+                .array()
+                .items(
+                    convertToSchema(arrayType.getElementType(), rowName, legacyTimestampMapping));
+        return nullable ? nullableSchema(array) : array;
+      case RAW:
+      default:
+        throw new UnsupportedOperationException(
+            "Unsupported to derive Schema for type: " + logicalType);
+    }
+  }
+
+  /**
+   * Returns the Flink type that becomes the Avro map value type for a MAP or MULTISET.
+   *
+   * <p>For a {@link MapType} this is the map's value type. Any other argument is cast to {@link
+   * MultisetType}; its element type acts as the key and the value type is a new {@link IntType}.
+   *
+   * @param type a MAP or MULTISET logical type
+   * @return the value type to convert into the Avro map's values schema
+   * @throws UnsupportedOperationException if the key type is not in the CHARACTER_STRING family
+   */
+  public static LogicalType extractValueTypeToAvroMap(LogicalType type) {
+    LogicalType keyType;
+    LogicalType valueType;
+    if (type instanceof MapType) {
+      MapType mapType = (MapType) type;
+      keyType = mapType.getKeyType();
+      valueType = mapType.getValueType();
+    } else {
+      MultisetType multisetType = (MultisetType) type;
+      keyType = multisetType.getElementType();
+      valueType = new IntType();
+    }
+    if (!keyType.is(LogicalTypeFamily.CHARACTER_STRING)) {
+      throw new UnsupportedOperationException(
+          "Avro format doesn't support non-string as key type of map. "
+              + "The key type is: "
+              + keyType.asSummaryString());
+    }
+    return valueType;
+  }
+
+  /** Returns schema with nullable true. */
+  private static Schema nullableSchema(Schema schema) {
+    return schema.isNullable()
+        ?
schema + : Schema.createUnion(SchemaBuilder.builder().nullType(), schema); + } +} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java index f68605accc57..30e95b1edba0 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java @@ -260,10 +260,6 @@ public void unlock() { this, instanceId, count); - } catch (SQLException e) { - // SQL exception happened when deleting lock information - throw new UncheckedSQLException( - e, "Failed to delete %s lock with instanceId %s", this, instanceId); } return null; @@ -298,9 +294,6 @@ private String instanceId() { return null; } } - } catch (SQLException e) { - // SQL exception happened when getting lock information - throw new UncheckedSQLException(e, "Failed to get lock information for %s", type); } }); } catch (InterruptedException e) { diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java index 9aeee75b1464..f03f33a3fd81 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java @@ -24,6 +24,7 @@ import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.util.function.SerializableSupplier; import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.actions.BinPackRewriteFilePlanner; import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; @@ -59,7 +60,7 @@ public static class 
Builder extends MaintenanceTaskBuilder rewriteOptions = Maps.newHashMapWithExpectedSize(6); private long maxRewriteBytes = Long.MAX_VALUE; - private Expression filter = Expressions.alwaysTrue(); + private SerializableSupplier filterSupplier = Expressions::alwaysTrue; private String branch = SnapshotRef.MAIN_BRANCH; @Override @@ -214,9 +215,32 @@ public Builder maxFilesToRewrite(int maxFilesToRewrite) { * * @param newFilter the filter expression to apply * @return this for method chaining + * @deprecated will be removed in 1.12.0. Use {@link #filter(SerializableSupplier)} instead */ + @Deprecated public Builder filter(Expression newFilter) { - this.filter = newFilter; + this.filterSupplier = () -> newFilter; + return this; + } + + /** + * A user-provided supplier of a filter expression that determines which files are considered by + * the rewrite strategy. + * + *

    The supplier is evaluated by the planner on every compaction trigger, allowing a fresh + * filter to be produced for each compaction run. + * + *

    This is particularly useful for time-relative filters. For example, a supplier such as + * {@code () -> Expressions.greaterThanOrEqual("ts", + * LocalDateTime.now(ZoneOffset.UTC).minus(Duration.ofDays(3)).toString())} ensures that each + * compaction rewrites files from the last 3 days relative to the time the compaction is + * planned, rather than relative to when the job was started. + * + * @param newFilterSupplier the supplier providing the filter expression to apply + * @return this for method chaining + */ + public Builder filter(SerializableSupplier newFilterSupplier) { + this.filterSupplier = newFilterSupplier; return this; } @@ -276,7 +300,7 @@ DataStream append(DataStream trigger) { partialProgressEnabled ? partialProgressMaxCommits : 1, maxRewriteBytes, rewriteOptions, - filter, + filterSupplier, branch)) .name(operatorName(PLANNER_TASK_NAME)) .uid(PLANNER_TASK_NAME + uidSuffix()) diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java index a9360374df28..b78c602c647f 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java @@ -26,6 +26,7 @@ import org.apache.flink.metrics.Counter; import org.apache.flink.streaming.api.functions.ProcessFunction; import org.apache.flink.util.Collector; +import org.apache.flink.util.function.SerializableSupplier; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.SerializableTable; @@ -62,8 +63,8 @@ public class DataFileRewritePlanner private final long maxRewriteBytes; private final Map rewriterOptions; private transient Counter errorCounter; - private final Expression filter; private final String branch; + private final 
SerializableSupplier filterSupplier; public DataFileRewritePlanner( String tableName, @@ -73,7 +74,7 @@ public DataFileRewritePlanner( int newPartialProgressMaxCommits, long maxRewriteBytes, Map rewriterOptions, - Expression filter, + SerializableSupplier filterSupplier, String branch) { Preconditions.checkNotNull(tableName, "Table name should no be null"); @@ -89,8 +90,8 @@ public DataFileRewritePlanner( this.partialProgressMaxCommits = newPartialProgressMaxCommits; this.maxRewriteBytes = maxRewriteBytes; this.rewriterOptions = rewriterOptions; - this.filter = filter; this.branch = branch; + this.filterSupplier = filterSupplier; } @Override @@ -125,7 +126,7 @@ public void processElement(Trigger value, Context ctx, Collector o } BinPackRewriteFilePlanner planner = - new BinPackRewriteFilePlanner(table, filter, snapshot.snapshotId(), false); + new BinPackRewriteFilePlanner(table, filterSupplier.get(), snapshot.snapshotId(), false); planner.init(rewriterOptions); FileRewritePlan diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java index f7e8e0c884cf..5f3494330cfc 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java @@ -21,14 +21,14 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.api.common.functions.MapFunction; -import org.apache.flink.formats.avro.AvroToRowDataConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.RowType; import 
org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.flink.formats.avro.AvroToRowDataConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; /** * This util class converts Avro GenericRecord to Flink RowData.
    diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java index 434f3969577f..6cf15ff713fb 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java @@ -18,23 +18,33 @@ */ package org.apache.iceberg.flink.sink; -import com.codahale.metrics.SlidingWindowReservoir; import java.util.Arrays; import java.util.concurrent.atomic.AtomicLong; import org.apache.flink.annotation.Internal; -import org.apache.flink.dropwizard.metrics.DropwizardHistogramWrapper; +import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.metrics.Counter; import org.apache.flink.metrics.Histogram; import org.apache.flink.metrics.MetricGroup; +import org.apache.iceberg.common.DynClasses; +import org.apache.iceberg.common.DynConstructors; import org.apache.iceberg.io.WriteResult; import org.apache.iceberg.util.ScanTaskUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; @Internal public class IcebergStreamWriterMetrics { + + private static final Logger LOG = LoggerFactory.getLogger(IcebergStreamWriterMetrics.class); + // 1,024 reservoir size should cost about 8KB, which is quite small. // It should also produce good accuracy for histogram distribution (like percentiles). private static final int HISTOGRAM_RESERVOIR_SIZE = 1024; + // Histogram metrics loaded through Flink's optional flink-metrics-dropwizard dependency. + // Will be null if not available. 
+ private static final DropwizardCtors DROPWIZARD = loadDropwizardCtors(); + private final Counter flushedDataFiles; private final Counter flushedDeleteFiles; private final Counter flushedReferencedDataFiles; @@ -51,18 +61,8 @@ public IcebergStreamWriterMetrics(MetricGroup metrics, String fullTableName) { this.lastFlushDurationMs = new AtomicLong(); writerMetrics.gauge("lastFlushDurationMs", lastFlushDurationMs::get); - com.codahale.metrics.Histogram dropwizardDataFilesSizeHistogram = - new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); - this.dataFilesSizeHistogram = - writerMetrics.histogram( - "dataFilesSizeHistogram", - new DropwizardHistogramWrapper(dropwizardDataFilesSizeHistogram)); - com.codahale.metrics.Histogram dropwizardDeleteFilesSizeHistogram = - new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); - this.deleteFilesSizeHistogram = - writerMetrics.histogram( - "deleteFilesSizeHistogram", - new DropwizardHistogramWrapper(dropwizardDeleteFilesSizeHistogram)); + this.dataFilesSizeHistogram = registerHistogram(writerMetrics, "dataFilesSizeHistogram"); + this.deleteFilesSizeHistogram = registerHistogram(writerMetrics, "deleteFilesSizeHistogram"); } public void updateFlushResult(WriteResult result) { @@ -74,16 +74,21 @@ public void updateFlushResult(WriteResult result) { // This should works equally well and we avoided the overhead of tracking the list of file sizes // in the {@link CommitSummary}, which currently stores simple stats for counters and gauges // metrics. 
- Arrays.stream(result.dataFiles()) - .forEach( - dataFile -> { - dataFilesSizeHistogram.update(dataFile.fileSizeInBytes()); - }); - Arrays.stream(result.deleteFiles()) - .forEach( - deleteFile -> { - deleteFilesSizeHistogram.update(ScanTaskUtil.contentSizeInBytes(deleteFile)); - }); + if (dataFilesSizeHistogram != null) { + Arrays.stream(result.dataFiles()) + .forEach( + dataFile -> { + dataFilesSizeHistogram.update(dataFile.fileSizeInBytes()); + }); + } + + if (deleteFilesSizeHistogram != null) { + Arrays.stream(result.deleteFiles()) + .forEach( + deleteFile -> { + deleteFilesSizeHistogram.update(ScanTaskUtil.contentSizeInBytes(deleteFile)); + }); + } } public void flushDuration(long flushDurationMs) { @@ -97,4 +102,60 @@ public Counter getFlushedDataFiles() { public Counter getFlushedDeleteFiles() { return flushedDeleteFiles; } + + @VisibleForTesting + Histogram dataFilesSizeHistogram() { + return dataFilesSizeHistogram; + } + + @VisibleForTesting + Histogram deleteFilesSizeHistogram() { + return deleteFilesSizeHistogram; + } + + private static Histogram registerHistogram(MetricGroup group, String name) { + Histogram histogram = newDropwizardHistogram(); + return histogram != null ? 
group.histogram(name, histogram) : null; + } + + private static Histogram newDropwizardHistogram() { + if (DROPWIZARD == null) { + return null; + } + + Object reservoir = DROPWIZARD.reservoirCtor.newInstance(HISTOGRAM_RESERVOIR_SIZE); + Object codahaleHistogram = DROPWIZARD.histogramCtor.newInstance(reservoir); + return DROPWIZARD.wrapperCtor.newInstance(codahaleHistogram); + } + + private static DropwizardCtors loadDropwizardCtors() { + try { + Class reservoirInterface = + DynClasses.builder().impl("com.codahale.metrics.Reservoir").buildChecked(); + Class codahaleHistogramClass = + DynClasses.builder().impl("com.codahale.metrics.Histogram").buildChecked(); + return new DropwizardCtors( + DynConstructors.builder() + .impl("com.codahale.metrics.SlidingWindowReservoir", int.class) + .buildChecked(), + DynConstructors.builder() + .impl("com.codahale.metrics.Histogram", reservoirInterface) + .buildChecked(), + DynConstructors.builder(Histogram.class) + .impl( + "org.apache.flink.dropwizard.metrics.DropwizardHistogramWrapper", + codahaleHistogramClass) + .buildChecked()); + } catch (ClassNotFoundException | NoSuchMethodException e) { + LOG.warn( + "Cannot load Dropwizard metrics; is org.apache.flink:flink-metrics-dropwizard on the classpath?", + e); + return null; + } + } + + private record DropwizardCtors( + DynConstructors.Ctor reservoirCtor, + DynConstructors.Ctor histogramCtor, + DynConstructors.Ctor wrapperCtor) {} } diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java index 4b5c9bef41e1..ad430cbf13f8 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java @@ -44,6 +44,7 @@ import org.apache.flink.streaming.api.datastream.DataStream; import 
org.apache.flink.streaming.api.datastream.DataStreamSink; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.runtime.operators.sink.SinkWriterOperatorFactory; import org.apache.flink.table.data.RowData; import org.apache.flink.util.OutputTag; import org.apache.iceberg.Table; @@ -79,13 +80,17 @@ public class DynamicIcebergSink private final Configuration flinkConfig; private final int cacheMaximumSize; + // Set by the builder before sinkTo() — forward writer results to union into pre-commit topology + private final transient DataStream> forwardWriteResults; + DynamicIcebergSink( CatalogLoader catalogLoader, Map snapshotProperties, String uidPrefix, Map writeProperties, Configuration flinkConfig, - int cacheMaximumSize) { + int cacheMaximumSize, + DataStream> forwardWriteResults) { this.catalogLoader = catalogLoader; this.snapshotProperties = snapshotProperties; this.uidPrefix = uidPrefix; @@ -96,6 +101,7 @@ public class DynamicIcebergSink // This is used to separate files generated by different sinks writing the same table. // Also used to generate the aggregator operator name this.sinkId = UUID.randomUUID().toString(); + this.forwardWriteResults = forwardWriteResults; } @Override @@ -144,7 +150,11 @@ public DataStream> addPreCommitTopology( TypeInformation> typeInformation = CommittableMessageTypeInfo.of(this::getCommittableSerializer); - return writeResults + // Union forward writer results with the shuffle writer results + DataStream> allResults = + writeResults.union(forwardWriteResults); + + return allResults .keyBy( committable -> { if (committable instanceof CommittableSummary) { @@ -167,6 +177,55 @@ public SimpleVersionedSerializer getWriteResultSerializer() return new DynamicWriteResultSerializer(); } + /** + * A lightweight Sink used with {@link SinkWriterOperatorFactory} for the forward write path. 
+ * Implements {@link SupportsCommitter} so that {@code SinkWriterOperator} emits committables + * downstream. The committer is never called — committing is handled by the main sink. + */ + @VisibleForTesting + static class ForwardWriterSink + implements Sink, SupportsCommitter { + + private final CatalogLoader catalogLoader; + private final Map writeProperties; + private final Configuration flinkConfig; + private final int cacheMaximumSize; + + ForwardWriterSink( + CatalogLoader catalogLoader, + Map writeProperties, + Configuration flinkConfig, + int cacheMaximumSize) { + this.catalogLoader = catalogLoader; + this.writeProperties = writeProperties; + this.flinkConfig = flinkConfig; + this.cacheMaximumSize = cacheMaximumSize; + } + + @Override + public SinkWriter createWriter(WriterInitContext context) { + return new DynamicWriter( + catalogLoader.loadCatalog(), + writeProperties, + flinkConfig, + cacheMaximumSize, + new DynamicWriterMetrics(context.metricGroup()), + context.getTaskInfo().getIndexOfThisSubtask(), + context.getTaskInfo().getAttemptNumber()); + } + + @Override + public Committer createCommitter(CommitterInitContext context) { + throw new UnsupportedOperationException( + "WriterSink is used only for writing; committing is handled by the main sink"); + } + + @Override + public SimpleVersionedSerializer getCommittableSerializer() { + return new DynamicWriteResultSerializer(); + } + } + public static class Builder { private DataStream input; private DynamicRecordGenerator generator; @@ -176,12 +235,6 @@ public static class Builder { private final Map snapshotSummary = Maps.newHashMap(); private ReadableConfig readableConfig = new Configuration(); private TableCreator tableCreator = TableCreator.DEFAULT; - private boolean immediateUpdate = false; - private boolean dropUnusedColumns = false; - private int cacheMaximumSize = 100; - private long cacheRefreshMs = 1_000; - private int inputSchemasPerTableCacheMaximumSize = 10; - private boolean caseSensitive = 
true; Builder() {} @@ -302,7 +355,9 @@ public Builder toBranch(String branch) { } public Builder immediateTableUpdate(boolean newImmediateUpdate) { - this.immediateUpdate = newImmediateUpdate; + writeOptions.put( + FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.key(), + Boolean.toString(newImmediateUpdate)); return this; } @@ -318,19 +373,21 @@ public Builder immediateTableUpdate(boolean newImmediateUpdate) { * will never return data of the old column. */ public Builder dropUnusedColumns(boolean newDropUnusedColumns) { - this.dropUnusedColumns = newDropUnusedColumns; + writeOptions.put( + FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key(), + Boolean.toString(newDropUnusedColumns)); return this; } /** Maximum size of the caches used in Dynamic Sink for table data and serializers. */ public Builder cacheMaxSize(int maxSize) { - this.cacheMaximumSize = maxSize; + writeOptions.put(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.key(), Integer.toString(maxSize)); return this; } /** Maximum interval for cache items renewals. */ public Builder cacheRefreshMs(long refreshMs) { - this.cacheRefreshMs = refreshMs; + writeOptions.put(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.key(), Long.toString(refreshMs)); return this; } @@ -340,7 +397,9 @@ public Builder cacheRefreshMs(long refreshMs) { * comparison results. */ public Builder inputSchemasPerTableCacheMaxSize(int inputSchemasPerTableCacheMaxSize) { - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaxSize; + writeOptions.put( + FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.key(), + Integer.toString(inputSchemasPerTableCacheMaxSize)); return this; } @@ -349,7 +408,8 @@ public Builder inputSchemasPerTableCacheMaxSize(int inputSchemasPerTableCache * field names case-sensitive. 
*/ public Builder caseSensitive(boolean newCaseSensitive) { - this.caseSensitive = newCaseSensitive; + writeOptions.put( + FlinkDynamicSinkOptions.CASE_SENSITIVE.key(), Boolean.toString(newCaseSensitive)); return this; } @@ -357,89 +417,134 @@ private String operatorName(String suffix) { return uidPrefix != null ? uidPrefix + "-" + suffix : suffix; } - private DynamicIcebergSink build() { + private DynamicIcebergSink build( + SingleOutputStreamOperator converted, + DynamicRecordInternalType sideOutputType) { Preconditions.checkArgument( generator != null, "Please use withGenerator() to convert the input DataStream."); Preconditions.checkNotNull(catalogLoader, "Catalog loader shouldn't be null"); - uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); + Configuration flinkConfig = fromReadableConfig(); + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeOptions, flinkConfig); - Configuration flinkConfig = - readableConfig instanceof Configuration - ? (Configuration) readableConfig - : Configuration.fromMap(readableConfig.toMap()); + // Forward writer: chained with generator via forward edge, no data shuffle + ForwardWriterSink forwardWriterSink = + new ForwardWriterSink( + catalogLoader, writeOptions, flinkConfig, flinkDynamicSinkConf.cacheMaxSize()); + TypeInformation> writeResultTypeInfo = + CommittableMessageTypeInfo.of(DynamicWriteResultSerializer::new); - return instantiateSink(writeOptions, flinkConfig); + DataStream> forwardWriteResults = + converted + .getSideOutput( + new OutputTag<>(DynamicRecordProcessor.DYNAMIC_FORWARD_STREAM, sideOutputType)) + .transform( + operatorName("Forward-Writer"), + writeResultTypeInfo, + new SinkWriterOperatorFactory<>(forwardWriterSink)) + .setParallelism(converted.getParallelism()) + .uid(prefixIfNotNull(uidPrefix, "-forward-writer")); + + // Inject forward write results into sink — they'll be unioned in addPreCommitTopology + return instantiateSink(writeOptions, flinkConfig, forwardWriteResults); 
} @VisibleForTesting DynamicIcebergSink instantiateSink( - Map writeProperties, Configuration flinkWriteConf) { + Map writeProperties, + Configuration flinkWriteConf, + DataStream> forwardWriteResults) { + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeProperties, flinkWriteConf); return new DynamicIcebergSink( catalogLoader, snapshotSummary, uidPrefix, writeProperties, flinkWriteConf, - cacheMaximumSize); + flinkDynamicSinkConf.cacheMaxSize(), + forwardWriteResults); } /** * Append the iceberg sink operators to write records to iceberg table. * + *

    The topology splits records by distribution mode: + * + *

      + *
    • Forward records ({@code null} distributionMode) go through a forward edge to a chained + * writer, avoiding any data shuffle. + *
    • Shuffle records (non-null distributionMode) go through the standard Sink2 pipeline with + * hash/round-robin distribution. + *
    + * + * Both writers feed into a single shared pre-commit aggregator and committer, ensuring atomic + * commits across both paths. + * * @return {@link DataStreamSink} for sink. */ public DataStreamSink append() { + uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); + + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeOptions, readableConfig); + Configuration flinkConfig = fromReadableConfig(); + DynamicRecordInternalType type = - new DynamicRecordInternalType(catalogLoader, false, cacheMaximumSize); - DynamicIcebergSink sink = build(); + new DynamicRecordInternalType(catalogLoader, false, flinkDynamicSinkConf.cacheMaxSize()); + DynamicRecordInternalType sideOutputType = + new DynamicRecordInternalType(catalogLoader, true, flinkDynamicSinkConf.cacheMaxSize()); + SingleOutputStreamOperator converted = input .process( new DynamicRecordProcessor<>( generator, catalogLoader, - immediateUpdate, - cacheMaximumSize, - cacheRefreshMs, - inputSchemasPerTableCacheMaximumSize, tableCreator, - caseSensitive, - dropUnusedColumns)) + flinkDynamicSinkConf, + writeOptions, + flinkConfig)) + .setParallelism(input.getParallelism()) .uid(prefixIfNotNull(uidPrefix, "-generator")) .name(operatorName("generator")) .returns(type); - DataStreamSink rowDataDataStreamSink = + DynamicIcebergSink sink = build(converted, sideOutputType); + + // Shuffle path: table update side output + main output → sinkTo() + DataStream shuffleInput = converted .getSideOutput( new OutputTag<>( - DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, - new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize))) + DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, sideOutputType)) .keyBy((KeySelector) DynamicRecordInternal::tableName) .map( - new DynamicTableUpdateOperator( - catalogLoader, - cacheMaximumSize, - cacheRefreshMs, - inputSchemasPerTableCacheMaximumSize, - tableCreator, - caseSensitive, - dropUnusedColumns)) + new DynamicTableUpdateOperator(catalogLoader, 
tableCreator, flinkDynamicSinkConf)) .uid(prefixIfNotNull(uidPrefix, "-updater")) .name(operatorName("Updater")) .returns(type) - .union(converted) - .sinkTo(sink) + .union(converted); + + DataStreamSink result = + shuffleInput + .sinkTo(sink) // Forward write results are implicitly injected here .uid(prefixIfNotNull(uidPrefix, "-sink")); FlinkWriteConf flinkWriteConf = new FlinkWriteConf(writeOptions, readableConfig); if (flinkWriteConf.writeParallelism() != null) { - rowDataDataStreamSink.setParallelism(flinkWriteConf.writeParallelism()); + result.setParallelism(flinkWriteConf.writeParallelism()); } - return rowDataDataStreamSink; + return result; + } + + private Configuration fromReadableConfig() { + return readableConfig instanceof Configuration + ? (Configuration) readableConfig + : Configuration.fromMap(readableConfig.toMap()); } } diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java index 9f445766083e..6507a575c2af 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java @@ -20,6 +20,7 @@ import java.util.Set; import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; import org.apache.flink.table.data.RowData; import org.apache.iceberg.DistributionMode; import org.apache.iceberg.PartitionSpec; @@ -34,20 +35,43 @@ public class DynamicRecord { private Schema schema; private RowData rowData; private PartitionSpec partitionSpec; - private DistributionMode distributionMode; + @Nullable private DistributionMode distributionMode; private int writeParallelism; private boolean upsertMode; @Nullable private Set equalityFields; + @Internal + DynamicRecord() {} + + /** + * Constructs a new DynamicRecord with forward (no shuffle) writes. 
+ * + * @param tableIdentifier The target table identifier. + * @param branch The target table branch. + * @param schema The target table schema. + * @param rowData The data matching the provided schema. + * @param partitionSpec The target table {@link PartitionSpec}. + */ + public DynamicRecord( + TableIdentifier tableIdentifier, + String branch, + Schema schema, + RowData rowData, + PartitionSpec partitionSpec) { + this(tableIdentifier, branch, schema, rowData, partitionSpec, null, -1); + } + /** - * Constructs a new DynamicRecord. + * Constructs a new DynamicRecord. This record will be shuffled as specified by {@code + * distributionMode}. * * @param tableIdentifier The target table identifier. * @param branch The target table branch. * @param schema The target table schema. * @param rowData The data matching the provided schema. * @param partitionSpec The target table {@link PartitionSpec}. - * @param distributionMode The {@link DistributionMode}. + * @param distributionMode The {@link DistributionMode}. {@code null} indicates forward (no + * shuffle) writes. * @param writeParallelism The number of parallel writers. Can be set to any value {@literal > 0}, * but will always be automatically capped by the maximum write parallelism, which is the * parallelism of the sink. 
Set to Integer.MAX_VALUE for always using the maximum available diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java index 07dfad2780f7..c752b8e9b8d9 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java @@ -18,10 +18,12 @@ */ package org.apache.iceberg.flink.sink.dynamic; +import java.util.Map; import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.functions.OpenContext; import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.configuration.Configuration; import org.apache.flink.streaming.api.functions.ProcessFunction; import org.apache.flink.table.data.RowData; import org.apache.flink.util.Collector; @@ -30,6 +32,7 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.FlinkWriteConf; @Internal class DynamicRecordProcessor extends ProcessFunction @@ -37,8 +40,12 @@ class DynamicRecordProcessor extends ProcessFunction generator; private final CatalogLoader catalogLoader; + private final Map writeProperties; + private final Configuration flinkConfig; private final boolean immediateUpdate; private final boolean dropUnusedColumns; private final int cacheMaximumSize; @@ -51,28 +58,29 @@ class DynamicRecordProcessor extends ProcessFunction updateStream; + private transient OutputTag forwardStream; private transient Collector collector; + private transient DynamicRecordWithConfig dynamicRecordWithConfig; private transient Context context; DynamicRecordProcessor( DynamicRecordGenerator generator, CatalogLoader catalogLoader, - boolean immediateUpdate, - int 
cacheMaximumSize, - long cacheRefreshMs, - int inputSchemasPerTableCacheMaximumSize, TableCreator tableCreator, - boolean caseSensitive, - boolean dropUnusedColumns) { + FlinkDynamicSinkConf sinkConfig, + Map writeProperties, + Configuration flinkConfig) { this.generator = generator; this.catalogLoader = catalogLoader; - this.immediateUpdate = immediateUpdate; - this.cacheMaximumSize = cacheMaximumSize; - this.cacheRefreshMs = cacheRefreshMs; - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; + this.flinkConfig = flinkConfig; + this.writeProperties = writeProperties; + this.immediateUpdate = sinkConfig.immediateTableUpdate(); + this.cacheMaximumSize = sinkConfig.cacheMaxSize(); + this.cacheRefreshMs = sinkConfig.cacheRefreshMs(); + this.inputSchemasPerTableCacheMaximumSize = sinkConfig.inputSchemasPerTableCacheMaxSize(); this.tableCreator = tableCreator; - this.caseSensitive = caseSensitive; - this.dropUnusedColumns = dropUnusedColumns; + this.caseSensitive = sinkConfig.caseSensitive(); + this.dropUnusedColumns = sinkConfig.dropUnusedColumns(); } @Override @@ -90,15 +98,22 @@ public void open(OpenContext openContext) throws Exception { this.hashKeyGenerator = new HashKeyGenerator( cacheMaximumSize, getRuntimeContext().getTaskInfo().getMaxNumberOfParallelSubtasks()); - if (immediateUpdate) { - updater = new TableUpdater(tableCache, catalog, caseSensitive, dropUnusedColumns); - } else { + // Always create updater — needed for forced immediate updates on forward records + this.updater = new TableUpdater(tableCache, catalog, caseSensitive, dropUnusedColumns); + // Always create forward stream tag for forward (distributionMode == null) records + this.forwardStream = + new OutputTag<>( + DYNAMIC_FORWARD_STREAM, + new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize)) {}; + if (!immediateUpdate) { updateStream = new OutputTag<>( DYNAMIC_TABLE_UPDATE_STREAM, new DynamicRecordInternalType(catalogLoader, true, 
cacheMaximumSize)) {}; } + this.dynamicRecordWithConfig = + new DynamicRecordWithConfig(new FlinkWriteConf(writeProperties, flinkConfig)); generator.open(openContext); } @@ -111,7 +126,10 @@ public void processElement(T element, Context ctx, Collector newData = updater.update( data.tableIdentifier(), data.branch(), data.schema(), data.spec(), tableCreator); emit( - collector, data, newData.f0.resolvedTableSchema(), newData.f0.recordConverter(), - newData.f1); + newData.f1, + isForward); } else { + // Shuffled records with immediateUpdate=false go to the update side output int writerKey = hashKeyGenerator.generateKey( data, @@ -159,33 +182,38 @@ public void collect(DynamicRecord data) { } } else { emit( - collector, data, foundSchema.resolvedTableSchema(), foundSchema.recordConverter(), - foundSpec); + foundSpec, + isForward); } } private void emit( - Collector out, DynamicRecord data, Schema schema, DataConverter recordConverter, - PartitionSpec spec) { + PartitionSpec spec, + boolean forward) { RowData rowData = (RowData) recordConverter.convert(data.rowData()); - int writerKey = hashKeyGenerator.generateKey(data, schema, spec, rowData); - String tableName = data.tableIdentifier().toString(); - out.collect( + // writerKey is unused in the forward path. + int writerKey = forward ? 
-1 : hashKeyGenerator.generateKey(data, schema, spec, rowData); + DynamicRecordInternal record = new DynamicRecordInternal( - tableName, + data.tableIdentifier().toString(), data.branch(), schema, rowData, spec, writerKey, data.upsertMode(), - DynamicSinkUtil.getEqualityFieldIds(data.equalityFields(), schema))); + DynamicSinkUtil.getEqualityFieldIds(data.equalityFields(), schema)); + if (forward) { + context.output(forwardStream, record); + } else { + collector.collect(record); + } } @Override diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordWithConfig.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordWithConfig.java new file mode 100644 index 000000000000..32716c3e4ac7 --- /dev/null +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordWithConfig.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.Set; +import org.apache.flink.table.data.RowData; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteConf; + +class DynamicRecordWithConfig extends DynamicRecord { + private final String defaultBranch; + private final Integer defaultWriteParallelism; + + private DynamicRecord wrapped; + + DynamicRecordWithConfig(FlinkWriteConf flinkWriteConf) { + this.defaultBranch = flinkWriteConf.branch(); + this.defaultWriteParallelism = flinkWriteConf.writeParallelism(); + } + + DynamicRecordWithConfig wrap(DynamicRecord newWrapped) { + this.wrapped = newWrapped; + return this; + } + + @Override + public String branch() { + return wrapped.branch() != null ? wrapped.branch() : defaultBranch; + } + + @Override + public DistributionMode distributionMode() { + return wrapped.distributionMode(); + } + + @Override + public int writeParallelism() { + int originalParallelism = wrapped.writeParallelism(); + if (originalParallelism > 0 || defaultWriteParallelism == null) { + return originalParallelism; + } + + return defaultWriteParallelism; + } + + @Override + public TableIdentifier tableIdentifier() { + return wrapped.tableIdentifier(); + } + + @Override + public Schema schema() { + return wrapped.schema(); + } + + @Override + public PartitionSpec spec() { + return wrapped.spec(); + } + + @Override + public RowData rowData() { + return wrapped.rowData(); + } + + @Override + public boolean upsertMode() { + return wrapped.upsertMode(); + } + + @Override + public Set equalityFields() { + return wrapped.equalityFields(); + } +} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java index 
456f20adf59f..93c268ff86ad 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java @@ -48,20 +48,15 @@ class DynamicTableUpdateOperator private transient TableUpdater updater; DynamicTableUpdateOperator( - CatalogLoader catalogLoader, - int cacheMaximumSize, - long cacheRefreshMs, - int inputSchemasPerTableCacheMaximumSize, - TableCreator tableCreator, - boolean caseSensitive, - boolean dropUnusedColumns) { + CatalogLoader catalogLoader, TableCreator tableCreator, FlinkDynamicSinkConf configuration) { this.catalogLoader = catalogLoader; - this.cacheMaximumSize = cacheMaximumSize; - this.cacheRefreshMs = cacheRefreshMs; - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; this.tableCreator = tableCreator; - this.caseSensitive = caseSensitive; - this.dropUnusedColumns = dropUnusedColumns; + + this.cacheMaximumSize = configuration.cacheMaxSize(); + this.cacheRefreshMs = configuration.cacheRefreshMs(); + this.inputSchemasPerTableCacheMaximumSize = configuration.inputSchemasPerTableCacheMaxSize(); + this.caseSensitive = configuration.caseSensitive(); + this.dropUnusedColumns = configuration.dropUnusedColumns(); } @Override diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java new file mode 100644 index 000000000000..75b169c4b533 --- /dev/null +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.Map; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.iceberg.flink.FlinkConfParser; + +/** + * A class for common Dynamic Iceberg sink configs for Flink writes. + * + *

    If a config is set at multiple levels, the following order of precedence is used (top to + * bottom): + * + *

      + *
    1. Write options + *
    2. Flink ReadableConfig + *
    3. Default values + *
    + * + * The most specific value is set in write options and takes precedence over all other configs. If + * no write option is provided, this class checks the flink configuration for any overrides. If no + * applicable value is found in the write options, this class uses the default values. + */ +class FlinkDynamicSinkConf { + + private final FlinkConfParser confParser; + + FlinkDynamicSinkConf(Map writeOptions, ReadableConfig readableConfig) { + this.confParser = new FlinkConfParser(writeOptions, readableConfig); + } + + int cacheMaxSize() { + return confParser + .intConf() + .option(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.key()) + .flinkConfig(FlinkDynamicSinkOptions.CACHE_MAX_SIZE) + .defaultValue(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.defaultValue()) + .parse(); + } + + boolean immediateTableUpdate() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.key()) + .flinkConfig(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE) + .defaultValue(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.defaultValue()) + .parse(); + } + + boolean dropUnusedColumns() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key()) + .flinkConfig(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS) + .defaultValue(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.defaultValue()) + .parse(); + } + + long cacheRefreshMs() { + return confParser + .longConf() + .option(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.key()) + .flinkConfig(FlinkDynamicSinkOptions.CACHE_REFRESH_MS) + .defaultValue(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.defaultValue()) + .parse(); + } + + int inputSchemasPerTableCacheMaxSize() { + return confParser + .intConf() + .option(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.key()) + .flinkConfig(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE) + .defaultValue(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.defaultValue()) + .parse(); + } + + boolean 
caseSensitive() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.CASE_SENSITIVE.key()) + .flinkConfig(FlinkDynamicSinkOptions.CASE_SENSITIVE) + .defaultValue(FlinkDynamicSinkOptions.CASE_SENSITIVE.defaultValue()) + .parse(); + } +} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java new file mode 100644 index 000000000000..7a4f038219d9 --- /dev/null +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import org.apache.flink.annotation.Experimental; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; + +@Experimental +public class FlinkDynamicSinkOptions { + + private FlinkDynamicSinkOptions() {} + + public static final ConfigOption CACHE_MAX_SIZE = + ConfigOptions.key("dynamic-sink.cache-max-size") + .intType() + .defaultValue(100) + .withDescription( + "Maximum size of the caches used in Dynamic Sink for table data and serializers."); + + public static final ConfigOption IMMEDIATE_TABLE_UPDATE = + ConfigOptions.key("dynamic-sink.immediate-table-update") + .booleanType() + .defaultValue(false) + .withDescription( + "Controls whether table schema and partition updates should be applied immediately in Dynamic Sink."); + + public static final ConfigOption DROP_UNUSED_COLUMNS = + ConfigOptions.key("dynamic-sink.drop-unused-columns") + .booleanType() + .defaultValue(false) + .withDescription( + "Allows dropping unused columns during schema evolution in Dynamic Sink."); + + public static final ConfigOption CACHE_REFRESH_MS = + ConfigOptions.key("dynamic-sink.cache-refresh-ms") + .longType() + .defaultValue(1_000L) + .withDescription( + "Cache refresh interval for dynamic table metadata in Dynamic Sink in milliseconds."); + + public static final ConfigOption INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE = + ConfigOptions.key("dynamic-sink.input-schemas-per-table-cache-max-size") + .intType() + .defaultValue(10) + .withDescription( + "Maximum input schema objects to cache per each table in Dynamic Sink for performance."); + + public static final ConfigOption CASE_SENSITIVE = + ConfigOptions.key("dynamic-sink.case-sensitive") + .booleanType() + .defaultValue(true) + .withDescription( + "Controls whether schema field name matching should be case-sensitive in Dynamic Sink."); +} diff --git 
a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java index fca45bf882e0..61a850212bf4 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java @@ -88,7 +88,7 @@ int generateKey( dynamicRecord.schema(), dynamicRecord.spec(), dynamicRecord.equalityFields(), - MoreObjects.firstNonNull(dynamicRecord.distributionMode(), DistributionMode.NONE), + dynamicRecord.distributionMode(), Math.min(dynamicRecord.writeParallelism(), maxWriteParallelism)); KeySelector keySelector = keySelectorCache.computeIfAbsent( @@ -98,8 +98,7 @@ int generateKey( tableIdent, MoreObjects.firstNonNull(tableSchema, dynamicRecord.schema()), MoreObjects.firstNonNull(tableSpec, dynamicRecord.spec()), - MoreObjects.firstNonNull( - dynamicRecord.distributionMode(), DistributionMode.NONE), + dynamicRecord.distributionMode(), MoreObjects.firstNonNull( dynamicRecord.equalityFields(), Collections.emptySet()), Math.min(dynamicRecord.writeParallelism(), maxWriteParallelism))); diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java index 8ef1f1fbb833..d74b8b9d620f 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java @@ -23,8 +23,6 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.annotation.Internal; -import org.apache.flink.formats.avro.RowDataToAvroConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import 
org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; @@ -32,6 +30,8 @@ import org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.formats.avro.RowDataToAvroConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; /** * This is not serializable because Avro {@link Schema} is not actually serializable, even though it diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java index bac7c05bdfef..1c6644238c3d 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java @@ -136,23 +136,26 @@ void validate() { if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) { Preconditions.checkArgument( startSnapshotId != null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + "Invalid starting snapshot id for %s strategy: null", + startingStrategy); Preconditions.checkArgument( startSnapshotTimestamp == null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + "Invalid starting snapshot timestamp for %s strategy: not null", + startingStrategy); } if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) { Preconditions.checkArgument( startSnapshotTimestamp != null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + "Invalid starting snapshot timestamp for %s strategy: null", + startingStrategy); Preconditions.checkArgument( startSnapshotId == null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + 
"Invalid starting snapshot id for %s strategy: not null", + startingStrategy); } Preconditions.checkArgument( - tag == null, - String.format("Cannot scan table using ref %s configured for streaming reader", tag)); + tag == null, "Cannot scan table using ref %s configured for streaming reader", tag); Preconditions.checkArgument( snapshotId == null, "Cannot set snapshot-id option for streaming reader"); Preconditions.checkArgument( diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java index b158b0871a53..cfef780a4daa 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java @@ -21,8 +21,6 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.formats.avro.RowDataToAvroConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; @@ -31,6 +29,8 @@ import org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.formats.avro.RowDataToAvroConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; public class AvroGenericRecordConverter implements RowDataConverter { private final Schema avroSchema; diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java 
b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java index 02ef57d344b1..3af9957875e8 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java @@ -34,7 +34,7 @@ class WatermarkExtractorRecordEmitter implements SerializableRecordEmitter private static final Logger LOG = LoggerFactory.getLogger(WatermarkExtractorRecordEmitter.class); private final SplitWatermarkExtractor timeExtractor; private String lastSplitId = null; - private long watermark; + private long watermark = Long.MIN_VALUE; WatermarkExtractorRecordEmitter(SplitWatermarkExtractor timeExtractor) { this.timeExtractor = timeExtractor; @@ -44,7 +44,10 @@ class WatermarkExtractorRecordEmitter implements SerializableRecordEmitter public void emitRecord( RecordAndPosition element, SourceOutput output, IcebergSourceSplit split) { if (!split.splitId().equals(lastSplitId)) { - long newWatermark = timeExtractor.extractWatermark(split); + long extracted = timeExtractor.extractWatermark(split); + // Subtract 1 because watermark W means all records with eventTime <= W have arrived; + // records in this split have eventTime == extracted, so watermark must be extracted - 1. + long newWatermark = extracted > Long.MIN_VALUE ? extracted - 1 : Long.MIN_VALUE; if (newWatermark < watermark) { LOG.info( "Received a new split with lower watermark. 
Previous watermark = {}, current watermark = {}, previous split = {}, current split = {}", diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java index e2cd411d7069..795c4fa5a766 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java @@ -75,6 +75,11 @@ public static class Primitives implements DataGenerator { OffsetDateTime.of(2022, 1, 10, 0, 0, 0, 0, ZoneOffset.UTC); private static final LocalDateTime JAVA_LOCAL_DATE_TIME_20220110 = LocalDateTime.of(2022, 1, 10, 0, 0, 0); + private static final OffsetDateTime JAVA_OFFSET_DATE_TIME_MAX_NANO = + OffsetDateTime.of(2262, 4, 11, 23, 47, 16, 854_775_807, ZoneOffset.UTC); + private static final LocalDateTime JAVA_LOCAL_DATE_TIME_MAX_NANO = + LocalDateTime.of(2262, 4, 11, 23, 47, 16, 854_775_807); + private static final long ICEBERG_MAX_NANOS_EPOCH = 9223372036854775807L; private static final BigDecimal BIG_DECIMAL_NEGATIVE = new BigDecimal("-1.50"); private static final byte[] FIXED_BYTES = "012345689012345".getBytes(StandardCharsets.UTF_8); @@ -96,7 +101,11 @@ public static class Primitives implements DataGenerator { Types.NestedField.required(12, "uuid_field", Types.UUIDType.get()), Types.NestedField.required(13, "binary_field", Types.BinaryType.get()), Types.NestedField.required(14, "decimal_field", Types.DecimalType.of(9, 2)), - Types.NestedField.required(15, "fixed_field", Types.FixedType.ofLength(16))); + Types.NestedField.required(15, "fixed_field", Types.FixedType.ofLength(16)), + Types.NestedField.required( + 16, "ts_ns_with_zone_field", Types.TimestampNanoType.withZone()), + Types.NestedField.required( + 17, "ts_ns_without_zone_field", Types.TimestampNanoType.withoutZone())); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -171,6 +180,8 @@ public 
GenericRecord generateIcebergGenericRecord() { genericRecord.setField("time_field", JAVA_LOCAL_TIME_HOUR8); genericRecord.setField("ts_with_zone_field", JAVA_OFFSET_DATE_TIME_20220110); genericRecord.setField("ts_without_zone_field", JAVA_LOCAL_DATE_TIME_20220110); + genericRecord.setField("ts_ns_with_zone_field", JAVA_OFFSET_DATE_TIME_MAX_NANO); + genericRecord.setField("ts_ns_without_zone_field", JAVA_LOCAL_DATE_TIME_MAX_NANO); byte[] uuidBytes = new byte[16]; for (int i = 0; i < 16; ++i) { @@ -220,7 +231,11 @@ public GenericRowData generateFlinkRowData() { uuidBytes, binaryBytes, DecimalData.fromBigDecimal(BIG_DECIMAL_NEGATIVE, 9, 2), - FIXED_BYTES); + FIXED_BYTES, + TimestampData.fromEpochMillis( + ICEBERG_MAX_NANOS_EPOCH / 1_000_000, (int) (ICEBERG_MAX_NANOS_EPOCH % 1_000_000)), + TimestampData.fromEpochMillis( + ICEBERG_MAX_NANOS_EPOCH / 1_000_000, (int) (ICEBERG_MAX_NANOS_EPOCH % 1_000_000))); } @Override @@ -236,10 +251,12 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { genericRecord.put("date_field", DAYS_BTW_EPOC_AND_20220110); genericRecord.put("time_field", HOUR_8_IN_MILLI); - // Although Avro logical type for timestamp fields are in micro seconds, - // AvroToRowDataConverters only looks for long value in milliseconds. - genericRecord.put("ts_with_zone_field", JODA_DATETIME_20220110.getMillis()); - genericRecord.put("ts_without_zone_field", JODA_DATETIME_20220110.getMillis()); + // Now that AvroToRowDataConverters correctly supports microseconds, + // we must inject correct microsecond scale values into the Avro data. 
+ genericRecord.put("ts_with_zone_field", JODA_DATETIME_20220110.getMillis() * 1000L); + genericRecord.put("ts_without_zone_field", JODA_DATETIME_20220110.getMillis() * 1000L); + genericRecord.put("ts_ns_with_zone_field", ICEBERG_MAX_NANOS_EPOCH); + genericRecord.put("ts_ns_without_zone_field", ICEBERG_MAX_NANOS_EPOCH); byte[] uuidBytes = new byte[16]; for (int i = 0; i < 16; ++i) { @@ -554,7 +571,11 @@ public static class ArrayOfPrimitive implements DataGenerator { new Schema( Types.NestedField.required(1, "row_id", Types.StringType.get()), Types.NestedField.required( - 2, "array_of_int", Types.ListType.ofOptional(101, Types.IntegerType.get()))); + 2, "array_of_int", Types.ListType.ofOptional(101, Types.IntegerType.get())), + Types.NestedField.optional( + 3, + "array_of_ts_ns", + Types.ListType.ofRequired(102, Types.TimestampNanoType.withoutZone()))); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -581,13 +602,33 @@ public GenericRecord generateIcebergGenericRecord() { GenericRecord genericRecord = GenericRecord.create(icebergSchema); genericRecord.setField("row_id", "row_id_value"); genericRecord.setField("array_of_int", Arrays.asList(1, 2, 3)); + + LocalDateTime posNanos = LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789); + LocalDateTime negNanos = LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321); + genericRecord.setField("array_of_ts_ns", Arrays.asList(posNanos, negNanos)); return genericRecord; } @Override public GenericRowData generateFlinkRowData() { Integer[] arr = {1, 2, 3}; - return GenericRowData.of(StringData.fromString("row_id_value"), new GenericArrayData(arr)); + + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + TimestampData[] tsArr = { + TimestampData.fromEpochMillis( + Math.floorDiv(posNanos, 
1_000_000L), (int) Math.floorMod(posNanos, 1_000_000L)), + TimestampData.fromEpochMillis( + Math.floorDiv(negNanos, 1_000_000L), (int) Math.floorMod(negNanos, 1_000_000L)) + }; + return GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericArrayData(arr), + new GenericArrayData(tsArr)); } @Override @@ -595,6 +636,14 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); genericRecord.put("row_id", "row_id_value"); genericRecord.put("array_of_int", Arrays.asList(1, 2, 3)); + + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + genericRecord.put("array_of_ts_ns", Arrays.asList(posNanos, negNanos)); return genericRecord; } } @@ -808,7 +857,12 @@ public static class MapOfPrimitives implements DataGenerator { 2, "map_of_primitives", Types.MapType.ofRequired( - 101, 102, Types.StringType.get(), Types.IntegerType.get()))); + 101, 102, Types.StringType.get(), Types.IntegerType.get())), + Types.NestedField.optional( + 3, + "map_of_ts_ns", + Types.MapType.ofRequired( + 103, 104, Types.StringType.get(), Types.TimestampNanoType.withoutZone()))); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -835,15 +889,37 @@ public GenericRecord generateIcebergGenericRecord() { GenericRecord genericRecord = GenericRecord.create(icebergSchema); genericRecord.setField("row_id", "row_id_value"); genericRecord.setField("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); + + LocalDateTime posNanos = LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789); + LocalDateTime negNanos = LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321); + genericRecord.setField( + "map_of_ts_ns", ImmutableMap.of("positive", posNanos, 
"negative", negNanos)); return genericRecord; } @Override public GenericRowData generateFlinkRowData() { + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + return GenericRowData.of( StringData.fromString("row_id_value"), new GenericMapData( - ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2))); + ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2)), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("positive"), + TimestampData.fromEpochMillis( + Math.floorDiv(posNanos, 1_000_000L), + (int) Math.floorMod(posNanos, 1_000_000L)), + StringData.fromString("negative"), + TimestampData.fromEpochMillis( + Math.floorDiv(negNanos, 1_000_000L), + (int) Math.floorMod(negNanos, 1_000_000L))))); } @Override @@ -851,6 +927,15 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); genericRecord.put("row_id", "row_id_value"); genericRecord.put("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); + + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + genericRecord.put( + "map_of_ts_ns", ImmutableMap.of("positive", posNanos, "negative", negNanos)); return genericRecord; } } diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java index abcc2d1da199..0e7635a33e87 100644 --- 
a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java @@ -30,7 +30,6 @@ import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.data.RandomRowData; import org.apache.iceberg.util.StructLikeWrapper; -import org.junit.jupiter.api.Disabled; public class TestRowDataWrapper extends RecordWrapperTestBase { @@ -92,16 +91,4 @@ protected void generateAndValidate( assertThat(actual).isExhausted(); assertThat(expected).isExhausted(); } - - @Disabled - @Override - public void testTimestampNanoWithoutZone() { - // Flink does not support nanosecond timestamp without zone. - } - - @Disabled - @Override - public void testTimestampNanoWithZone() { - // Flink does not support nanosecond timestamp with zone. - } } diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java index 8c99fdf52110..1f0fe70ac53b 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java @@ -19,13 +19,17 @@ package org.apache.iceberg.flink.data; import java.util.List; +import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; +import org.apache.iceberg.PartitionData; import org.apache.iceberg.Schema; import org.apache.iceberg.data.BaseFormatModelTests; import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.RowDataConverter; import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; public class TestFlinkFormatModel extends BaseFormatModelTests { @@ -48,4 +52,26 @@ protected RowData convertToEngine(Record record, Schema schema) { protected void 
assertEquals(Schema schema, List expected, List actual) { TestHelpers.assertRows(actual, expected, FlinkSchemaUtil.convert(schema)); } + + @Override + protected Object convertConstantToEngine(Type type, Object value) { + if (value instanceof PartitionData partitionData) { + Types.StructType structType = type.asStructType(); + List fields = structType.fields(); + GenericRowData rowData = new GenericRowData(fields.size()); + int sourceSize = partitionData.size(); + for (int i = 0; i < fields.size(); i++) { + if (i < sourceSize) { + Object fieldValue = partitionData.get(i, Object.class); + rowData.setField(i, convertConstantToEngine(fields.get(i).type(), fieldValue)); + } else { + rowData.setField(i, null); + } + } + + return rowData; + } + + return RowDataUtil.convertConstant(type, value); + } } diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java index 4a70802f2a2e..b7b0a54156cc 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java @@ -49,6 +49,11 @@ protected boolean allowsWritingNullValuesForRequiredFields() { return true; } + @Override + protected boolean supportsTimestampNanos() { + return true; + } + @Override protected void writeAndValidate(Schema schema) throws IOException { List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1990L); diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java index 4e5b38ffb026..a2411da1e344 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java @@ -271,18 
+271,19 @@ public void testMapOfPrimitivesProjection() { GenericRowData.of( StringData.fromString("row_id_value"), new GenericMapData( - ImmutableMap.of(StringData.fromString("foo"), 1, StringData.fromString("bar"), 2))); + ImmutableMap.of(StringData.fromString("foo"), 1, StringData.fromString("bar"), 2)), + null); testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData, true); testEqualsAndHashCode(schema, mapOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); GenericRowData rowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("row_id_value"), null); + GenericRowData.of(StringData.fromString("row_id_value"), null, null); GenericRowData copyRowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("row_id_value"), null); + GenericRowData.of(StringData.fromString("row_id_value"), null, null); // modify the map field value GenericRowData otherRowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("other_row_id_value"), null); + GenericRowData.of(StringData.fromString("other_row_id_value"), null, null); testEqualsAndHashCode( schema, idOnly, @@ -432,7 +433,8 @@ public void testArrayOfPrimitiveProjection() { GenericRowData otherRowData = GenericRowData.of( StringData.fromString("other_row_id_value"), - new GenericArrayData(new Integer[] {4, 5, 6})); + new GenericArrayData(new Integer[] {4, 5, 6}), + null); testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, arrayOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); @@ -440,16 +442,19 @@ public void testArrayOfPrimitiveProjection() { GenericRowData rowDataNullOptionalFields = GenericRowData.of( StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); + new GenericArrayData(new Integer[] {1, null, 3}), + null); GenericRowData 
copyRowDataNullOptionalFields = GenericRowData.of( StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); + new GenericArrayData(new Integer[] {1, null, 3}), + null); // modify the map field value GenericRowData otherRowDataNullOptionalFields = GenericRowData.of( StringData.fromString("other_row_id_value"), - new GenericArrayData(new Integer[] {4, null, 6})); + new GenericArrayData(new Integer[] {4, null, 6}), + null); testEqualsAndHashCode( schema, idOnly, diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java index 3cb18ffbb77e..4d35792e440e 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java @@ -19,11 +19,18 @@ package org.apache.iceberg.flink.maintenance.api; import static org.apache.iceberg.flink.maintenance.api.JdbcLockFactory.INIT_LOCK_TABLES_PROPERTY; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.sql.SQLTransientConnectionException; import java.util.Map; import java.util.UUID; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.iceberg.jdbc.JdbcCatalog; +import org.apache.iceberg.jdbc.JdbcClientPool; +import org.apache.iceberg.jdbc.UncheckedSQLException; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.Test; class TestJdbcLockFactory extends TestLockFactoryBase { @Override @@ -38,4 +45,62 @@ TriggerLockFactory lockFactory(String tableName) { tableName, properties); } + + @Test + void testSQLExceptionEnablesRetryInClientPool() throws Exception { + // Regression test for #15759: verify that removing the inner try-catch allows + // ClientPoolImpl to retry on 
transient connection failures. + // + // Before the fix: inner catch converted SQLException -> UncheckedSQLException + // (RuntimeException) inside the lambda. ClientPoolImpl only catches the declared + // exception type (SQLException), so RuntimeException bypasses retry entirely. + // After the fix: SQLException propagates naturally, ClientPoolImpl catches it, + // and retries on transient connection exceptions. + Map props = Maps.newHashMap(); + props.put("username", "user"); + props.put("password", "password"); + String uri = "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", ""); + + try (JdbcClientPool pool = new JdbcClientPool(1, uri, props)) { + AtomicInteger attempts = new AtomicInteger(0); + + String result = + pool.run( + conn -> { + if (attempts.incrementAndGet() == 1) { + throw new SQLTransientConnectionException("transient failure"); + } + + return "success"; + }); + + assertThat(result).isEqualTo("success"); + assertThat(attempts.get()).isGreaterThan(1); + } + } + + @Test + void testUncheckedSQLExceptionBypassesRetry() throws Exception { + // Companion test: demonstrates that wrapping SQLException as UncheckedSQLException + // (the OLD behavior before the fix) prevents ClientPoolImpl from retrying. 
+ Map props = Maps.newHashMap(); + props.put("username", "user"); + props.put("password", "password"); + String uri = "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", ""); + + try (JdbcClientPool pool = new JdbcClientPool(1, uri, props)) { + assertThatThrownBy( + () -> + pool.run( + conn -> { + try { + throw new SQLTransientConnectionException("transient failure"); + } catch (java.sql.SQLException e) { + throw new UncheckedSQLException(e, "wrapped"); + } + })) + .isInstanceOf(UncheckedSQLException.class) + .hasMessageContaining("wrapped"); + } + } } diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java index bb53b5265655..88b949a9a7f8 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java @@ -29,9 +29,11 @@ import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.REMOVED_DATA_FILE_SIZE_METRIC; import static org.assertj.core.api.Assertions.assertThat; +import java.time.Instant; import java.util.List; import java.util.stream.StreamSupport; import org.apache.flink.streaming.api.graph.StreamGraphGenerator; +import org.apache.iceberg.FileFormat; import org.apache.iceberg.ManifestFiles; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; @@ -43,8 +45,14 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.FieldSource; class TestRewriteDataFiles extends MaintenanceTaskTestBase { + + private static final FileFormat[] FILE_FORMATS = + new FileFormat[] 
{FileFormat.AVRO, FileFormat.PARQUET, FileFormat.ORC}; + @Test void testRewriteUnpartitioned() throws Exception { Table table = createTable(); @@ -82,13 +90,14 @@ void testRewriteUnpartitioned() throws Exception { createRecord(4, "d"))); } - @Test - void testRewriteUnpartitionedPreserveLineage() throws Exception { - Table table = createTable(3); - insert(table, 1, "a"); - insert(table, 2, "b"); - insert(table, 3, "c"); - insert(table, 4, "d"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewriteUnpartitionedPreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createTable(3, fileFormat); + insert(table, 1, "a", fileFormat); + insert(table, 2, "b", fileFormat); + insert(table, 3, "c", fileFormat); + insert(table, 4, "d", fileFormat); assertFileNum(table, 4, 0); @@ -122,15 +131,17 @@ void testRewriteUnpartitionedPreserveLineage() throws Exception { schema); } - @Test - void testRewriteTheSameFilePreserveLineage() throws Exception { - Table table = createTable(3); - insert(table, 1, "a"); - insert(table, 2, "b"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewriteTheSameFilePreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createTable(3, fileFormat); + insert(table, 1, "a", fileFormat); + insert(table, 2, "b", fileFormat); // Create a file with two lines of data to verify that the rowid is read correctly. 
insert( table, - ImmutableList.of(SimpleDataUtil.createRecord(3, "c"), SimpleDataUtil.createRecord(4, "d"))); + ImmutableList.of(SimpleDataUtil.createRecord(3, "c"), SimpleDataUtil.createRecord(4, "d")), + fileFormat); assertFileNum(table, 3, 0); @@ -166,13 +177,14 @@ void testRewriteTheSameFilePreserveLineage() throws Exception { schema); } - @Test - void testRewritePartitionedPreserveLineage() throws Exception { - Table table = createPartitionedTable(3); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - insertPartitioned(table, 3, "p2"); - insertPartitioned(table, 4, "p2"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewritePartitionedPreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createPartitionedTable(3, fileFormat); + insertPartitioned(table, 1, "p1", fileFormat); + insertPartitioned(table, 2, "p1", fileFormat); + insertPartitioned(table, 3, "p2", fileFormat); + insertPartitioned(table, 4, "p2", fileFormat); assertFileNum(table, 4, 0); @@ -529,6 +541,57 @@ void testRewriteWithFilter() throws Exception { createRecord(4, "d"))); } + /** + * By verifying that the creation time of the data content in the builder is later than the + * creation time of the filter condition — if the filter condition is actually created in the + * planner, then all files can be compacted; otherwise, not all files can be compacted — we can + * confirm whether the filter condition is actually created in the planner. 
+ */ + @Test + void testRewriteWithFilterSupplier() throws Exception { + Table table = createTable(); + + appendRewriteDataFiles( + RewriteDataFiles.builder() + .parallelism(2) + .deleteFileThreshold(10) + .targetFileSizeBytes(1_000_000L) + .maxFileGroupSizeBytes(10_000_000L) + .maxFileSizeBytes(2_000_000L) + .minFileSizeBytes(500_000L) + .minInputFiles(2) + // Rewrite data files where id is less than current timestamp in planner + .filter(() -> Expressions.lessThan("id", (int) Instant.now().getEpochSecond())) + .partialProgressEnabled(true) + .partialProgressMaxCommits(1) + .maxRewriteBytes(100_000L) + .rewriteAll(false)); + + insert(table, 1, "a"); + insert(table, 2, "b"); + insert(table, 3, "c"); + + int epochSecond = (int) Instant.now().getEpochSecond(); + insert(table, epochSecond, "d"); + + assertFileNum(table, 4, 0); + + Thread.sleep(1_000L); + runAndWaitForSuccess(infra.env(), infra.source(), infra.sink()); + + // The filter is built in the planner after the sleep, so all four ids match it and all four + // files are rewritten into a single file.
+ assertFileNum(table, 1, 0); + + SimpleDataUtil.assertTableRecords( + table, + ImmutableList.of( + createRecord(1, "a"), + createRecord(2, "b"), + createRecord(3, "c"), + createRecord(epochSecond, "d"))); + } + @Test void testBranch() throws Exception { Table table = createTable(); diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java index b9422a63d646..d6563e782e43 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java @@ -24,7 +24,10 @@ import java.io.File; import java.io.IOException; import java.nio.file.Path; +import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.util.List; +import java.util.concurrent.TimeUnit; import javax.annotation.Nullable; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.MetricOptions; @@ -79,6 +82,12 @@ public class OperatorTestBase { ImmutableMap.of(), ImmutableSet.of(SimpleDataUtil.SCHEMA.columns().get(0).fieldId())); + private static final Schema SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "ts", Types.TimestampType.withoutZone())); + protected static final String UID_SUFFIX = "UID-Dummy"; protected static final String SLOT_SHARING_GROUP = "SlotSharingGroup"; protected static final TriggerLockFactory LOCK_FACTORY = new MemoryLockFactory(); @@ -124,10 +133,14 @@ void after() throws IOException { } protected static Table createTable() { - return createTable(2); + return createTable(2, FileFormat.PARQUET); } protected static Table createTable(int formatVersion) { + return 
createTable(formatVersion, FileFormat.PARQUET); + } + + protected static Table createTable(int formatVersion, FileFormat fileFormat) { return CATALOG_EXTENSION .catalog() .createTable( @@ -136,12 +149,29 @@ protected static Table createTable(int formatVersion) { PartitionSpec.unpartitioned(), null, ImmutableMap.of( + "write.format.default", + fileFormat.name(), TableProperties.FORMAT_VERSION, String.valueOf(formatVersion), "flink.max-continuous-empty-commits", "100000")); } + protected static Table createTableWithTimestampWithoutZone() { + return CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE, + PartitionSpec.builderFor(SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE).identity("ts").build(), + null, + ImmutableMap.of( + TableProperties.FORMAT_VERSION, + "2", + "flink.max-continuous-empty-commits", + "100000")); + } + protected static Table createTableWithDelete() { return createTableWithDelete(2); } @@ -158,7 +188,7 @@ protected static Table createTableWithDelete(int formatVersion) { "format-version", String.valueOf(formatVersion), "write.upsert.enabled", "true")); } - protected static Table createPartitionedTable(int formatVersion) { + protected static Table createPartitionedTable(int formatVersion, FileFormat fileFormat) { return CATALOG_EXTENSION .catalog() .createTable( @@ -167,6 +197,8 @@ protected static Table createPartitionedTable(int formatVersion) { PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build(), null, ImmutableMap.of( + "write.format.default", + fileFormat.name(), "format-version", String.valueOf(formatVersion), "flink.max-continuous-empty-commits", @@ -174,17 +206,27 @@ protected static Table createPartitionedTable(int formatVersion) { } protected static Table createPartitionedTable() { - return createPartitionedTable(2); + return createPartitionedTable(2, FileFormat.PARQUET); } protected void insert(Table table, Integer id, String data) throws IOException { -
new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + insert(table, id, data, FileFormat.PARQUET); + } + + protected void insert(Table table, Integer id, String data, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir) .appendToTable(Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); table.refresh(); } protected void insert(Table table, List records) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir).appendToTable(records); + insert(table, records, FileFormat.PARQUET); + } + + protected void insert(Table table, List records, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir).appendToTable(records); table.refresh(); } @@ -194,6 +236,20 @@ protected void insert(Table table, Integer id, String data, String extra) throws table.refresh(); } + protected void insertWithTimestampWithoutZone( + Table table, Integer id, String data, LocalDateTime ts) throws IOException { + GenericRecord record = GenericRecord.create(SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE); + record.setField("id", id); + record.setField("data", data); + record.setField("ts", ts); + long tsMicros = + TimeUnit.SECONDS.toMicros(ts.toEpochSecond(ZoneOffset.UTC)) + + TimeUnit.NANOSECONDS.toMicros(ts.getNano()); + new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + .appendToTable(TestHelpers.Row.of(tsMicros), Lists.newArrayList(record)); + table.refresh(); + } + /** * For the same identifier column id this methods simulate the following row operations: *
  • add an equality delete on oldData @@ -271,7 +327,12 @@ protected void update(Table table, Integer id, String oldData, String tempData, } protected void insertPartitioned(Table table, Integer id, String data) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + insertPartitioned(table, id, data, FileFormat.PARQUET); + } + + protected void insertPartitioned(Table table, Integer id, String data, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir) .appendToTable( TestHelpers.Row.of(data), Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); table.refresh(); diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java index 8a8a2fa194d4..7b8f638b7e2f 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java @@ -57,7 +57,7 @@ static List planDataFileRewrite( 11, 10_000_000L, rewriterOptions, - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java index 16d524f05cf7..8300df8c94eb 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java @@ -24,6 +24,9 @@ import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.planDataFileRewrite; import static org.assertj.core.api.Assertions.assertThat; +import java.time.Duration; 
+import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -107,7 +110,7 @@ void testError() throws Exception { 11, 1L, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); @@ -174,7 +177,7 @@ void testMaxRewriteBytes() throws Exception { 11, maxRewriteBytes, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); @@ -228,7 +231,7 @@ void testBranch() throws Exception { 11, 10_000_000L, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, branchName))) { testHarness.open(); @@ -243,6 +246,46 @@ void testBranch() throws Exception { } } + @Test + void testFilterSupplierWithTimestamp() throws Exception { + Table table = createTableWithTimestampWithoutZone(); + + LocalDateTime oldTs = LocalDateTime.now().minusDays(10); + insertWithTimestampWithoutZone(table, 1, "old_a", oldTs); + insertWithTimestampWithoutZone(table, 2, "old_b", oldTs); + + LocalDateTime recentTs = LocalDateTime.now().minusHours(1); + insertWithTimestampWithoutZone(table, 3, "new_a", recentTs); + insertWithTimestampWithoutZone(table, 4, "new_b", recentTs); + + try (OneInputStreamOperatorTestHarness + testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new DataFileRewritePlanner( + OperatorTestBase.DUMMY_TABLE_NAME, + OperatorTestBase.DUMMY_TABLE_NAME, + 0, + tableLoader(), + 11, + 10_000_000L, + ImmutableMap.of(MIN_INPUT_FILES, "2"), + () -> + Expressions.greaterThanOrEqual( + "ts", + LocalDateTime.now(ZoneOffset.UTC).minus(Duration.ofDays(3)).toString()), + SnapshotRef.MAIN_BRANCH))) { + testHarness.open(); + + trigger(testHarness); + + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + List planned = testHarness.extractOutputValues(); + + 
assertThat(planned).hasSize(1); + assertThat(planned.get(0).group().fileScanTasks()).hasSize(2); + } + } + void assertRewriteFileGroup( DataFileRewritePlanner.PlannedGroup plannedGroup, Table table, Set files) { assertThat(plannedGroup.table().currentSnapshot().snapshotId()) diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java index 9202a1df92af..62b29e7c017a 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java @@ -309,7 +309,7 @@ void testSplitSize() throws Exception { "2", TARGET_FILE_SIZE_BYTES, String.valueOf(targetFileSize)), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java new file mode 100644 index 000000000000..42bbfc0d3628 --- /dev/null +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatNoException; + +import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.iceberg.io.WriteResult; +import org.junit.jupiter.api.Test; + +public class TestIcebergStreamWriterMetrics { + + @Test + void histogramsCreatedWhenDropwizardAvailable() { + IcebergStreamWriterMetrics metrics = + new IcebergStreamWriterMetrics( + UnregisteredMetricsGroup.createSinkWriterMetricGroup(), "db.table"); + + assertThat(metrics.dataFilesSizeHistogram()).isNotNull(); + assertThat(metrics.deleteFilesSizeHistogram()).isNotNull(); + + assertThatNoException() + .isThrownBy(() -> metrics.updateFlushResult(WriteResult.builder().build())); + } +} diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java index 27b1e3d84a8c..89befb9e8ea2 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java @@ -43,7 +43,10 @@ import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.RestartStrategyOptions; import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.flink.runtime.OperatorIDPair; import 
org.apache.flink.runtime.client.JobExecutionException; +import org.apache.flink.runtime.jobgraph.JobVertex; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.DataStreamSource; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; @@ -75,6 +78,7 @@ import org.apache.iceberg.flink.CatalogLoader; import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.TestHelpers; @@ -83,6 +87,7 @@ import org.apache.iceberg.flink.sink.dynamic.TestDynamicCommitter.FailBeforeAndAfterCommit; import org.apache.iceberg.inmemory.InMemoryInputFile; import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; @@ -119,6 +124,7 @@ private static class DynamicIcebergDataImpl implements Serializable { PartitionSpec partitionSpec; boolean upsertMode; Set equalityFields; + int writeParallelism; private DynamicIcebergDataImpl( Schema schemaProvided, String tableName, String branch, PartitionSpec partitionSpec) { @@ -130,7 +136,8 @@ private DynamicIcebergDataImpl( partitionSpec, false, Collections.emptySet(), - false); + false, + 10); } private DynamicIcebergDataImpl( @@ -147,7 +154,8 @@ private DynamicIcebergDataImpl( partitionSpec, false, Collections.emptySet(), - false); + false, + 10); } private DynamicIcebergDataImpl( @@ -166,7 +174,8 @@ private DynamicIcebergDataImpl( partitionSpec, upsertMode, equalityFields, - isDuplicate); + 
isDuplicate, + 10); } private DynamicIcebergDataImpl( @@ -177,7 +186,8 @@ private DynamicIcebergDataImpl( PartitionSpec partitionSpec, boolean upsertMode, Set equalityFields, - boolean isDuplicate) { + boolean isDuplicate, + int writeParallelism) { this.rowProvided = randomRow(schemaProvided, isDuplicate ? seed : ++seed); this.rowExpected = isDuplicate ? null : rowProvided; this.schemaProvided = schemaProvided; @@ -187,6 +197,7 @@ private DynamicIcebergDataImpl( this.partitionSpec = partitionSpec; this.upsertMode = upsertMode; this.equalityFields = equalityFields; + this.writeParallelism = writeParallelism; } } @@ -206,6 +217,56 @@ public void generate(DynamicIcebergDataImpl row, Collector out) { converter(schema).toInternal(row.rowProvided), spec, spec.isPartitioned() ? DistributionMode.HASH : DistributionMode.NONE, + row.writeParallelism); + dynamicRecord.setUpsertMode(row.upsertMode); + dynamicRecord.setEqualityFields(row.equalityFields); + out.collect(dynamicRecord); + } + } + + /** Generator that always emits forward (null distributionMode) records. */ + private static class ForwardGenerator implements DynamicRecordGenerator { + @Override + public void generate(DynamicIcebergDataImpl row, Collector out) { + TableIdentifier tableIdentifier = TableIdentifier.of(DATABASE, row.tableName); + Schema schema = row.schemaProvided; + PartitionSpec spec = row.partitionSpec; + DynamicRecord dynamicRecord = + new DynamicRecord( + tableIdentifier, + row.branch, + schema, + converter(schema).toInternal(row.rowProvided), + spec); + dynamicRecord.setUpsertMode(row.upsertMode); + dynamicRecord.setEqualityFields(row.equalityFields); + out.collect(dynamicRecord); + } + } + + /** + * Generator that alternates between forward (null distributionMode) and shuffle records. Even + * indices go forward, odd indices go through shuffle. 
+ */ + private static class MixedGenerator implements DynamicRecordGenerator { + private int count = 0; + + @Override + public void generate(DynamicIcebergDataImpl row, Collector out) { + TableIdentifier tableIdentifier = TableIdentifier.of(DATABASE, row.tableName); + Schema schema = row.schemaProvided; + PartitionSpec spec = row.partitionSpec; + boolean forward = (count++ % 2 == 0); + DistributionMode mode = + forward ? null : (spec.isPartitioned() ? DistributionMode.HASH : DistributionMode.NONE); + DynamicRecord dynamicRecord = + new DynamicRecord( + tableIdentifier, + row.branch, + schema, + converter(schema).toInternal(row.rowProvided), + spec, + mode, 10); dynamicRecord.setUpsertMode(row.upsertMode); dynamicRecord.setEqualityFields(row.equalityFields); @@ -238,6 +299,109 @@ void testWrite() throws Exception { runTest(rows); } + @Test + void testNoShuffleTopology() throws Exception { + DataStream dataStream = + env.fromData( + Collections.emptyList(), TypeInformation.of(new TypeHint() {})); + DynamicIcebergSink.forInput(dataStream) + .generator(new ForwardGenerator()) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .writeParallelism(2) + .immediateTableUpdate(false) + .overwrite(false) + .append(); + + boolean generatorAndSinkChained = false; + for (JobVertex vertex : env.getStreamGraph().getJobGraph().getVertices()) { + boolean generatorInThisVertex = false; + boolean sinkInThisVertex = false; + for (OperatorIDPair operatorID : vertex.getOperatorIDs()) { + String uid = operatorID.getUserDefinedOperatorUid(); + if (uid == null) { + continue; + } + + if (uid.endsWith("-forward-writer")) { + sinkInThisVertex = true; + } else if (uid.endsWith("-generator")) { + generatorInThisVertex = true; + } + } + + generatorAndSinkChained = generatorInThisVertex && sinkInThisVertex; + if (generatorAndSinkChained) { + break; + } + } + + assertThat(generatorAndSinkChained).isTrue(); + } + + @Test + void testForwardWrite() throws Exception { + runForwardWriteTest(new 
ForwardGenerator()); + } + + @Test + void testMixedForwardAndShuffleWrite() throws Exception { + runForwardWriteTest(new MixedGenerator()); + } + + private void runForwardWriteTest(DynamicRecordGenerator generator) + throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned())); + + DataStream dataStream = + env.fromData(rows, TypeInformation.of(new TypeHint<>() {})); + env.setParallelism(1); + + DynamicIcebergSink.forInput(dataStream) + .generator(generator) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .writeParallelism(1) + .immediateTableUpdate(true) + .append(); + + env.execute(); + + verifyResults(rows); + } + + @Test + void testWriteWithNullBranch() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", null, PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", null, PartitionSpec.unpartitioned())); + + runTest( + rows, this.env, false, 1, ImmutableMap.of(FlinkWriteOptions.BRANCH.key(), "test-branch")); + } + @Test void testWritePartitioned() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 10).build(); @@ -1170,8 +1334,9 @@ void testOperatorUidsFormat() { // pre commit topology was off, but since it is stateless, users will still be able to restore // state, but we must keep the stateful operators UUIds like the committer consistent. 
assertThat(sinkUids) - .contains( + .containsOnly( "test--sink", + "test--forward-writer", "test--generator", "test--updater", "test--sink: test--pre-commit-topology", @@ -1179,8 +1344,9 @@ void testOperatorUidsFormat() { sinkUids = createSinkAndReturnUIds(""); assertThat(sinkUids) - .contains( + .containsOnly( "--sink", + "--forward-writer", "--generator", "--updater", "--sink: --pre-commit-topology", @@ -1188,14 +1354,71 @@ void testOperatorUidsFormat() { sinkUids = createSinkAndReturnUIds(null); assertThat(sinkUids) - .contains( + .containsOnly( "--sink", + "--forward-writer", "--generator", "--updater", "--sink: --pre-commit-topology", "Sink Committer: --sink"); } + @Test + void testGeneratorDefaultParallelism() { + StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); + streamEnv.setParallelism(4); + + DataStreamSource source = + streamEnv.fromData(Collections.emptySet(), TypeInformation.of(new TypeHint<>() {})); + source.setParallelism(8); + + DynamicIcebergSink.forInput(source) + .generator(new Generator()) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .uidPrefix("test") + .append(); + + // Since the generator parallelism is not directly accessible via the returned DataStreamSink, + // inspect the stream graph to verify the generator inherits the input source parallelism. 
+ int generatorParallelism = + streamEnv.getStreamGraph().getStreamNodes().stream() + .filter(node -> "test--generator".equals(node.getTransformationUID())) + .findFirst() + .map(StreamNode::getParallelism) + .orElseThrow(() -> new AssertionError("Generator node not found")); + + assertThat(generatorParallelism).isEqualTo(source.getParallelism()); + } + + @Test + void testFallBackParallelismFromConfig() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned(), + false, + Collections.emptySet(), + false, + -1), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned(), + false, + Collections.emptySet(), + false, + 0)); + + runTest( + rows, this.env, true, 2, ImmutableMap.of(FlinkWriteOptions.WRITE_PARALLELISM.key(), "1")); + } + private Set createSinkAndReturnUIds(String uidPrefix) { StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); @@ -1304,6 +1527,18 @@ private void runTest( verifyResults(dynamicData); } + private void runTest( + List dynamicData, + StreamExecutionEnvironment env, + boolean immediateUpdate, + int parallelism, + Map writeProperties) + throws Exception { + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, null, false, writeProperties); + verifyResults(dynamicData, writeProperties); + } + private void executeDynamicSink( List dynamicData, StreamExecutionEnvironment env, @@ -1311,7 +1546,8 @@ private void executeDynamicSink( int parallelism, @Nullable CommitHook commitHook) throws Exception { - executeDynamicSink(dynamicData, env, immediateUpdate, parallelism, commitHook, false); + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, commitHook, false, Maps.newHashMap()); } private void executeDynamicSink( @@ -1322,6 +1558,19 @@ private void 
executeDynamicSink( @Nullable CommitHook commitHook, boolean overwrite) throws Exception { + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, commitHook, overwrite, Maps.newHashMap()); + } + + private void executeDynamicSink( + List dynamicData, + StreamExecutionEnvironment env, + boolean immediateUpdate, + int parallelism, + @Nullable CommitHook commitHook, + boolean overwrite, + Map writeProperties) + throws Exception { DataStream dataStream = env.fromData(dynamicData, TypeInformation.of(new TypeHint<>() {})); env.setParallelism(parallelism); @@ -1335,6 +1584,7 @@ private void executeDynamicSink( .immediateTableUpdate(immediateUpdate) .setSnapshotProperty("commit.retry.num-retries", "0") .overwrite(overwrite) + .setAll(writeProperties) .append(); } else { DynamicIcebergSink.forInput(dataStream) @@ -1343,6 +1593,7 @@ private void executeDynamicSink( .writeParallelism(parallelism) .immediateTableUpdate(immediateUpdate) .overwrite(overwrite) + .setAll(writeProperties) .append(); } @@ -1359,7 +1610,9 @@ static class CommitHookEnabledDynamicIcebergSink extends DynamicIcebergSink.B @Override DynamicIcebergSink instantiateSink( - Map writeProperties, Configuration flinkConfig) { + Map writeProperties, + Configuration flinkConfig, + DataStream> forwardWriteResults) { return new CommitHookDynamicIcebergSink( commitHook, CATALOG_EXTENSION.catalogLoader(), @@ -1367,7 +1620,7 @@ DynamicIcebergSink instantiateSink( "uidPrefix", writeProperties, flinkConfig, - 100); + forwardWriteResults); } } @@ -1383,14 +1636,15 @@ static class CommitHookDynamicIcebergSink extends DynamicIcebergSink { String uidPrefix, Map writeProperties, Configuration flinkConfig, - int cacheMaximumSize) { + DataStream> forwardWritten) { super( catalogLoader, snapshotProperties, uidPrefix, writeProperties, flinkConfig, - cacheMaximumSize); + 100, + forwardWritten); this.commitHook = commitHook; this.overwriteMode = new FlinkWriteConf(writeProperties, flinkConfig).overwriteMode(); } @@ 
-1409,6 +1663,12 @@ public Committer createCommitter(CommitterInitContext contex } private void verifyResults(List dynamicData) throws IOException { + verifyResults(dynamicData, Maps.newHashMap()); + } + + private void verifyResults( + List dynamicData, Map writeProperties) + throws IOException { // Calculate the expected result Map, List> expectedData = Maps.newHashMap(); Map expectedSchema = Maps.newHashMap(); @@ -1422,9 +1682,12 @@ private void verifyResults(List dynamicData) throws IOEx dynamicData.forEach( r -> { + String branch = + MoreObjects.firstNonNull( + r.branch, writeProperties.get(FlinkWriteOptions.BRANCH.key())); List data = expectedData.computeIfAbsent( - Tuple2.of(r.tableName, r.branch), unused -> Lists.newArrayList()); + Tuple2.of(r.tableName, branch), unused -> Lists.newArrayList()); data.addAll( convertToRowData(expectedSchema.get(r.tableName), ImmutableList.of(r.rowExpected))); }); diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java new file mode 100644 index 000000000000..de55621475ed --- /dev/null +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Collections; +import java.util.Set; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +class TestDynamicRecordWithConfig { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + + private static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of("db", "table"); + private static final PartitionSpec UNPARTITIONED = PartitionSpec.unpartitioned(); + private static final RowData ROW_DATA = GenericRowData.of(1, StringData.fromString("test")); + + @Test + void testBranchFallBack() { + String defaultBranch = "default-branch"; + FlinkWriteConf conf = + new FlinkWriteConf( + 
ImmutableMap.of(FlinkWriteOptions.BRANCH.key(), defaultBranch), new Configuration()); + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(conf); + + DynamicRecord dynamicRecord = + new DynamicRecord(TABLE_IDENTIFIER, null, SCHEMA, ROW_DATA, UNPARTITIONED); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).branch()).isEqualTo(defaultBranch); + + String customBranch = "custom-branch"; + dynamicRecord.setBranch(customBranch); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).branch()).isEqualTo(customBranch); + } + + @Test + void testWriteParallelismFallBack() { + int defaultParallelism = 4; + FlinkWriteConf conf = + new FlinkWriteConf( + ImmutableMap.of( + FlinkWriteOptions.WRITE_PARALLELISM.key(), String.valueOf(defaultParallelism)), + new Configuration()); + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(conf); + + DynamicRecord dynamicRecord = + new DynamicRecord(TABLE_IDENTIFIER, null, SCHEMA, ROW_DATA, UNPARTITIONED, null, -1); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()) + .isEqualTo(defaultParallelism); + + dynamicRecord.writeParallelism(0); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()) + .isEqualTo(defaultParallelism); + + dynamicRecord.writeParallelism(8); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()).isEqualTo(8); + } + + @Test + void testDelegatesToWrappedRecord() { + FlinkWriteConf conf = new FlinkWriteConf(Collections.emptyMap(), new Configuration()); + PartitionSpec partitioned = PartitionSpec.builderFor(SCHEMA).identity("id").build(); + Set equalityFields = ImmutableSet.of("id", "data"); + + DynamicRecord dynamicRecord = + new DynamicRecord( + TABLE_IDENTIFIER, + SnapshotRef.MAIN_BRANCH, + SCHEMA, + ROW_DATA, + partitioned, + DistributionMode.HASH, + 2); + dynamicRecord.setUpsertMode(true); + dynamicRecord.setEqualityFields(equalityFields); + + DynamicRecordWithConfig record = new 
DynamicRecordWithConfig(conf).wrap(dynamicRecord); + + assertThat(record.tableIdentifier()).isEqualTo(TABLE_IDENTIFIER); + assertThat(record.schema()).isEqualTo(SCHEMA); + assertThat(record.spec()).isEqualTo(partitioned); + assertThat(record.rowData()).isSameAs(ROW_DATA); + assertThat(record.distributionMode()).isEqualTo(DistributionMode.HASH); + assertThat(record.upsertMode()).isTrue(); + assertThat(record.equalityFields()).isEqualTo(equalityFields); + } +} diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java index 1c8e6df8591d..f6b2b368c2be 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java @@ -23,12 +23,14 @@ import static org.assertj.core.api.Assertions.assertThat; import java.util.Collections; +import org.apache.flink.configuration.Configuration; import org.apache.flink.table.data.GenericRowData; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.RegisterExtension; @@ -57,9 +59,6 @@ class TestDynamicTableUpdateOperator { @Test void testDynamicTableUpdateOperatorNewTable() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); @@ -67,12 +66,8 @@ void testDynamicTableUpdateOperatorNewTable() throws 
Exception { DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open(null); DynamicRecordInternal input = @@ -93,21 +88,14 @@ void testDynamicTableUpdateOperatorNewTable() throws Exception { @Test void testDynamicTableUpdateOperatorSchemaChange() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open(null); catalog.createTable(table, SCHEMA1); @@ -135,9 +123,6 @@ void testDynamicTableUpdateOperatorSchemaChange() throws Exception { @ParameterizedTest @ValueSource(booleans = {true, false}) void testCaseInSensitivity(boolean caseSensitive) throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); @@ -148,12 +133,8 @@ void testCaseInSensitivity(boolean caseSensitive) throws Exception { DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - caseSensitive, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(caseSensitive, PRESERVE_COLUMNS)); operator.open(null); catalog.createTable(table, initialSchema); @@ -187,21 +168,14 @@ void 
testCaseInSensitivity(boolean caseSensitive) throws Exception { @Test void testDynamicTableUpdateOperatorPreserveUnusedColumns() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open(null); catalog.createTable(table, SCHEMA2); @@ -228,21 +202,14 @@ void testDynamicTableUpdateOperatorPreserveUnusedColumns() throws Exception { @Test void testDynamicTableUpdateOperatorDropUnusedColumns() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_INSENSITIVE, - DROP_COLUMNS); + flinkDynamicSinkConfiguration(CASE_INSENSITIVE, DROP_COLUMNS)); operator.open(null); catalog.createTable(table, SCHEMA2); @@ -265,4 +232,13 @@ void testDynamicTableUpdateOperatorDropUnusedColumns() throws Exception { assertThat(tableSchema.findField("data")).isNull(); assertThat(input).isEqualTo(output); } + + private static FlinkDynamicSinkConf flinkDynamicSinkConfiguration( + boolean caseSensitive, boolean dropUnusedColumns) { + return new FlinkDynamicSinkConf( + ImmutableMap.of( + FlinkDynamicSinkOptions.CASE_SENSITIVE.key(), String.valueOf(caseSensitive), + FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key(), String.valueOf(dropUnusedColumns)), + new Configuration()); + } 
} diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java index c65f96b12cbb..9a485fafaf47 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.Set; import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.state.KeyGroupRangeAssignment; import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; @@ -34,6 +35,9 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; @@ -229,6 +233,38 @@ void testFailOnNonPositiveWriteParallelism() { }); } + @Test + void testNonPositiveWriteParallelismConfigFallback() throws Exception { + int maxWriteParallelism = 5; + HashKeyGenerator generator = new HashKeyGenerator(16, maxWriteParallelism); + PartitionSpec unpartitioned = PartitionSpec.unpartitioned(); + FlinkWriteConf flinkWriteConf = + new FlinkWriteConf( + ImmutableMap.of(FlinkWriteOptions.WRITE_PARALLELISM.key(), "2"), new Configuration()); + + Set writeKeys = Sets.newHashSet(); + for (int i = 0; i < 20; i++) { + GenericRowData row = GenericRowData.of(i, StringData.fromString("z")); + writeKeys.add( + getWriteKey( + generator, + unpartitioned, + DistributionMode.NONE, + i % 2 == 0 ? 
0 : -1, + Collections.emptySet(), + row, + flinkWriteConf)); + } + + assertThat(writeKeys).hasSize(2); + assertThat( + writeKeys.stream() + .map(key -> getSubTaskId(key, 2, maxWriteParallelism)) + .distinct() + .count()) + .isEqualTo(2); + } + @Test void testCapAtMaxWriteParallelism() throws Exception { int writeParallelism = 10; @@ -477,10 +513,31 @@ private static int getWriteKey( Set equalityFields, GenericRowData row) throws Exception { - DynamicRecord record = + return getWriteKey( + generator, + spec, + mode, + writeParallelism, + equalityFields, + row, + new FlinkWriteConf(Collections.emptyMap(), new Configuration())); + } + + private static int getWriteKey( + HashKeyGenerator generator, + PartitionSpec spec, + DistributionMode mode, + int writeParallelism, + Set equalityFields, + GenericRowData row, + FlinkWriteConf flinkWriteConf) + throws Exception { + DynamicRecord inputRecord = new DynamicRecord(TABLE_IDENTIFIER, BRANCH, SCHEMA, row, spec, mode, writeParallelism); - record.setEqualityFields(equalityFields); - return generator.generateKey(record); + inputRecord.setEqualityFields(equalityFields); + + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(flinkWriteConf); + return generator.generateKey(dynamicRecordWithConfig.wrap(inputRecord)); } private static int getSubTaskId(int writeKey1, int writeParallelism, int maxWriteParallelism) { diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java index f84cf7fb1aae..ec9333674d03 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java @@ -225,6 +225,68 @@ public void apply( 3))); } + /** + * Integration test verifying that records with 
eventTime equal to the minimum timestamp of their + * split are correctly included in windows. The {@link + * org.apache.iceberg.flink.source.reader.WatermarkExtractorRecordEmitter} emits the watermark as + * {@code minSplitTs - 1}, so records at exactly {@code minSplitTs} are on-time rather than late. + * + *

    The test writes 3 records at epoch (t=0). The split's column-stats lower-bound is 0, so the + * extracted watermark is 0ms and the emitted watermark is -1ms. Records at t=0 are strictly after + * that watermark and therefore belong to the [0, 5min) window. A later split is then appended to + * advance the watermark past the window boundary and trigger its evaluation. + */ + @Test + public void testWindowingWithRecordsAtSplitMinTimestamp() throws Exception { + GenericAppenderHelper dataAppender = appender(); + + // File 1: 3 records at exactly t=0 (epoch). Extracted watermark = 0ms, emitted = -1ms. + List batch = + ImmutableList.of( + generateRecord(0, "file_1-recordTs_0_a"), + generateRecord(0, "file_1-recordTs_0_b"), + generateRecord(0, "file_1-recordTs_0_c")); + dataAppender.appendToTable(batch); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(1); + + DataStream stream = + env.fromSource( + source(), + WatermarkStrategy.noWatermarks() + .withTimestampAssigner(new RowDataTimestampAssigner()), + SOURCE_NAME, + TypeInformation.of(RowData.class)); + + stream + .windowAll(TumblingEventTimeWindows.of(Duration.ofMinutes(5))) + .apply( + new AllWindowFunction() { + @Override + public void apply( + TimeWindow window, Iterable values, Collector out) { + AtomicInteger count = new AtomicInteger(0); + values.forEach(a -> count.incrementAndGet()); + out.collect(row(window.getStart(), count.get())); + WINDOWS.put(window.getStart(), count.get()); + } + }); + + WINDOWS.clear(); + env.executeAsync("Iceberg Source Min Timestamp Windowing Test"); + + // Append a file with much later timestamps to advance the watermark past [0, 5min) + dataAppender.appendToTable( + dataAppender.writeFile(ImmutableList.of(generateRecord(1500, "last-record")))); + + // The [0, 5min) window should fire with all 3 records written at epoch + Awaitility.await() + .pollInterval(Duration.ofMillis(10)) + .atMost(30, TimeUnit.SECONDS) + 
.until(() -> Integer.valueOf(3).equals(WINDOWS.get(0L))); + } + /** * This is an integration test for watermark handling and throttling. Integration testing the * following: diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java index 5dd7de545e11..09639a8a9568 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java @@ -31,7 +31,7 @@ void testIncrementalFromSnapshotId() { .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) .build(); assertException( - context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + context, "Invalid starting snapshot id for INCREMENTAL_FROM_SNAPSHOT_ID strategy: null"); context = ScanContext.builder() @@ -42,7 +42,7 @@ void testIncrementalFromSnapshotId() { .build(); assertException( context, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + "Invalid starting snapshot timestamp for INCREMENTAL_FROM_SNAPSHOT_ID strategy: not null"); } @Test @@ -54,7 +54,7 @@ void testIncrementalFromSnapshotTimestamp() { .build(); assertException( context, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + "Invalid starting snapshot timestamp for INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP strategy: null"); context = ScanContext.builder() @@ -64,7 +64,8 @@ void testIncrementalFromSnapshotTimestamp() { .startSnapshotTimestamp(1L) .build(); assertException( - context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + context, + "Invalid starting snapshot id for INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP strategy: not null"); } @Test diff --git 
a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java new file mode 100644 index 000000000000..1cb7be03c6a7 --- /dev/null +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.common.eventtime.Watermark; +import org.apache.flink.api.connector.source.SourceOutput; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestWatermarkExtractorRecordEmitter { + @TempDir protected Path temporaryFolder; + + @Test + public void testWatermarkIsDecrementedByOne() throws IOException { + long extractedWatermark = 1000L; + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> extractedWatermark); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + emitter.emitRecord(new RecordAndPosition<>("record", 0, 0L), output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(extractedWatermark - 1); + } + + @Test + public void testWatermarkEmittedOnlyOncePerSplit() throws IOException { + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> 1000L); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split); + emitter.emitRecord(element, output, split); + emitter.emitRecord(element, output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.records).hasSize(3); + } + + 
@Test + public void testWatermarkNotEmittedWhenNewSplitHasLowerValue() throws IOException { + IcebergSourceSplit split1 = createSplit(1L); + IcebergSourceSplit split2 = createSplit(2L); + + Map watermarkMap = Maps.newHashMap(); + watermarkMap.put(split1.splitId(), 2000L); + watermarkMap.put(split2.splitId(), 1000L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> watermarkMap.get(s.splitId())); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split1); + emitter.emitRecord(element, output, split2); + + // Only split1's watermark is emitted; split2 has a lower value so it's skipped + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(1999L); + } + + @Test + public void testWatermarkEmittedForEachHigherSplit() throws IOException { + IcebergSourceSplit split1 = createSplit(1L); + IcebergSourceSplit split2 = createSplit(2L); + + Map watermarkMap = Maps.newHashMap(); + watermarkMap.put(split1.splitId(), 1000L); + watermarkMap.put(split2.splitId(), 2000L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> watermarkMap.get(s.splitId())); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split1); + emitter.emitRecord(element, output, split2); + + assertThat(output.watermarks).hasSize(2); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(999L); + assertThat(output.watermarks.get(1).getTimestamp()).isEqualTo(1999L); + } + + @Test + public void testWatermarkAtLongMinValueDoesNotOverflow() throws IOException { + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> Long.MIN_VALUE); + + 
CapturingSourceOutput output = new CapturingSourceOutput<>(); + emitter.emitRecord(new RecordAndPosition<>("record", 0, 0L), output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(Long.MIN_VALUE); + } + + private IcebergSourceSplit createSplit(long seed) throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(seed, TestFixtures.SCHEMA, 1, 1); + return IcebergSourceSplit.fromCombinedScanTask( + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, FileFormat.PARQUET, TestFixtures.SCHEMA)); + } + + private static class CapturingSourceOutput implements SourceOutput { + final List watermarks = Lists.newArrayList(); + final List records = Lists.newArrayList(); + + @Override + public void collect(T record) { + records.add(record); + } + + @Override + public void collect(T record, long timestamp) { + records.add(record); + } + + @Override + public void emitWatermark(Watermark watermark) { + watermarks.add(watermark); + } + + @Override + public void markIdle() {} + + @Override + public void markActive() {} + } +} diff --git a/format/gcm-stream-spec.md b/format/gcm-stream-spec.md index 4d241ca3ef24..8168780e118b 100644 --- a/format/gcm-stream-spec.md +++ b/format/gcm-stream-spec.md @@ -41,7 +41,7 @@ The output stream, produced by a metadata or data writer, is split into equal-si ## Encryption algorithm -AES GCM Stream uses the standard AEG GCM cipher, and supports all AES key sizes: 128, 192 and 256 bits. +AES GCM Stream uses the standard AES GCM cipher, and supports all AES key sizes: 128, 192 and 256 bits. AES GCM is an authenticated encryption. Besides data confidentiality (encryption), it supports two levels of integrity verification (authentication): of the data (default), and of the data combined with an optional AAD (“additional authenticated data”). An AAD is a free text to be authenticated, together with the data. 
The structure of AES GCM Stream AADs is described below. @@ -80,7 +80,7 @@ AES GCM Stream encrypts all blocks by the GCM cipher, without padding. The AES G ### Additional Authenticated Data -The AES GCM cipher protects against byte replacement inside a ciphertext block - but, without an AAD, it can't prevent replacement of one ciphertext block with another (encrypted with the same key). AES GCM Stream leverages AADs to protect against swapping ciphertext blocks inside a file or between files. AES GCM Stream can also protect against swapping full files - for example, replacement of a metadata file with an old version. AADs are built to reflects the identity of a file and of the blocks inside the file. +The AES GCM cipher protects against byte replacement inside a ciphertext block - but, without an AAD, it can't prevent replacement of one ciphertext block with another (encrypted with the same key). AES GCM Stream leverages AADs to protect against swapping ciphertext blocks inside a file or between files. AES GCM Stream can also protect against swapping full files - for example, replacement of a metadata file with an old version. AADs are built to reflect the identity of a file and of the blocks inside the file. AES GCM Stream constructs a block AAD from two components: an AAD prefix - a string provided by Iceberg for the file (with the file ID), and an AAD suffix - the block sequence number in the file, as an int in a 4-byte little-endian form. The block AAD is a direct concatenation of the prefix and suffix parts. diff --git a/format/puffin-spec.md b/format/puffin-spec.md index 06c7ad565dd7..8617ae6d8e96 100644 --- a/format/puffin-spec.md +++ b/format/puffin-spec.md @@ -188,7 +188,6 @@ codecs listed below. 
For maximal interoperability, other codecs are not supporte |------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | lz4 | Single [LZ4 compression frame](https://github.com/lz4/lz4/blob/77d1b93f72628af7bbde0243b4bba9205c3138d9/doc/lz4_Frame_format.md), with content size present | | zstd | Single [Zstandard compression frame](https://github.com/facebook/zstd/blob/8af64f41161f6c2e0ba842006fe238c664a6a437/doc/zstd_compression_format.md#zstandard-frames), with content size present | -__ ### Common properties diff --git a/format/spec.md b/format/spec.md index 0d3c79762c6c..94651da0fa86 100644 --- a/format/spec.md +++ b/format/spec.md @@ -600,14 +600,15 @@ A manifest stores files for a single partition spec. When a table’s partition A manifest file must store the partition spec and other metadata as properties in the Avro file's key-value metadata: -| v1 | v2 | Key | Value | -|------------|------------|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------| -| _required_ | _required_ | `schema` | JSON representation of the table schema at the time the manifest was written | -| _optional_ | _required_ | `schema-id` | ID of the schema used to write the manifest as a string | -| _required_ | _required_ | `partition-spec` | JSON representation of only the partition fields array of the partition spec used to write the manifest. 
See [Appendix C](#partition-specs) | -| _optional_ | _required_ | `partition-spec-id` | ID of the partition spec used to write the manifest as a string | -| _optional_ | _required_ | `format-version` | Table format version number of the manifest as a string | -| | _required_ | `content` | Type of content files tracked by the manifest: "data" or "deletes" | +=== "v1 - v3" + | v1 | v2 and v3 | Key | Value | + |------------|------------|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------| + | _required_ | _required_ | `schema` | JSON representation of the table schema at the time the manifest was written | + | _optional_ | _required_ | `schema-id` | ID of the schema used to write the manifest as a string | + | _required_ | _required_ | `partition-spec` | JSON representation of only the partition fields array of the partition spec used to write the manifest. See [Appendix C](#partition-specs) | + | _optional_ | _required_ | `partition-spec-id` | ID of the partition spec used to write the manifest as a string | + | _optional_ | _required_ | `format-version` | Table format version number of the manifest as a string | + | | _required_ | `content` | Type of content files tracked by the manifest: "data" or "deletes" | The schema of a manifest file is defined by the `manifest_entry` struct, described in the following section. @@ -615,13 +616,14 @@ The schema of a manifest file is defined by the `manifest_entry` struct, describ The `manifest_entry` struct consists of the following fields: -| v1 | v2 | Field id, name | Type | Description | -| ---------- | ---------- |-------------------------------|-----------------------------------------------------------|-------------| -| _required_ | _required_ | **`0 status`** | `int` with meaning: `0: EXISTING` `1: ADDED` `2: DELETED` | Used to track additions and deletions. Deletes are informational only and not used in scans. 
| -| _required_ | _optional_ | **`1 snapshot_id`** | `long` | Snapshot id where the file was added, or deleted if status is 2. Inherited when null. | -| | _optional_ | **`3 sequence_number`** | `long` | Data sequence number of the file. Inherited when null and status is 1 (added). | -| | _optional_ | **`4 file_sequence_number`** | `long` | File sequence number indicating when the file was added. Inherited when null and status is 1 (added). | -| _required_ | _required_ | **`2 data_file`** | `data_file` `struct` (see below) | File path, partition tuple, metrics, ... | +=== "v1 - v3" + | v1 | v2 and v3 | Field id, name | Type | Description | + | ---------- | ---------- |-------------------------------|-----------------------------------------------------------|-------------| + | _required_ | _required_ | **`0 status`** | `int` with meaning: `0: EXISTING` `1: ADDED` `2: DELETED` | Used to track additions and deletions. Deletes are informational only and not used in scans. | + | _required_ | _optional_ | **`1 snapshot_id`** | `long` | Snapshot id where the file was added, or deleted if status is 2. Inherited when null. | + | | _optional_ | **`3 sequence_number`** | `long` | Data sequence number of the file. Inherited when null and status is 1 (added). | + | | _optional_ | **`4 file_sequence_number`** | `long` | File sequence number indicating when the file was added. Inherited when null and status is 1 (added). | + | _required_ | _required_ | **`2 data_file`** | `data_file` `struct` (see below) | File path, partition tuple, metrics, ... | The manifest entry fields are used to keep track of the snapshot in which files were added or logically deleted. The `data_file` struct, defined below, is nested inside the manifest entry so that it can be easily passed to job planning without the manifest entry fields. 
@@ -643,32 +645,33 @@ Notes: The `data_file` struct consists of the following fields: -| v1 | v2 | v3 | Field id, name | Type | Description | -| ---------- |------------|------------|-----------------------------------|-----------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| | _required_ | _required_ | **`134 content`** | `int` with meaning: `0: DATA`, `1: POSITION DELETES`, `2: EQUALITY DELETES` | Type of content stored by the data file: data, equality deletes, or position deletes (all v1 files are data files) | -| _required_ | _required_ | _required_ | **`100 file_path`** | `string` | Full URI for the file with FS scheme | -| _required_ | _required_ | _required_ | **`101 file_format`** | `string` | String file format name, `avro`, `orc`, `parquet`, or `puffin` | -| _required_ | _required_ | _required_ | **`102 partition`** | `struct<...>` | Partition data tuple, schema based on the partition spec output using partition field ids for the struct field ids | -| _required_ | _required_ | _required_ | **`103 record_count`** | `long` | Number of records in this file, or the cardinality of a deletion vector | -| _required_ | _required_ | _required_ | **`104 file_size_in_bytes`** | `long` | Total file size in bytes | -| _required_ | | | ~~**`105 block_size_in_bytes`**~~ | `long` | **Deprecated. Always write a default in v1. Do not write in v2 or v3.** | -| _optional_ | | | ~~**`106 file_ordinal`**~~ | `int` | **Deprecated. Do not write.** | -| _optional_ | | | ~~**`107 sort_columns`**~~ | `list<112: int>` | **Deprecated. Do not write.** | -| _optional_ | _optional_ | _optional_ | **`108 column_sizes`** | `map<117: int, 118: long>` | Map from column id to the total size on disk of all regions that store the column. 
Does not include bytes necessary to read other columns, like footers. Leave null for row-oriented formats (Avro) | -| _optional_ | _optional_ | _optional_ | **`109 value_counts`** | `map<119: int, 120: long>` | Map from column id to number of values in the column (including null and NaN values) | -| _optional_ | _optional_ | _optional_ | **`110 null_value_counts`** | `map<121: int, 122: long>` | Map from column id to number of null values in the column | -| _optional_ | _optional_ | _optional_ | **`137 nan_value_counts`** | `map<138: int, 139: long>` | Map from column id to number of NaN values in the column | -| _optional_ | _optional_ | | ~~**`111 distinct_counts`**~~ | `map<123: int, 124: long>` | **Deprecated. Do not write.** | -| _optional_ | _optional_ | _optional_ | **`125 lower_bounds`** | `map<126: int, 127: binary>` | Map from column id to lower bound in the column serialized as binary [1]. Each value must be less than or equal to all non-null, non-NaN values in the column for the file [2] | -| _optional_ | _optional_ | _optional_ | **`128 upper_bounds`** | `map<129: int, 130: binary>` | Map from column id to upper bound in the column serialized as binary [1]. Each value must be greater than or equal to all non-null, non-Nan values in the column for the file [2] | -| _optional_ | _optional_ | _optional_ | **`131 key_metadata`** | `binary` | Implementation-specific key metadata for encryption | -| _optional_ | _optional_ | _optional_ | **`132 split_offsets`** | `list<133: long>` | Split offsets for the data file. For example, all row group offsets in a Parquet file. Must be sorted ascending | -| | _optional_ | _optional_ | **`135 equality_ids`** | `list<136: int>` | Field ids used to determine row equality in equality delete files. Required when `content=2` and should be null otherwise. 
Fields with ids listed in this column must be present in the delete file | -| _optional_ | _optional_ | _optional_ | **`140 sort_order_id`** | `int` | ID representing sort order for this file [3]. | -| | | _optional_ | **`142 first_row_id`** | `long` | The `_row_id` for the first row in the data file. See [First Row ID Inheritance](#first-row-id-inheritance) | -| | _optional_ | _optional_ | **`143 referenced_data_file`** | `string` | Fully qualified location (URI with FS scheme) of a data file that all deletes reference [4] | -| | | _optional_ | **`144 content_offset`** | `long` | The offset in the file where the content starts [5] | -| | | _optional_ | **`145 content_size_in_bytes`** | `long` | The length of a referenced content stored in the file; required if `content_offset` is present [5] | +=== "v1 - v3" + | v1 | v2 | v3 | Field id, name | Type | Description | + | ---------- |------------|------------|-----------------------------------|-----------------------------------------------------------------------------|-------------| + | | _required_ | _required_ | **`134 content`** | `int` with meaning: `0: DATA`, `1: POSITION DELETES`, `2: EQUALITY DELETES` | Type of content stored by the data file: data, equality deletes, or position deletes (all v1 files are data files) | + | _required_ | _required_ | _required_ | **`100 file_path`** | `string` | Full URI for the file with FS scheme | + | _required_ | _required_ | _required_ | **`101 file_format`** | `string` | String file format name, `avro`, `orc`, `parquet`, or `puffin` | + | _required_ | _required_ | _required_ | **`102 partition`** | `struct<...>` | Partition data tuple, schema based on the partition spec output using partition field ids for the struct field ids | + | _required_ | _required_ | _required_ | **`103 record_count`** | `long` | Number of records in this file, or the cardinality of a deletion vector | + | _required_ | _required_ | _required_ | **`104 file_size_in_bytes`** | `long` | Total file 
size in bytes | + | _required_ | | | ~~**`105 block_size_in_bytes`**~~ | `long` | **Deprecated. Always write a default in v1. Do not write in v2 or v3.** | + | _optional_ | | | ~~**`106 file_ordinal`**~~ | `int` | **Deprecated. Do not write.** | + | _optional_ | | | ~~**`107 sort_columns`**~~ | `list<112: int>` | **Deprecated. Do not write.** | + | _optional_ | _optional_ | _optional_ | **`108 column_sizes`** | `map<117: int, 118: long>` | Map from column id to the total size on disk of all regions that store the column. **Does not include bytes necessary to read other columns, like footers.** Leave null for row-oriented formats (Avro) | + | _optional_ | _optional_ | _optional_ | **`109 value_counts`** | `map<119: int, 120: long>` | Map from column id to number of values in the column (including null and NaN values) | + | _optional_ | _optional_ | _optional_ | **`110 null_value_counts`** | `map<121: int, 122: long>` | Map from column id to number of null values in the column | + | _optional_ | _optional_ | _optional_ | **`137 nan_value_counts`** | `map<138: int, 139: long>` | Map from column id to number of NaN values in the column | + | _optional_ | _optional_ | | ~~**`111 distinct_counts`**~~ | `map<123: int, 124: long>` | **Deprecated. Do not write.** | + | _optional_ | _optional_ | _optional_ | **`125 lower_bounds`** | `map<126: int, 127: binary>` | Map from column id to lower bound in the column serialized as binary [1]. Each value must be less than or equal to all non-null, non-NaN values in the column for the file [2] | + | _optional_ | _optional_ | _optional_ | **`128 upper_bounds`** | `map<129: int, 130: binary>` | Map from column id to upper bound in the column serialized as binary [1]. 
Each value must be greater than or equal to all non-null, non-NaN values in the column for the file [2] | + | _optional_ | _optional_ | _optional_ | **`131 key_metadata`** | `binary` | Implementation-specific key metadata for encryption | + | _optional_ | _optional_ | _optional_ | **`132 split_offsets`** | `list<133: long>` | Split offsets for the data file. For example, all row group offsets in a Parquet file. Must be sorted ascending | + | | _optional_ | _optional_ | **`135 equality_ids`** | `list<136: int>` | Field ids used to determine row equality in equality delete files. Required when `content=2` and should be null otherwise. Fields with ids listed in this column must be present in the delete file | + | _optional_ | _optional_ | _optional_ | **`140 sort_order_id`** | `int` | ID representing sort order for this file [3]. | + | | | _optional_ | **`142 first_row_id`** | `long` | The `_row_id` for the first row in the data file. See [First Row ID Inheritance](#first-row-id-inheritance) | + | | _optional_ | _optional_ | **`143 referenced_data_file`** | `string` | Fully qualified location (URI with FS scheme) of a data file that all deletes reference [4] | + | | | _optional_ | **`144 content_offset`** | `long` | The offset in the file where the content starts [5] | + | | | _optional_ | **`145 content_size_in_bytes`** | `long` | The length of a referenced content stored in the file; required if `content_offset` is present [5] | The `partition` struct stores the tuple of partition values for each file. Its type is derived from the partition fields of the partition spec used to write the manifest file. In v2, the partition struct's field ids must match the ids from the partition spec.
@@ -733,20 +736,20 @@ Any null (unassigned) `first_row_id` must be assigned via inheritance, even if t ### Snapshots A snapshot consists of the following fields: - -| v1 | v2 | v3 | Field | Description | -| ---------- | ---------- |------------|------------------------------|------------------------------------------------------------------------------------------------------------------------------------| -| _required_ | _required_ | _required_ | **`snapshot-id`** | A unique long ID | -| _optional_ | _optional_ | _optional_ | **`parent-snapshot-id`** | The snapshot ID of the snapshot's parent. Omitted for any snapshot with no parent | -| | _required_ | _required_ | **`sequence-number`** | A monotonically increasing long that tracks the order of changes to a table | -| _required_ | _required_ | _required_ | **`timestamp-ms`** | A timestamp when the snapshot was created, used for garbage collection and table inspection | -| _optional_ | _required_ | _required_ | **`manifest-list`** | The location of a manifest list for this snapshot that tracks manifest files with additional metadata | -| _optional_ | | | **`manifests`** | A list of manifest file locations. 
Must be omitted if `manifest-list` is present | -| _optional_ | _required_ | _required_ | **`summary`** | A string map that summarizes the snapshot changes, including `operation` as a _required_ field (see below) | -| _optional_ | _optional_ | _optional_ | **`schema-id`** | ID of the table's current schema when the snapshot was created | -| | | _required_ | **`first-row-id`** | The first `_row_id` assigned to the first row in the first data file in the first manifest, see [Row Lineage](#row-lineage) | -| | | _required_ | **`added-rows`** | The upper bound of the number of rows with assigned row IDs, see [Row Lineage](#row-lineage) | -| | | _optional_ | **`key-id`** | ID of the encryption key that encrypts the manifest list key metadata | +=== "v1 - v3" + | v1 | v2 | v3 | Field | Description | + | ---------- | ---------- |------------|------------------------------|-------------| + | _required_ | _required_ | _required_ | **`snapshot-id`** | A unique long ID | + | _optional_ | _optional_ | _optional_ | **`parent-snapshot-id`** | The snapshot ID of the snapshot's parent. Omitted for any snapshot with no parent | + | | _required_ | _required_ | **`sequence-number`** | A monotonically increasing long that tracks the order of changes to a table | + | _required_ | _required_ | _required_ | **`timestamp-ms`** | A timestamp when the snapshot was created, used for garbage collection and table inspection | + | _optional_ | _required_ | _required_ | **`manifest-list`** | The location of a manifest list for this snapshot that tracks manifest files with additional metadata | + | _optional_ | | | **`manifests`** | A list of manifest file locations. 
Must be omitted if `manifest-list` is present | + | _optional_ | _required_ | _required_ | **`summary`** | A string map that summarizes the snapshot changes, including `operation` as a _required_ field (see below) | + | _optional_ | _optional_ | _optional_ | **`schema-id`** | ID of the table's current schema when the snapshot was created | + | | | _required_ | **`first-row-id`** | The first `_row_id` assigned to the first row in the first data file in the first manifest, see [Row Lineage](#row-lineage) | + | | | _required_ | **`added-rows`** | The upper bound of the number of rows with assigned row IDs, see [Row Lineage](#row-lineage) | + | | | _optional_ | **`key-id`** | ID of the encryption key that encrypts the manifest list key metadata | The snapshot summary's `operation` field is used by some operations, like snapshot expiration, to skip processing certain snapshots. Possible `operation` values are: @@ -790,33 +793,34 @@ A manifest list is a valid Iceberg data file: files must use valid Iceberg forma Manifest list files store `manifest_file`, a struct with the following fields: -| v1 | v2 | v3 | Field id, name | Type | Description | -| ---------- | ---------- |------------|----------------------------------|---------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------| -| _required_ | _required_ | _required_ | **`500 manifest_path`** | `string` | Location of the manifest file | -| _required_ | _required_ | _required_ | **`501 manifest_length`** | `long` | Length of the manifest file in bytes | -| _required_ | _required_ | _required_ | **`502 partition_spec_id`** | `int` | ID of a partition spec used to write the manifest; must be listed in table metadata `partition-specs` | -| | _required_ | _required_ | **`517 content`** | `int` with meaning: `0: data`, `1: deletes` | The type of files tracked by the manifest, either data or 
delete files; 0 for all v1 manifests | -| | _required_ | _required_ | **`515 sequence_number`** | `long` | The sequence number when the manifest was added to the table; use 0 when reading v1 manifest lists | -| | _required_ | _required_ | **`516 min_sequence_number`** | `long` | The minimum data sequence number of all live data or delete files in the manifest; use 0 when reading v1 manifest lists | -| _required_ | _required_ | _required_ | **`503 added_snapshot_id`** | `long` | ID of the snapshot where the manifest file was added | -| _optional_ | _required_ | _required_ | **`504 added_files_count`** | `int` | Number of entries in the manifest that have status `ADDED` (1), when `null` this is assumed to be non-zero | -| _optional_ | _required_ | _required_ | **`505 existing_files_count`** | `int` | Number of entries in the manifest that have status `EXISTING` (0), when `null` this is assumed to be non-zero | -| _optional_ | _required_ | _required_ | **`506 deleted_files_count`** | `int` | Number of entries in the manifest that have status `DELETED` (2), when `null` this is assumed to be non-zero | -| _optional_ | _required_ | _required_ | **`512 added_rows_count`** | `long` | Number of rows in all of files in the manifest that have status `ADDED`, when `null` this is assumed to be non-zero | -| _optional_ | _required_ | _required_ | **`513 existing_rows_count`** | `long` | Number of rows in all of files in the manifest that have status `EXISTING`, when `null` this is assumed to be non-zero | -| _optional_ | _required_ | _required_ | **`514 deleted_rows_count`** | `long` | Number of rows in all of files in the manifest that have status `DELETED`, when `null` this is assumed to be non-zero | -| _optional_ | _optional_ | _optional_ | **`507 partitions`** | `list<508: field_summary>` (see below) | A list of field summaries for each partition field in the spec. Each field in the list corresponds to a field in the manifest file’s partition spec. 
| -| _optional_ | _optional_ | _optional_ | **`519 key_metadata`** | `binary` | Implementation-specific key metadata for encryption | -| | | _optional_ | **`520 first_row_id`** | `long` | The starting `_row_id` to assign to rows added by `ADDED` data files [First Row ID Assignment](#first-row-id-assignment) | +=== "v1 - v3" + | v1 | v2 | v3 | Field id, name | Type | Description | + | ---------- | ---------- |------------|-------------------------------------|---------------------------------------------|-------------| + | _required_ | _required_ | _required_ | **`500 manifest_path`** | `string` | Location of the manifest file | + | _required_ | _required_ | _required_ | **`501 manifest_length`** | `long` | Length of the manifest file in bytes | + | _required_ | _required_ | _required_ | **`502 partition_spec_id`** | `int` | ID of a partition spec used to write the manifest; must be listed in table metadata `partition-specs` | + | | _required_ | _required_ | **`517 content`** | `int` with meaning: `0: data`, `1: deletes` | The type of files tracked by the manifest, either data or delete files; 0 for all v1 manifests | + | | _required_ | _required_ | **`515 sequence_number`** | `long` | The sequence number when the manifest was added to the table; use 0 when reading v1 manifest lists | + | | _required_ | _required_ | **`516 min_sequence_number`** | `long` | The minimum data sequence number of all live data or delete files in the manifest; use 0 when reading v1 manifest lists | + | _required_ | _required_ | _required_ | **`503 added_snapshot_id`** | `long` | ID of the snapshot where the manifest file was added | + | _optional_ | _required_ | _required_ | **`504 added_files_count`** | `int` | Number of entries in the manifest that have status `ADDED` (1), when `null` this is assumed to be non-zero | + | _optional_ | _required_ | _required_ | **`505 existing_files_count`** | `int` | Number of entries in the manifest that have status `EXISTING` (0), when `null` this is 
assumed to be non-zero | + | _optional_ | _required_ | _required_ | **`506 deleted_files_count`** | `int` | Number of entries in the manifest that have status `DELETED` (2), when `null` this is assumed to be non-zero | + | _optional_ | _required_ | _required_ | **`512 added_rows_count`** | `long` | Number of rows in all files in the manifest that have status `ADDED`, when `null` this is assumed to be non-zero | + | _optional_ | _required_ | _required_ | **`513 existing_rows_count`** | `long` | Number of rows in all files in the manifest that have status `EXISTING`, when `null` this is assumed to be non-zero | + | _optional_ | _required_ | _required_ | **`514 deleted_rows_count`** | `long` | Number of rows in all files in the manifest that have status `DELETED`, when `null` this is assumed to be non-zero | + | _optional_ | _optional_ | _optional_ | **`507 partitions`** | `list<508: field_summary>` **(see below)** | A list of field summaries for each partition field in the spec. Each field in the list corresponds to a field in the manifest file’s partition spec.
| + | _optional_ | _optional_ | _optional_ | **`519 key_metadata`** | `binary` | Implementation-specific key metadata for encryption | + | | | _optional_ | **`520 first_row_id`** | `long` | The starting `_row_id` to assign to rows added by `ADDED` data files; see [First Row ID Assignment](#first-row-id-assignment) | `field_summary` is a struct with the following fields: - -| v1 | v2 | Field id, name | Type | Description | -| ---------- | ---------- |-------------------------|---------------|-------------| -| _required_ | _required_ | **`509 contains_null`** | `boolean` | Whether the manifest contains at least one partition with a null value for the field | -| _optional_ | _optional_ | **`518 contains_nan`** | `boolean` | Whether the manifest contains at least one partition with a NaN value for the field | -| _optional_ | _optional_ | **`510 lower_bound`** | `bytes` [1] | Lower bound for the non-null, non-NaN values in the partition field, or null if all values are null or NaN [2] | -| _optional_ | _optional_ | **`511 upper_bound`** | `bytes` [1] | Upper bound for the non-null, non-NaN values in the partition field, or null if all values are null or NaN [2] | +=== "v1 - v3" + | v1 | v2 and v3 | Field id, name | Type | Description | + | ---------- | ---------- |-------------------------|---------------|-------------| + | _required_ | _required_ | **`509 contains_null`** | `boolean` | Whether the manifest contains at least one partition with a null value for the field | + | _optional_ | _optional_ | **`518 contains_nan`** | `boolean` | Whether the manifest contains at least one partition with a NaN value for the field | + | _optional_ | _optional_ | **`510 lower_bound`** | `bytes` [1] | Lower bound for the non-null, non-NaN values in the partition field, or null if all values are null or NaN [2] | + | _optional_ | _optional_ | **`511 upper_bound`** | `bytes` [1] | Upper bound for the non-null, non-NaN values in the partition field, or null if all values are null or NaN [2]
| Notes: @@ -885,13 +889,14 @@ Tags are labels for individual snapshots. Branches are mutable named references The snapshot reference object records all the information of a reference including snapshot ID, reference type and [Snapshot Retention Policy](#snapshot-retention-policy). -| v1 | v2 | Field name | Type | Description | -| ---------- | ---------- | ---------------------------- | --------- | ----------- | -| _required_ | _required_ | **`snapshot-id`** | `long` | A reference's snapshot ID. The tagged snapshot or latest snapshot of a branch. | -| _required_ | _required_ | **`type`** | `string` | Type of the reference, `tag` or `branch` | -| _optional_ | _optional_ | **`min-snapshots-to-keep`** | `int` | For `branch` type only, a positive number for the minimum number of snapshots to keep in a branch while expiring snapshots. Defaults to table property `history.expire.min-snapshots-to-keep`. | -| _optional_ | _optional_ | **`max-snapshot-age-ms`** | `long` | For `branch` type only, a positive number for the max age of snapshots to keep when expiring, including the latest snapshot. Defaults to table property `history.expire.max-snapshot-age-ms`. | -| _optional_ | _optional_ | **`max-ref-age-ms`** | `long` | For snapshot references except the `main` branch, a positive number for the max age of the snapshot reference to keep while expiring snapshots. Defaults to table property `history.expire.max-ref-age-ms`. The `main` branch never expires. | +=== "v1 - v3" + | v1 | v2 and v3 | Field name | Type | Description | + | ---------- | ---------- | ---------------------------- | --------- | ----------- | + | _required_ | _required_ | **`snapshot-id`** | `long` | A reference's snapshot ID. The tagged snapshot or latest snapshot of a branch. 
| + | _required_ | _required_ | **`type`** | `string` | Type of the reference, `tag` or `branch` | + | _optional_ | _optional_ | **`min-snapshots-to-keep`** | `int` | For `branch` type only, a positive number for the minimum number of snapshots to keep in a branch while expiring snapshots. Defaults to table property `history.expire.min-snapshots-to-keep`. | + | _optional_ | _optional_ | **`max-snapshot-age-ms`** | `long` | For `branch` type only, a positive number for the max age of snapshots to keep when expiring, including the latest snapshot. Defaults to table property `history.expire.max-snapshot-age-ms`. | + | _optional_ | _optional_ | **`max-ref-age-ms`** | `long` | For snapshot references except the `main` branch, a positive number for the max age of the snapshot reference to keep while expiring snapshots. Defaults to table property `history.expire.max-ref-age-ms`. The `main` branch never expires. | Valid snapshot references are stored as the values of the `refs` map in table metadata. For serialization, see Appendix C. @@ -921,33 +926,34 @@ The atomic operation used to commit metadata depends on how tables are tracked a Table metadata consists of the following fields: -| v1 | v2 | v3 | Field | Description | -| ---------- | ---------- |------------|-----------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| _required_ | _required_ | _required_ | **`format-version`** | An integer version number for the format. Implementations must throw an exception if a table's version is higher than the supported version. 
| -| _optional_ | _required_ | _required_ | **`table-uuid`** | A UUID that identifies the table, generated when the table is created. Implementations must throw an exception if a table's UUID does not match the expected UUID after refreshing metadata. | -| _required_ | _required_ | _required_ | **`location`** | The table's base location. This is used by writers to determine where to store data files, manifest files, and table metadata files. | -| | _required_ | _required_ | **`last-sequence-number`** | The table's highest assigned sequence number, a monotonically increasing long that tracks the order of snapshots in a table. | -| _required_ | _required_ | _required_ | **`last-updated-ms`** | Timestamp in milliseconds from the unix epoch when the table was last updated. Each table metadata file should update this field just before writing. | -| _required_ | _required_ | _required_ | **`last-column-id`** | An integer; the highest assigned column ID for the table. This is used to ensure columns are always assigned an unused ID when evolving schemas. | -| _required_ | | | **`schema`** | The table’s current schema. (**Deprecated**: use `schemas` and `current-schema-id` instead) | -| _optional_ | _required_ | _required_ | **`schemas`** | A list of schemas, stored as objects with `schema-id`. | -| _optional_ | _required_ | _required_ | **`current-schema-id`** | ID of the table's current schema. | -| _required_ | | | **`partition-spec`** | The table’s current partition spec, stored as only fields. Note that this is used by writers to partition data, but is not used when reading because reads use the specs stored in manifest files. (**Deprecated**: use `partition-specs` and `default-spec-id` instead) | -| _optional_ | _required_ | _required_ | **`partition-specs`** | A list of partition specs, stored as full partition spec objects. | -| _optional_ | _required_ | _required_ | **`default-spec-id`** | ID of the "current" spec that writers should use by default. 
| -| _optional_ | _required_ | _required_ | **`last-partition-id`** | An integer; the highest assigned partition field ID across all partition specs for the table. This is used to ensure partition fields are always assigned an unused ID when evolving specs. | -| _optional_ | _optional_ | _optional_ | **`properties`** | A string to string map of table properties. This is used to control settings that affect reading and writing and is not intended to be used for arbitrary metadata. For example, `commit.retry.num-retries` is used to control the number of commit retries. | -| _optional_ | _optional_ | _optional_ | **`current-snapshot-id`** | `long` ID of the current table snapshot; must be the same as the current ID of the `main` branch in `refs`. | -| _optional_ | _optional_ | _optional_ | **`snapshots`** | A list of valid snapshots. Valid snapshots are snapshots for which all data files exist in the file system. A data file must not be deleted from the file system until the last snapshot in which it was listed is garbage collected. | -| _optional_ | _optional_ | _optional_ | **`snapshot-log`** | A list (optional) of timestamp and snapshot ID pairs that encodes changes to the current snapshot for the table. Each time the current-snapshot-id is changed, a new entry should be added with the last-updated-ms and the new current-snapshot-id. When snapshots are expired from the list of valid snapshots, all entries before a snapshot that has expired should be removed. | -| _optional_ | _optional_ | _optional_ | **`metadata-log`** | A list (optional) of timestamp and metadata file location pairs that encodes changes to the previous metadata files for the table. Each time a new metadata file is created, a new entry of the previous metadata file location should be added to the list. Tables can be configured to remove oldest metadata log entries and keep a fixed-size log of the most recent entries after a commit. 
| -| _optional_ | _required_ | _required_ | **`sort-orders`** | A list of sort orders, stored as full sort order objects. | -| _optional_ | _required_ | _required_ | **`default-sort-order-id`** | Default sort order id of the table. Note that this could be used by writers, but is not used when reading because reads use the specs stored in manifest files. | -| | _optional_ | _optional_ | **`refs`** | A map of snapshot references. The map keys are the unique snapshot reference names in the table, and the map values are snapshot reference objects. There is always a `main` branch reference pointing to the `current-snapshot-id` even if the `refs` map is null. | -| _optional_ | _optional_ | _optional_ | **`statistics`** | A list (optional) of [table statistics](#table-statistics). | -| _optional_ | _optional_ | _optional_ | **`partition-statistics`** | A list (optional) of [partition statistics](#partition-statistics). | -| | | _required_ | **`next-row-id`** | A `long` higher than all assigned row IDs; the next snapshot's `first-row-id`. See [Row Lineage](#row-lineage). | -| | | _optional_ | **`encryption-keys`** | A list (optional) of [encryption keys](#encryption-keys) used for table encryption. | +=== "v1 - v3" + | v1 | v2 | v3 | Field | Description | + | ---------- | ---------- |------------|-----------------------------| ------------| + | _required_ | _required_ | _required_ | **`format-version`** | An integer version number for the format. Implementations must throw an exception if a table’s version is higher than the supported version. | + | _optional_ | _required_ | _required_ | **`table-uuid`** | A UUID that identifies the table, generated when the table is created. Implementations must throw an exception if a table’s UUID does not match the expected UUID after refreshing metadata. | + | _required_ | _required_ | _required_ | **`location`** | The table’s base location. 
This is used by writers to determine where to store data files, manifest files, and table metadata files. | + | | _required_ | _required_ | **`last-sequence-number`** | The table’s highest assigned sequence number, a monotonically increasing long that tracks the order of snapshots in a table. | + | _required_ | _required_ | _required_ | **`last-updated-ms`** | Timestamp in milliseconds from the unix epoch when the table was last updated. Each table metadata file should update this field just before writing. | + | _required_ | _required_ | _required_ | **`last-column-id`** | An integer; the highest assigned column ID for the table. This is used to ensure columns are always assigned an unused ID when evolving schemas. | + | _required_ | | | **`schema`** | The table’s current schema. (**Deprecated**: use `schemas` and `current-schema-id` instead) | + | _optional_ | _required_ | _required_ | **`schemas`** | A list of schemas, stored as objects with `schema-id`. | + | _optional_ | _required_ | _required_ | **`current-schema-id`** | ID of the table’s current schema. | + | _required_ | | | **`partition-spec`** | The table’s current partition spec, stored as only fields. (**Deprecated**: use `partition-specs` and `default-spec-id` instead) Note that this is used by writers to partition data, but is not used when reading because reads use the specs stored in manifest files. | + | _optional_ | _required_ | _required_ | **`partition-specs`** | A list of partition specs, stored as full partition spec objects. | + | _optional_ | _required_ | _required_ | **`default-spec-id`** | ID of the "current" spec that writers should use by default. | + | _optional_ | _required_ | _required_ | **`last-partition-id`** | An integer; the highest assigned partition field ID across all partition specs for the table. This is used to ensure partition fields are always assigned an unused ID when evolving specs. 
| + | _optional_ | _optional_ | _optional_ | **`properties`** | A string to string map of table properties. This is used to control settings that affect reading and writing and is not intended to be used for arbitrary metadata. For example, `commit.retry.num-retries` is used to control the number of commit retries. | + | _optional_ | _optional_ | _optional_ | **`current-snapshot-id`** | `long` ID of the current table snapshot; must be the same as the current ID of the `main` branch in `refs`. | + | _optional_ | _optional_ | _optional_ | **`snapshots`** | A list of valid snapshots. Valid snapshots are snapshots for which all data files exist in the file system. A data file must not be deleted from the file system until the last snapshot in which it was listed is garbage collected. | + | _optional_ | _optional_ | _optional_ | **`snapshot-log`** | A list (optional) of timestamp and snapshot ID pairs that encodes changes to the current snapshot for the table. Each time the current-snapshot-id is changed, a new entry should be added with the last-updated-ms and the new current-snapshot-id. When snapshots are expired from the list of valid snapshots, all entries before a snapshot that has expired should be removed. | + | _optional_ | _optional_ | _optional_ | **`metadata-log`** | A list (optional) of timestamp and metadata file location pairs that encodes changes to the previous metadata files for the table. Each time a new metadata file is created, a new entry of the previous metadata file location should be added to the list. Tables can be configured to remove oldest metadata log entries and keep a fixed-size log of the most recent entries after a commit. | + | _optional_ | _required_ | _required_ | **`sort-orders`** | A list of sort orders, stored as full sort order objects. | + | _optional_ | _required_ | _required_ | **`default-sort-order-id`** | Default sort order id of the table. 
Note that this could be used by writers, but is not used when reading because reads use the specs stored in manifest files. | + | | _optional_ | _optional_ | **`refs`** | A map of snapshot references. The map keys are the unique snapshot reference names in the table, and the map values are snapshot reference objects. There is always a `main` branch reference pointing to the `current-snapshot-id` even if the `refs` map is null. | + | _optional_ | _optional_ | _optional_ | **`statistics`** | A list (optional) of [table statistics](#table-statistics). | + | _optional_ | _optional_ | _optional_ | **`partition-statistics`** | A list (optional) of [partition statistics](#partition-statistics). | + | | | _required_ | **`next-row-id`** | A `long` higher than all assigned row IDs; the next snapshot’s `first-row-id`. See [Row Lineage](#row-lineage). | + | | | _optional_ | **`encryption-keys`** | A list (optional) of [encryption keys](#encryption-keys) used for table encryption. | For serialization details, see Appendix C. @@ -963,24 +969,26 @@ many statistics files associated with different table snapshots. Statistics files metadata within `statistics` table metadata field is a struct with the following fields: -| v1 | v2 | Field name | Type | Description | -|----|----|------------|------|-------------| -| _required_ | _required_ | **`snapshot-id`** | `long` | ID of the Iceberg table's snapshot the statistics file is associated with. | -| _required_ | _required_ | **`statistics-path`** | `string` | Path of the statistics file. See [Puffin file format](puffin-spec.md). | -| _required_ | _required_ | **`file-size-in-bytes`** | `long` | Size of the statistics file. | -| _required_ | _required_ | **`file-footer-size-in-bytes`** | `long` | Total size of the statistics file's footer (not the footer payload size). See [Puffin file format](puffin-spec.md) for footer definition. 
| -| _optional_ | _optional_ | **`key-metadata`** | Base64-encoded implementation-specific key metadata for encryption. | -| _required_ | _required_ | **`blob-metadata`** | `list` (see below) | A list of the blob metadata for statistics contained in the file with structure described below. | +=== "v1 - v3" + | v1 | v2 and v3 | Field name | Type | Description | + | ---------- | ---------- |---------------------------------|-----------------------|-------------| + | _required_ | _required_ | **`snapshot-id`** | `long` | ID of the Iceberg table's snapshot the statistics file is associated with. | + | _required_ | _required_ | **`statistics-path`** | `string` | Path of the statistics file. See [Puffin file format](puffin-spec.md). | + | _required_ | _required_ | **`file-size-in-bytes`** | `long` | Size of the statistics file. | + | _required_ | _required_ | **`file-footer-size-in-bytes`** | `long` | Total size of the statistics file's footer (not the footer payload size). See [Puffin file format](puffin-spec.md) for footer definition. | + | _optional_ | _optional_ | **`key-metadata`** | | Base64-encoded implementation-specific key metadata for encryption. | + | _required_ | _required_ | **`blob-metadata`** | `list` (see below) | A list of the blob metadata for statistics contained in the file with structure described below. | Blob metadata is a struct with the following fields: -| v1 | v2 | Field name | Type | Description | -|----|----|------------|------|-------------| -| _required_ | _required_ | **`type`** | `string` | Type of the blob. Matches Blob type in the Puffin file. | -| _required_ | _required_ | **`snapshot-id`** | `long` | ID of the Iceberg table's snapshot the blob was computed from. | -| _required_ | _required_ | **`sequence-number`** | `long` | Sequence number of the Iceberg table's snapshot the blob was computed from. | -| _required_ | _required_ | **`fields`** | `list` | Ordered list of fields, given by field ID, on which the statistic was calculated. 
| -| _optional_ | _optional_ | **`properties`** | `map` | Additional properties associated with the statistic. Subset of Blob properties in the Puffin file. | +=== "v1 - v3" + | v1 | v2 and v3 | Field name | Type | Description | + | ---------- | ---------- |-----------------------|-----------------------|-------------| + | _required_ | _required_ | **`type`** | `string` | Type of the blob. Matches Blob type in the Puffin file. | + | _required_ | _required_ | **`snapshot-id`** | `long` | ID of the Iceberg table's snapshot the blob was computed from. | + | _required_ | _required_ | **`sequence-number`** | `long` | Sequence number of the Iceberg table's snapshot the blob was computed from. | + | _required_ | _required_ | **`fields`** | `list` | Ordered list of fields, given by field ID, on which the statistic was calculated. | + | _optional_ | _optional_ | **`properties`** | `map` | Additional properties associated with the statistic. Subset of Blob properties in the Puffin file. | #### Partition Statistics @@ -992,11 +1000,12 @@ Partition statistics file must be registered in the table metadata file to be co `partition-statistics` field of table metadata is an optional list of structs with the following fields: -| v1 | v2 | v3 | Field name | Type | Description | -|----|----|----|------------|------|-------------| -| _required_ | _required_ | _required_ | **`snapshot-id`** | `long` | ID of the Iceberg table's snapshot the partition statistics file is associated with. | -| _required_ | _required_ | _required_ | **`statistics-path`** | `string` | Path of the partition statistics file. See [Partition statistics file](#partition-statistics-file). | -| _required_ | _required_ | _required_ | **`file-size-in-bytes`** | `long` | Size of the partition statistics file. 
| +=== "v1 - v3" + | v1 | v2 | v3 | Field name | Type | Description | + | ---------- | ---------- |------------|--------------------------|----------|-------------| + | _required_ | _required_ | _required_ | **`snapshot-id`** | `long` | ID of the Iceberg table's snapshot the partition statistics file is associated with. | + | _required_ | _required_ | _required_ | **`statistics-path`** | `string` | Path of the partition statistics file. See [Partition statistics file](#partition-statistics-file). | + | _required_ | _required_ | _required_ | **`file-size-in-bytes`** | `long` | Size of the partition statistics file. | ##### Partition Statistics File @@ -1005,21 +1014,22 @@ These rows must be sorted (in ascending manner with NULL FIRST) by `partition` f The schema of the partition statistics file is as follows: -| v1 | v2 | v3 | Field id, name | Type | Description | -|----|----|----|----------------|------|-------------| -| _required_ | _required_ | _required_ | **`1 partition`** | `struct<..>` | Partition data tuple, schema based on the unified partition type considering all specs in a table | -| _required_ | _required_ | _required_ | **`2 spec_id`** | `int` | Partition spec id | -| _required_ | _required_ | _required_ | **`3 data_record_count`** | `long` | Count of records in data files | -| _required_ | _required_ | _required_ | **`4 data_file_count`** | `int` | Count of data files | -| _required_ | _required_ | _required_ | **`5 total_data_file_size_in_bytes`** | `long` | Total size of data files in bytes | -| _optional_ | _optional_ | _required_ | **`6 position_delete_record_count`** | `long` | Count of position deletes across position delete files and deletion vectors | -| _optional_ | _optional_ | _required_ | **`7 position_delete_file_count`** | `int` | Count of position delete files ignoring deletion vectors | -| | | _required_ | **`13 dv_count`** | `int` | Count of deletion vectors | -| _optional_ | _optional_ | _required_ | **`8 
equality_delete_record_count`** | `long` | Count of records in equality delete files | -| _optional_ | _optional_ | _required_ | **`9 equality_delete_file_count`** | `int` | Count of equality delete files | -| _optional_ | _optional_ | _optional_ | **`10 total_record_count`** | `long` | Accurate count of records in a partition after applying deletes if any | -| _optional_ | _optional_ | _optional_ | **`11 last_updated_at`** | `long` | Timestamp in milliseconds from the unix epoch when the partition was last updated | -| _optional_ | _optional_ | _optional_ | **`12 last_updated_snapshot_id`** | `long` | ID of snapshot that last updated this partition | +=== "v1 - v3" + | v1 | v2 | v3 | Field id, name | Type | Description | + | ---------- | ---------- |------------|------------------------------------------|--------------|-------------| + | _required_ | _required_ | _required_ | **`1 partition`** | `struct<..>` | Partition data tuple, schema based on the unified partition type considering all specs in a table | + | _required_ | _required_ | _required_ | **`2 spec_id`** | `int` | Partition spec id | + | _required_ | _required_ | _required_ | **`3 data_record_count`** | `long` | Count of records in data files | + | _required_ | _required_ | _required_ | **`4 data_file_count`** | `int` | Count of data files | + | _required_ | _required_ | _required_ | **`5 total_data_file_size_in_bytes`** | `long` | Total size of data files in bytes | + | _optional_ | _optional_ | _required_ | **`6 position_delete_record_count`** | `long` | Count of position deletes across position delete files and deletion vectors | + | _optional_ | _optional_ | _required_ | **`7 position_delete_file_count`** | `int` | Count of position delete files ignoring deletion vectors | + | | | _required_ | **`13 dv_count`** | `int` | Count of deletion vectors | + | _optional_ | _optional_ | _required_ | **`8 equality_delete_record_count`** | `long` | Count of records in equality delete files | + | _optional_ | 
_optional_ | _required_ | **`9 equality_delete_file_count`** | `int` | Count of equality delete files | + | _optional_ | _optional_ | _optional_ | **`10 total_record_count`** | `long` | Accurate count of records in a partition after applying deletes if any | + | _optional_ | _optional_ | _optional_ | **`11 last_updated_at`** | `long` | Timestamp in milliseconds from the unix epoch when the partition was last updated | + | _optional_ | _optional_ | _optional_ | **`12 last_updated_snapshot_id`** | `long` | ID of snapshot that last updated this partition | Note that partition data tuple's schema is based on the partition spec output using partition field ids for the struct field ids. The unified partition type is a struct containing all fields that have ever been a part of any spec in the table @@ -1044,13 +1054,13 @@ If a table has no deletes or only deletion vectors, implementations are encourag #### Encryption Keys Keys used for table encryption can be tracked in table metadata as a list named `encryption-keys`. The schema of each key is a struct with the following fields: - -| v1 | v2 | v3 | Field name | Type. 
| Description | -|----|----|------------|------------------------------|-----------------------|-------------| -| | | _required_ | **`key-id`** | `string` | ID of the encryption key | -| | | _required_ | **`encrypted-key-metadata`** | `string` | Encrypted key and metadata, base64 encoded [1] | -| | | _optional_ | **`encrypted-by-id`** | `string` | Optional ID of the key used to encrypt or wrap `key-metadata` | -| | | _optional_ | **`properties`** | `map` | A string to string map of additional metadata used by the table's encryption scheme | +=== "v1 - v3" + | v1 | v2 | v3 | Field name | Type | Description | + |----|----|------------|-------------------------------|-----------------------|-------------| + | | | _required_ | **`key-id`** | `string` | ID of the encryption key | + | | | _required_ | **`encrypted-key-metadata`** | `string` | Encrypted key and metadata, base64 encoded [1] | + | | | _optional_ | **`encrypted-by-id`** | `string` | Optional ID of the key used to encrypt or wrap `key-metadata` | + | | | _optional_ | **`properties`** | `map` | A string to string map of additional metadata used by the table's encryption scheme | Notes: diff --git a/gcp-bundle/LICENSE b/gcp-bundle/LICENSE index d6d61ae4c578..54822a830a72 100644 --- a/gcp-bundle/LICENSE +++ b/gcp-bundle/LICENSE @@ -210,6 +210,94 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). 
+ +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). 
+ +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. 
Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles Android Annotations. Project URL: http://source.android.com/ @@ -219,12 +307,14 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles Google API Common. +Project URL: https://github.com/googleapis/api-common-java License: BSD 3-Clause + | Copyright 2016, Google Inc. | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are | met: -| +| | * Redistributions of source code must retain the above copyright | notice, this list of conditions and the following disclaimer. | * Redistributions in binary form must reproduce the above @@ -234,7 +324,7 @@ License: BSD 3-Clause | * Neither the name of Google Inc. nor the names of its | contributors may be used to endorse or promote products derived from | this software without specific prior written permission. 
-| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -253,12 +343,13 @@ This product bundles Google GAX. Project URL: https://github.com/googleapis/gax-java License: BSD 3-Clause + | Copyright 2016, Google Inc. All rights reserved. -| +| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are | met: -| +| | * Redistributions of source code must retain the above copyright | notice, this list of conditions and the following disclaimer. | * Redistributions in binary form must reproduce the above @@ -268,7 +359,7 @@ License: BSD 3-Clause | * Neither the name of Google Inc. nor the names of its | contributors may be used to endorse or promote products derived from | this software without specific prior written permission. -| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -313,24 +404,26 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles Google Auth Library. +Project URL: https://github.com/googleapis/google-auth-library-java License: BSD 3-Clause + | Copyright 2014, Google Inc. All rights reserved. -| +| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are | met: -| +| | * Redistributions of source code must retain the above copyright | notice, this list of conditions and the following disclaimer. | * Redistributions in binary form must reproduce the above | copyright notice, this list of conditions and the following disclaimer | in the documentation and/or other materials provided with the | distribution. 
-| +| | * Neither the name of Google Inc. nor the names of its | contributors may be used to endorse or promote products derived from | this software without specific prior written permission. -| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -366,7 +459,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google Cloud Open-Telemetry Operations Exporters for Java +This product bundles Google Cloud Open-Telemetry Operations Exporters for Java. Project URL: https://github.com/GoogleCloudPlatform/opentelemetry-operations-java License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 @@ -380,13 +473,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Findbugs jsr305. - -Project URL: http://findbugs.sourceforge.net/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles Google Error Prone Annotations. Project URL: https://github.com/google/error-prone @@ -408,6 +494,387 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles the Mozilla Public Suffix List (via Google Guava). + +Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ + +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. 
"Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. "Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. "Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. "Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. "Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. 
"Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. "Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. 
Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. 
+| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. 
Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. 
The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. 
Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. 
Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. 
Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. + +-------------------------------------------------------------------------------- + This product bundles Google Http Client. Project URL: https://www.google.com/ @@ -433,12 +900,13 @@ This product bundles Google protobuf. Project URL: https://developers.google.com/protocol-buffers/ License: BSD 3-Clause + | Copyright 2008 Google Inc. All rights reserved. 
-| +| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are | met: -| +| | * Redistributions of source code must retain the above copyright | notice, this list of conditions and the following disclaimer. | * Redistributions in binary form must reproduce the above @@ -448,7 +916,7 @@ License: BSD 3-Clause | * Neither the name of Google Inc. nor the names of its | contributors may be used to endorse or promote products derived from | this software without specific prior written permission. -| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -460,7 +928,7 @@ License: BSD 3-Clause | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| +| | Code generated by the Protocol Buffer compiler is owned by the owner | of the input file used when generating it. This code is not | standalone and requires a support library to be linked with it. This @@ -472,27 +940,28 @@ This product bundles Google re2j. Project URL: http://github.com/google/re2j License: Go License + | This is a work derived from Russ Cox's RE2 in Go, whose license | http://golang.org/LICENSE is as follows: -| +| | Copyright (c) 2009 The Go Authors. All rights reserved. -| +| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are | met: -| +| | * Redistributions of source code must retain the above copyright | notice, this list of conditions and the following disclaimer. 
-| +| | * Redistributions in binary form must reproduce the above copyright | notice, this list of conditions and the following disclaimer in | the documentation and/or other materials provided with the | distribution. -| +| | * Neither the name of Google Inc. nor the names of its contributors | may be used to endorse or promote products derived from this | software without specific prior written permission. -| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -535,6 +1004,56 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles Apache Tomcat Native (statically linked into netty-tcnative, bundled by gRPC-netty-shaded). + +Project URL: https://tomcat.apache.org/native-doc/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles BoringSSL (statically linked into netty-tcnative-boringssl-static, bundled by gRPC-netty-shaded). + +Project URL: https://boringssl.googlesource.com/boringssl/ +License: Apache License, Version 2.0 (with portions under the Go License - BSD 3-Clause) - https://boringssl.googlesource.com/boringssl/+/HEAD/LICENSE + +| +| Licenses for support code +| ------------------------- +| +| Parts of the TLS test suite are under the Go license. This code is not included +| in BoringSSL (i.e. libcrypto and libssl) when compiled, however, so +| distributing code linked against BoringSSL does not trigger this license: +| +| Copyright (c) 2009 The Go Authors. All rights reserved. 
+| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are +| met: +| +| * Redistributions of source code must retain the above copyright +| notice, this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above +| copyright notice, this list of conditions and the following disclaimer +| in the documentation and/or other materials provided with the +| distribution. +| * Neither the name of Google Inc. nor the names of its +| contributors may be used to endorse or promote products derived from +| this software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles OpenCensus. Project URL: https://github.com/census-instrumentation/opencensus-java @@ -553,14 +1072,366 @@ This product bundles javax.annotation-api. 
Project URL: https://javaee.github.io/glassfish Project URL: http://jcp.org/en/jsr/detail?id=250 -License: CDDL - https://github.com/javaee/javax.annotation/blob/master/LICENSE +License: CDDL 1.1 - https://github.com/javaee/javax.annotation/blob/master/LICENSE + +| COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.1 +| +| 1. Definitions. +| +| 1.1. "Contributor" means each individual or entity that creates or +| contributes to the creation of Modifications. +| +| 1.2. "Contributor Version" means the combination of the Original +| Software, prior Modifications used by a Contributor (if any), and +| the Modifications made by that particular Contributor. +| +| 1.3. "Covered Software" means (a) the Original Software, or (b) +| Modifications, or (c) the combination of files containing Original +| Software with files containing Modifications, in each case including +| portions thereof. +| +| 1.4. "Executable" means the Covered Software in any form other than +| Source Code. +| +| 1.5. "Initial Developer" means the individual or entity that first +| makes Original Software available under this License. +| +| 1.6. "Larger Work" means a work which combines Covered Software or +| portions thereof with code not governed by the terms of this License. +| +| 1.7. "License" means this document. +| +| 1.8. "Licensable" means having the right to grant, to the maximum +| extent possible, whether at the time of the initial grant or +| subsequently acquired, any and all of the rights conveyed herein. +| +| 1.9. "Modifications" means the Source Code and Executable form of +| any of the following: +| +| A. Any file that results from an addition to, deletion from or +| modification of the contents of a file containing Original Software +| or previous Modifications; +| +| B. Any new file that contains any part of the Original Software or +| previous Modification; or +| +| C. Any new file that is contributed or otherwise made available +| under the terms of this License. 
+| +| 1.10. "Original Software" means the Source Code and Executable form +| of computer software code that is originally released under this +| License. +| +| 1.11. "Patent Claims" means any patent claim(s), now owned or +| hereafter acquired, including without limitation, method, process, +| and apparatus claims, in any patent Licensable by grantor. +| +| 1.12. "Source Code" means (a) the common form of computer software +| code in which modifications are made and (b) associated +| documentation included in or with such code. +| +| 1.13. "You" (or "Your") means an individual or a legal entity +| exercising rights under, and complying with all of the terms of, +| this License. For legal entities, "You" includes any entity which +| controls, is controlled by, or is under common control with You. +| For purposes of this definition, "control" means (a) the power, +| direct or indirect, to cause the direction or management of such +| entity, whether by contract or otherwise, or (b) ownership of more +| than fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants. +| +| 2.1. The Initial Developer Grant. +| +| Conditioned upon Your compliance with Section 3.1 below and subject +| to third party intellectual property claims, the Initial Developer +| hereby grants You a world-wide, royalty-free, non-exclusive license: +| +| (a) under intellectual property rights (other than patent or +| trademark) Licensable by Initial Developer, to use, reproduce, +| modify, display, perform, sublicense and distribute the Original +| Software (or portions thereof), with or without Modifications, +| and/or as part of a Larger Work; and +| +| (b) under Patent Claims infringed by the making, using or selling +| of Original Software, to make, have made, use, practice, sell, and +| offer for sale, and/or otherwise dispose of the Original Software +| (or portions thereof). 
+| +| (c) The licenses granted in Sections 2.1(a) and (b) are effective +| on the date Initial Developer first distributes or otherwise makes +| the Original Software available to a third party under the terms of +| this License. +| +| (d) Notwithstanding Section 2.1(b) above, no patent license is +| granted: (1) for code that You delete from the Original Software, +| or (2) for infringements caused by: (i) the modification of the +| Original Software, or (ii) the combination of the Original Software +| with other software or devices. +| +| 2.2. Contributor Grant. +| +| Conditioned upon Your compliance with Section 3.1 below and subject +| to third party intellectual property claims, each Contributor hereby +| grants You a world-wide, royalty-free, non-exclusive license: +| +| (a) under intellectual property rights (other than patent or +| trademark) Licensable by Contributor to use, reproduce, modify, +| display, perform, sublicense and distribute the Modifications +| created by such Contributor (or portions thereof), either on an +| unmodified basis, with other Modifications, as Covered Software +| and/or as part of a Larger Work; and +| +| (b) under Patent Claims infringed by the making, using, or selling +| of Modifications made by that Contributor either alone and/or in +| combination with its Contributor Version (or portions of such +| combination), to make, use, sell, offer for sale, have made, and/or +| otherwise dispose of: (1) Modifications made by that Contributor +| (or portions thereof); and (2) the combination of Modifications +| made by that Contributor with its Contributor Version (or portions +| of such combination). +| +| (c) The licenses granted in Sections 2.2(a) and 2.2(b) are +| effective on the date Contributor first makes Commercial Use of the +| Covered Software. 
+| +| (d) Notwithstanding Section 2.2(b) above, no patent license is +| granted: (1) for any code that Contributor has deleted from the +| Contributor Version; (2) for infringements caused by: (i) third +| party modifications of Contributor Version, or (ii) the combination +| of Modifications made by that Contributor with other software +| (except as part of the Contributor Version) or other devices; or +| (3) under Patent Claims infringed by Covered Software in the +| absence of Modifications made by that Contributor. +| +| 3. Distribution Obligations. +| +| 3.1. Availability of Source Code. +| +| Any Covered Software that You distribute or otherwise make available +| in Executable form must also be made available in Source Code form +| and that Source Code form must be distributed only under the terms +| of this License. You must include a copy of this License with every +| copy of the Source Code form of the Covered Software You distribute +| or otherwise make available. You must inform recipients of any such +| Covered Software in Executable form as to how they can obtain such +| Covered Software in Source Code form in a reasonable manner on or +| through a medium customarily used for software exchange. +| +| 3.2. Modifications. +| +| The Modifications that You create or to which You contribute are +| governed by the terms of this License. You represent that You +| believe Your Modifications are Your original creation(s) and/or You +| have sufficient rights to grant the rights conveyed by this License. +| +| 3.3. Required Notices. +| +| You must include a notice in each of Your Modifications that +| identifies You as the Contributor of the Modification. You may not +| remove or alter any copyright, patent or trademark notices contained +| within the Covered Software, or any notices of licensing or any +| descriptive text giving attribution to any Contributor or the +| Initial Developer. +| +| 3.4. Application of Additional Terms. 
+| +| You may not offer or impose any terms on any Covered Software in +| Source Code form that alters or restricts the applicable version of +| this License or the recipients' rights hereunder. You may choose to +| offer, and to charge a fee for, warranty, support, indemnity or +| liability obligations to one or more recipients of Covered Software. +| However, you may do so only on Your own behalf, and not on behalf of +| the Initial Developer or any Contributor. You must make it absolutely +| clear that any such warranty, support, indemnity or liability +| obligation is offered by You alone, and You hereby agree to indemnify +| the Initial Developer and every Contributor for any liability +| incurred by the Initial Developer or such Contributor as a result of +| warranty, support, indemnity or liability terms You offer. +| +| 3.5. Distribution of Executable Versions. +| +| You may distribute the Executable form of the Covered Software under +| the terms of this License or under the terms of a license of Your +| choice, which may contain terms different from this License, provided +| that You are in compliance with the terms of this License and that +| the license for the Executable form does not attempt to limit or +| alter the recipient's rights in the Source Code form from the rights +| set forth in this License. If You distribute the Covered Software in +| Executable form under a different license, You must make it +| absolutely clear that any terms which differ from this License are +| offered by You alone, not by the Initial Developer or Contributor. +| You hereby agree to indemnify the Initial Developer and every +| Contributor for any liability incurred by the Initial Developer or +| such Contributor as a result of any such terms You offer. +| +| 3.6. Larger Works. +| +| You may create a Larger Work by combining Covered Software with +| other code not governed by the terms of this License and distribute +| the Larger Work as a single product. 
In such a case, You must make +| sure the requirements of this License are fulfilled for the Covered +| Software. +| +| 4. Versions of the License. +| +| 4.1. New Versions. +| +| Oracle is the initial license steward and may publish revised and/or +| new versions of this License from time to time. Each version will be +| given a distinguishing version number. Except as provided in Section +| 4.3, no one other than the license steward has the right to modify +| this License. +| +| 4.2. Effect of New Versions. +| +| You may always continue to use, distribute or otherwise make the +| Covered Software available under the terms of the version of the +| License under which You originally received the Covered Software. If +| the Initial Developer includes a notice in the Original Software +| prohibiting it from being distributed or otherwise made available +| under any subsequent version of the License, You must distribute and +| make the Covered Software available under the terms of the version +| of the License under which You originally received the Covered +| Software. Otherwise, You may also choose to use, distribute or +| otherwise make the Covered Software available under the terms of any +| subsequent version of the License published by the license steward. +| +| 4.3. Modified Versions. +| +| When You are an Initial Developer and You want to create a new +| license for Your Original Software, You may create and use a +| modified version of this License if You: (a) rename the license and +| remove any references to the name of the license steward (except to +| note that the modified license differs from this License); and (b) +| otherwise make it clear that the license contains terms which differ +| from this License. +| +| 5. DISCLAIMER OF WARRANTY. 
+| +| COVERED SOFTWARE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS, +| WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, +| INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE COVERED SOFTWARE +| IS FREE OF DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR +| NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE +| OF THE COVERED SOFTWARE IS WITH YOU. SHOULD ANY COVERED SOFTWARE +| PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE INITIAL DEVELOPER OR +| ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY SERVICING, +| REPAIR OR CORRECTION. THIS DISCLAIMER OF WARRANTY CONSTITUTES AN +| ESSENTIAL PART OF THIS LICENSE. NO USE OF ANY COVERED SOFTWARE IS +| AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER. +| +| 6. TERMINATION. +| +| 6.1. This License and the rights granted hereunder will terminate +| automatically if You fail to comply with terms herein and fail to +| cure such breach within 30 days of becoming aware of the breach. +| Provisions which, by their nature, must remain in effect beyond the +| termination of this License shall survive. +| +| 6.2. 
If You assert a patent infringement claim (excluding +| declaratory judgment actions) against Initial Developer or a +| Contributor (the Initial Developer or Contributor against whom You +| assert such claim is referred to as "Participant") alleging that the +| Participant Software (meaning the Contributor Version where the +| Participant is a Contributor or the Original Software where the +| Participant is the Initial Developer) directly or indirectly +| infringes any patent, then any and all rights granted directly or +| indirectly to You by such Participant, the Initial Developer (if the +| Initial Developer is not the Participant) and all Contributors under +| Sections 2.1 and/or 2.2 of this License shall, upon 60 days notice +| from Participant terminate prospectively and automatically at the +| expiration of such 60 day notice period, unless if within such 60 +| day period You withdraw Your claim with respect to the Participant +| Software against such Participant either unilaterally or pursuant to +| a written agreement with Participant. +| +| 6.3. If You assert a patent infringement claim against Participant +| alleging that the Participant Software directly or indirectly +| infringes any patent where such claim is resolved (such as by +| license or settlement) prior to the initiation of patent +| infringement litigation, then the reasonable value of the licenses +| granted by such Participant under Sections 2.1 or 2.2 shall be +| taken into account in determining the amount or value of any payment +| or license. +| +| 6.4. In the event of termination under Sections 6.1 or 6.2 above, +| all end user licenses that have been validly granted by You or any +| distributor hereunder prior to termination (excluding licenses +| granted to You by any distributor) shall survive termination. +| +| 7. LIMITATION OF LIABILITY. 
+| +| UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT +| (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE +| INITIAL DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF +| COVERED SOFTWARE, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE +| TO ANY PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR +| CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT +| LIMITATION, DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER +| FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR +| LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN INFORMED OF THE +| POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF LIABILITY SHALL NOT +| APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY RESULTING FROM SUCH +| PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW PROHIBITS SUCH +| LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OR +| LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THIS EXCLUSION +| AND LIMITATION MAY NOT APPLY TO YOU. +| +| 8. U.S. GOVERNMENT END USERS. +| +| The Covered Software is a "commercial item," as that term is defined +| in 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer +| software" (as that term is defined at 48 C.F.R. 252.227-7014(a)(1)) +| and "commercial computer software documentation" as such terms are +| used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48 C.F.R. +| 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995), all +| U.S. Government End Users acquire Covered Software with only those +| rights set forth herein. This U.S. Government Rights clause is in +| lieu of, and supersedes, any other FAR, DFAR, or other clause or +| provision that addresses Government rights in computer software +| under this License. +| +| 9. MISCELLANEOUS. +| +| This License represents the complete agreement concerning subject +| matter hereof. 
If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. This License shall be governed by +| the law of the jurisdiction specified in a notice contained within +| the Original Software (except to the extent applicable law, if any, +| provides otherwise), excluding such jurisdiction's conflict-of-law +| provisions. Any litigation relating to this License shall be subject +| to the jurisdiction of the courts located in the jurisdiction and +| venue specified in a notice contained within the Original Software, +| with the losing party responsible for costs, including, without +| limitation, court costs and reasonable attorneys' fees and expenses. +| The application of the United Nations Convention on Contracts for +| the International Sale of Goods is expressly excluded. Any law or +| regulation which provides that the language of a contract shall be +| construed against the drafter shall not apply to this License. You +| agree that You alone are responsible for compliance with the United +| States export administration regulations (and the export control +| laws and regulation of any other countries) when You use, distribute +| or otherwise make available any Covered Software. +| +| 10. RESPONSIBILITY FOR CLAIMS. +| +| As between Initial Developer and the Contributors, each party is +| responsible for claims and damages arising, directly or indirectly, +| out of its utilization of rights under this License and You agree to +| work with Initial Developer and Contributors to distribute such +| responsibility on an equitable basis. Nothing herein is intended or +| shall be deemed to constitute any admission of liability. -------------------------------------------------------------------------------- -This product bundles checkerframework checker-qual. +This product bundles checkerframework checker-qual and checker-compat-qual. 
Project URL: https://checkerframework.org/ License: MIT + | The annotations are licensed under the MIT License. (The text of this | license appears below.) More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -570,17 +1441,17 @@ License: MIT | In addition, the cleanroom implementations of third-party annotations, | which the Checker Framework recognizes as aliases for its own | annotations, are licensed under the MIT License. -| +| | Permission is hereby granted, free of charge, to any person obtaining a copy | of this software and associated documentation files (the "Software"), to deal | in the Software without restriction, including without limitation the rights | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | copies of the Software, and to permit persons to whom the Software is | furnished to do so, subject to the following conditions: -| +| | The above copyright notice and this permission notice shall be included in | all copies or substantial portions of the Software. -| +| | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -593,21 +1464,21 @@ License: MIT This product bundles Animal Sniffer Annotations. +Project URL: https://github.com/mojohaus/animal-sniffer License: MIT -| The MIT License -| + | Copyright (c) 2009 codehaus.org. 
-| +| | Permission is hereby granted, free of charge, to any person obtaining a copy | of this software and associated documentation files (the "Software"), to deal | in the Software without restriction, including without limitation the rights | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | copies of the Software, and to permit persons to whom the Software is | furnished to do so, subject to the following conditions: -| +| | The above copyright notice and this permission notice shall be included in | all copies or substantial portions of the Software. -| +| | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -629,24 +1500,25 @@ This product bundles ThreeTen BP. Project URL: https://www.threeten.org/threetenbp License: BSD 3-Clause + | Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| +| | All rights reserved. -| +| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are met: -| +| | * Redistributions of source code must retain the above copyright notice, | this list of conditions and the following disclaimer. -| +| | * Redistributions in binary form must reproduce the above copyright notice, | this list of conditions and the following disclaimer in the documentation | and/or other materials provided with the distribution. -| +| | * Neither the name of JSR-310 nor the names of its contributors | may be used to endorse or promote products derived from this software | without specific prior written permission. 
-| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -691,26 +1563,27 @@ License: Public Domain - https://github.com/stleary/JSON-java/blob/master/LICENS This product bundles ThreeTen Extra. -Project URL: https://www.threeten.org/threeten-extra -License: BSD 3-clause +Project URL: https://www.threeten.org/threeten-extra +License: BSD 3-Clause + | Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| +| | All rights reserved. -| +| | * Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are met: -| +| | * Redistributions of source code must retain the above copyright notice, | this list of conditions and the following disclaimer. -| +| | * Redistributions in binary form must reproduce the above copyright notice, | this list of conditions and the following disclaimer in the documentation | and/or other materials provided with the distribution. -| +| | * Neither the name of JSR-310 nor the names of its contributors | may be used to endorse or promote products derived from this software | without specific prior written permission. -| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -732,6 +1605,27 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles the Common Expression Language (CEL) specification (shaded by gRPC-xds). 
+ +Project URL: https://github.com/google/cel-spec +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles xDS data plane API definitions (shaded by gRPC-xds). + +Project URL: https://github.com/cncf/xds +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles UDPA (Universal Data Plane API) definitions (shaded by gRPC-xds). + +Project URL: https://github.com/cncf/udpa +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles JSpecify. Project URL: https://github.com/jspecify/jspecify @@ -743,21 +1637,20 @@ This product bundles Stax2 API. Project URL: http://github.com/FasterXML/stax2-api License: BSD 2-Clause -| BSD 2-Clause License -| + | Copyright (c) 2008+, FasterXML, LLC | All rights reserved. -| +| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are met: -| +| | * Redistributions of source code must retain the above copyright notice, this | list of conditions and the following disclaimer. -| +| | * Redistributions in binary form must reproduce the above copyright notice, | this list of conditions and the following disclaimer in the documentation | and/or other materials provided with the distribution. 
-| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -778,7 +1671,444 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles JCTools (via Netty). +This product bundles JCTools (via Netty and OpenTelemetry). Project URL: https://github.com/JCTools/JCTools License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles WeakConcurrentMap (via OpenTelemetry). + +Copyright: 2014 Rafael Winterhalter +Project URL: https://github.com/raphw/weak-lock-free +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles MSV xsdlib (bundled by Woodstox). + +Project URL: https://github.com/xmlark/msv +License: BSD 3-Clause + +| Copyright 2001-2013 Oracle and/or its affiliates. All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| 1. Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| +| 2. Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| 3. Neither the name of the copyright holder nor the names of its contributors +| may be used to endorse or promote products derived from this software +| without specific prior written permission. 
+| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +This product bundles isorelax (bundled by Woodstox). + +Project URL: https://github.com/relaxng/jing-trang +License: CDDL 1.1 + +| COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.1 +| +| 1. Definitions. +| +| 1.1. "Contributor" means each individual or entity that creates or +| contributes to the creation of Modifications. +| +| 1.2. "Contributor Version" means the combination of the Original +| Software, prior Modifications used by a Contributor (if any), and +| the Modifications made by that particular Contributor. +| +| 1.3. "Covered Software" means (a) the Original Software, or (b) +| Modifications, or (c) the combination of files containing Original +| Software with files containing Modifications, in each case including +| portions thereof. +| +| 1.4. "Executable" means the Covered Software in any form other than +| Source Code. +| +| 1.5. "Initial Developer" means the individual or entity that first +| makes Original Software available under this License. +| +| 1.6. 
"Larger Work" means a work which combines Covered Software or +| portions thereof with code not governed by the terms of this License. +| +| 1.7. "License" means this document. +| +| 1.8. "Licensable" means having the right to grant, to the maximum +| extent possible, whether at the time of the initial grant or +| subsequently acquired, any and all of the rights conveyed herein. +| +| 1.9. "Modifications" means the Source Code and Executable form of +| any of the following: +| +| A. Any file that results from an addition to, deletion from or +| modification of the contents of a file containing Original Software +| or previous Modifications; +| +| B. Any new file that contains any part of the Original Software or +| previous Modification; or +| +| C. Any new file that is contributed or otherwise made available +| under the terms of this License. +| +| 1.10. "Original Software" means the Source Code and Executable form +| of computer software code that is originally released under this +| License. +| +| 1.11. "Patent Claims" means any patent claim(s), now owned or +| hereafter acquired, including without limitation, method, process, +| and apparatus claims, in any patent Licensable by grantor. +| +| 1.12. "Source Code" means (a) the common form of computer software +| code in which modifications are made and (b) associated +| documentation included in or with such code. +| +| 1.13. "You" (or "Your") means an individual or a legal entity +| exercising rights under, and complying with all of the terms of, +| this License. For legal entities, "You" includes any entity which +| controls, is controlled by, or is under common control with You. +| For purposes of this definition, "control" means (a) the power, +| direct or indirect, to cause the direction or management of such +| entity, whether by contract or otherwise, or (b) ownership of more +| than fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants. 
+| +| 2.1. The Initial Developer Grant. +| +| Conditioned upon Your compliance with Section 3.1 below and subject +| to third party intellectual property claims, the Initial Developer +| hereby grants You a world-wide, royalty-free, non-exclusive license: +| +| (a) under intellectual property rights (other than patent or +| trademark) Licensable by Initial Developer, to use, reproduce, +| modify, display, perform, sublicense and distribute the Original +| Software (or portions thereof), with or without Modifications, +| and/or as part of a Larger Work; and +| +| (b) under Patent Claims infringed by the making, using or selling +| of Original Software, to make, have made, use, practice, sell, and +| offer for sale, and/or otherwise dispose of the Original Software +| (or portions thereof). +| +| (c) The licenses granted in Sections 2.1(a) and (b) are effective +| on the date Initial Developer first distributes or otherwise makes +| the Original Software available to a third party under the terms of +| this License. +| +| (d) Notwithstanding Section 2.1(b) above, no patent license is +| granted: (1) for code that You delete from the Original Software, +| or (2) for infringements caused by: (i) the modification of the +| Original Software, or (ii) the combination of the Original Software +| with other software or devices. +| +| 2.2. Contributor Grant. 
+| +| Conditioned upon Your compliance with Section 3.1 below and subject +| to third party intellectual property claims, each Contributor hereby +| grants You a world-wide, royalty-free, non-exclusive license: +| +| (a) under intellectual property rights (other than patent or +| trademark) Licensable by Contributor to use, reproduce, modify, +| display, perform, sublicense and distribute the Modifications +| created by such Contributor (or portions thereof), either on an +| unmodified basis, with other Modifications, as Covered Software +| and/or as part of a Larger Work; and +| +| (b) under Patent Claims infringed by the making, using, or selling +| of Modifications made by that Contributor either alone and/or in +| combination with its Contributor Version (or portions of such +| combination), to make, use, sell, offer for sale, have made, and/or +| otherwise dispose of: (1) Modifications made by that Contributor +| (or portions thereof); and (2) the combination of Modifications +| made by that Contributor with its Contributor Version (or portions +| of such combination). +| +| (c) The licenses granted in Sections 2.2(a) and 2.2(b) are +| effective on the date Contributor first distributes or otherwise +| makes the Modifications available to a third party. +| +| (d) Notwithstanding Section 2.2(b) above, no patent license is +| granted: (1) for any code that Contributor has deleted from the +| Contributor Version; (2) for infringements caused by: (i) third +| party modifications of Contributor Version, or (ii) the combination +| of Modifications made by that Contributor with other software +| (except as part of the Contributor Version) or other devices; or +| (3) under Patent Claims infringed by Covered Software in the +| absence of Modifications made by that Contributor. +| +| 3. Distribution Obligations. +| +| 3.1. Availability of Source Code. 
+| +| Any Covered Software that You distribute or otherwise make available +| in Executable form must also be made available in Source Code form +| and that Source Code form must be distributed only under the terms +| of this License. You must include a copy of this License with every +| copy of the Source Code form of the Covered Software You distribute +| or otherwise make available. You must inform recipients of any such +| Covered Software in Executable form as to how they can obtain such +| Covered Software in Source Code form in a reasonable manner on or +| through a medium customarily used for software exchange. +| +| 3.2. Modifications. +| +| The Modifications that You create or to which You contribute are +| governed by the terms of this License. You represent that You +| believe Your Modifications are Your original creation(s) and/or You +| have sufficient rights to grant the rights conveyed by this License. +| +| 3.3. Required Notices. +| +| You must include a notice in each of Your Modifications that +| identifies You as the Contributor of the Modification. You may not +| remove or alter any copyright, patent or trademark notices contained +| within the Covered Software, or any notices of licensing or any +| descriptive text giving attribution to any Contributor or the +| Initial Developer. +| +| 3.4. Application of Additional Terms. +| +| You may not offer or impose any terms on any Covered Software in +| Source Code form that alters or restricts the applicable version of +| this License or the recipients' rights hereunder. You may choose to +| offer, and to charge a fee for, warranty, support, indemnity or +| liability obligations to one or more recipients of Covered Software. +| However, you may do so only on Your own behalf, and not on behalf of +| the Initial Developer or any Contributor. 
You must make it absolutely +| clear that any such warranty, support, indemnity or liability +| obligation is offered by You alone, and You hereby agree to indemnify +| the Initial Developer and every Contributor for any liability +| incurred by the Initial Developer or such Contributor as a result of +| warranty, support, indemnity or liability terms You offer. +| +| 3.5. Distribution of Executable Versions. +| +| You may distribute the Executable form of the Covered Software under +| the terms of this License or under the terms of a license of Your +| choice, which may contain terms different from this License, provided +| that You are in compliance with the terms of this License and that +| the license for the Executable form does not attempt to limit or +| alter the recipient's rights in the Source Code form from the rights +| set forth in this License. If You distribute the Covered Software in +| Executable form under a different license, You must make it +| absolutely clear that any terms which differ from this License are +| offered by You alone, not by the Initial Developer or Contributor. +| You hereby agree to indemnify the Initial Developer and every +| Contributor for any liability incurred by the Initial Developer or +| such Contributor as a result of any such terms You offer. +| +| 3.6. Larger Works. +| +| You may create a Larger Work by combining Covered Software with +| other code not governed by the terms of this License and distribute +| the Larger Work as a single product. In such a case, You must make +| sure the requirements of this License are fulfilled for the Covered +| Software. +| +| 4. Versions of the License. +| +| 4.1. New Versions. +| +| Oracle is the initial license steward and may publish revised and/or +| new versions of this License from time to time. Each version will be +| given a distinguishing version number. Except as provided in Section +| 4.3, no one other than the license steward has the right to modify +| this License. 
+| +| 4.2. Effect of New Versions. +| +| You may always continue to use, distribute or otherwise make the +| Covered Software available under the terms of the version of the +| License under which You originally received the Covered Software. If +| the Initial Developer includes a notice in the Original Software +| prohibiting it from being distributed or otherwise made available +| under any subsequent version of the License, You must distribute and +| make the Covered Software available under the terms of the version +| of the License under which You originally received the Covered +| Software. Otherwise, You may also choose to use, distribute or +| otherwise make the Covered Software available under the terms of any +| subsequent version of the License published by the license steward. +| +| 4.3. Modified Versions. +| +| When You are an Initial Developer and You want to create a new +| license for Your Original Software, You may create and use a +| modified version of this License if You: (a) rename the license and +| remove any references to the name of the license steward (except to +| note that the modified license differs from this License); and (b) +| otherwise make it clear that the license contains terms which differ +| from this License. +| +| 5. DISCLAIMER OF WARRANTY. +| +| COVERED SOFTWARE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS, +| WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, +| INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE COVERED SOFTWARE +| IS FREE OF DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR +| NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE +| OF THE COVERED SOFTWARE IS WITH YOU. SHOULD ANY COVERED SOFTWARE +| PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE INITIAL DEVELOPER OR +| ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY SERVICING, +| REPAIR OR CORRECTION. THIS DISCLAIMER OF WARRANTY CONSTITUTES AN +| ESSENTIAL PART OF THIS LICENSE. 
NO USE OF ANY COVERED SOFTWARE IS +| AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER. +| +| 6. TERMINATION. +| +| 6.1. This License and the rights granted hereunder will terminate +| automatically if You fail to comply with terms herein and fail to +| cure such breach within 30 days of becoming aware of the breach. +| Provisions which, by their nature, must remain in effect beyond the +| termination of this License shall survive. +| +| 6.2. If You assert a patent infringement claim (excluding +| declaratory judgment actions) against Initial Developer or a +| Contributor (the Initial Developer or Contributor against whom You +| assert such claim is referred to as "Participant") alleging that the +| Participant Software (meaning the Contributor Version where the +| Participant is a Contributor or the Original Software where the +| Participant is the Initial Developer) directly or indirectly +| infringes any patent, then any and all rights granted directly or +| indirectly to You by such Participant, the Initial Developer (if the +| Initial Developer is not the Participant) and all Contributors under +| Sections 2.1 and/or 2.2 of this License shall, upon 60 days notice +| from Participant terminate prospectively and automatically at the +| expiration of such 60 day notice period, unless if within such 60 +| day period You withdraw Your claim with respect to the Participant +| Software against such Participant either unilaterally or pursuant to +| a written agreement with Participant. +| +| 6.3. If You assert a patent infringement claim against Participant +| alleging that the Participant Software directly or indirectly +| infringes any patent where such claim is resolved (such as by +| license or settlement) prior to the initiation of patent +| infringement litigation, then the reasonable value of the licenses +| granted by such Participant under Sections 2.1 or 2.2 shall be +| taken into account in determining the amount or value of any payment +| or license. 
+| +| 6.4. In the event of termination under Sections 6.1 or 6.2 above, +| all end user licenses that have been validly granted by You or any +| distributor hereunder prior to termination (excluding licenses +| granted to You by any distributor) shall survive termination. +| +| 7. LIMITATION OF LIABILITY. +| +| UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT +| (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE +| INITIAL DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF +| COVERED SOFTWARE, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE +| TO ANY PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR +| CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT +| LIMITATION, DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER +| FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR +| LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN INFORMED OF THE +| POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF LIABILITY SHALL NOT +| APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY RESULTING FROM SUCH +| PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW PROHIBITS SUCH +| LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OR +| LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THIS EXCLUSION +| AND LIMITATION MAY NOT APPLY TO YOU. +| +| 8. U.S. GOVERNMENT END USERS. +| +| The Covered Software is a "commercial item," as that term is defined +| in 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer +| software" (as that term is defined at 48 C.F.R. 252.227-7014(a)(1)) +| and "commercial computer software documentation" as such terms are +| used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48 C.F.R. +| 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995), all +| U.S. Government End Users acquire Covered Software with only those +| rights set forth herein. This U.S. 
Government Rights clause is in +| lieu of, and supersedes, any other FAR, DFAR, or other clause or +| provision that addresses Government rights in computer software +| under this License. +| +| 9. MISCELLANEOUS. +| +| This License represents the complete agreement concerning subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. This License shall be governed by +| the law of the jurisdiction specified in a notice contained within +| the Original Software (except to the extent applicable law, if any, +| provides otherwise), excluding such jurisdiction's conflict-of-law +| provisions. Any litigation relating to this License shall be subject +| to the jurisdiction of the courts located in the jurisdiction and +| venue specified in a notice contained within the Original Software, +| with the losing party responsible for costs, including, without +| limitation, court costs and reasonable attorneys' fees and expenses. +| The application of the United Nations Convention on Contracts for +| the International Sale of Goods is expressly excluded. Any law or +| regulation which provides that the language of a contract shall be +| construed against the drafter shall not apply to this License. You +| agree that You alone are responsible for compliance with the United +| States export administration regulations (and the export control +| laws and regulation of any other countries) when You use, distribute +| or otherwise make available any Covered Software. +| +| 10. RESPONSIBILITY FOR CLAIMS. +| +| As between Initial Developer and the Contributors, each party is +| responsible for claims and damages arising, directly or indirectly, +| out of its utilization of rights under this License and You agree to +| work with Initial Developer and Contributors to distribute such +| responsibility on an equitable basis. 
Nothing herein is intended or +| shall be deemed to constitute any admission of liability. + +-------------------------------------------------------------------------------- + +This product bundles RELAX NG Datatype API (bundled by Woodstox). + +Project URL: https://github.com/relaxng/relaxng-datatype-java +License: BSD 3-Clause + +| Copyright (c) 2018 Oracle and/or its affiliates. All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| - Redistributions of source code must retain the above copyright +| notice, this list of conditions and the following disclaimer. +| +| - Redistributions in binary form must reproduce the above copyright +| notice, this list of conditions and the following disclaimer in the +| documentation and/or other materials provided with the distribution. +| +| - Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived +| from this software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +| IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +| THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +| PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/gcp-bundle/NOTICE b/gcp-bundle/NOTICE index 97eb794e3b72..98d13b8895d9 100644 --- a/gcp-bundle/NOTICE +++ b/gcp-bundle/NOTICE @@ -9,22 +9,37 @@ The Apache Software Foundation (http://www.apache.org/). This product bundles Jackson JSON Processor with the following in its NOTICE file: | # Jackson JSON processor -| +| | Jackson is a high-performance, Free/Open Source JSON processing library. | It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has | been in development since 2007. | It is currently developed by a community of developers. -| +| +| ## Copyright +| +| Copyright 2007-, Tatu Saloranta (tatu.saloranta@iki.fi) +| | ## Licensing -| +| | Jackson 2.x core and extension components are licensed under Apache License 2.0 | To find the details that apply to this artifact see the accompanying LICENSE file. -| +| | ## Credits -| +| | A list of contributors may be found from CREDITS(-2.x) file, which is included | in some artifacts (usually source distributions); but is always available | from the source code management (SCM) system project uses. +| +| ## FastDoubleParser +| +| jackson-core bundles a shaded copy of FastDoubleParser . +| That code is available under an MIT license +| under the following copyright. +| +| Copyright © 2023 Werner Randelshofer, Switzerland. MIT License. +| +| See FastDoubleParser-NOTICE for details of other source code included in FastDoubleParser +| and the licenses and copyrights that apply to that code. 
-------------------------------------------------------------------------------- @@ -32,51 +47,51 @@ This product bundles Netty with the following in its NOTICE file: | | The Netty Project | ================= -| +| | Please visit the Netty web site for more information: -| +| | * http://netty.io/ -| +| | Copyright 2016 The Netty Project -| +| | The Netty Project licenses this file to you under the Apache License, | version 2.0 (the "License"); you may not use this file except in compliance | with the License. You may obtain a copy of the License at: -| +| | http://www.apache.org/licenses/LICENSE-2.0 -| +| | Unless required by applicable law or agreed to in writing, software | distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | License for the specific language governing permissions and limitations | under the License. -| +| | ------------------------------------------------------------------------------- | This product contains a forked and modified version of Tomcat Native -| +| | * LICENSE: | * license/LICENSE.tomcat-native.txt (Apache License 2.0) | * HOMEPAGE: | * http://tomcat.apache.org/native-doc/ | * https://svn.apache.org/repos/asf/tomcat/native/ -| +| | This product contains the Maven wrapper scripts from 'Maven Wrapper', that provides an easy way to ensure a user has everything necessary to run the Maven build. -| +| | * LICENSE: | * license/LICENSE.mvn-wrapper.txt (Apache License 2.0) | * HOMEPAGE: | * https://github.com/takari/maven-wrapper -| +| | This product contains small piece of code to support AIX, taken from netbsd. -| +| | * LICENSE: | * license/LICENSE.aix-netbsd.txt (OpenSSL License) | * HOMEPAGE: | * https://ftp.netbsd.org/pub/NetBSD/NetBSD-current/src/crypto/external/bsd/openssl/dist -| -| +| +| | This product contains code from boringssl. 
-| +| | * LICENSE (Combination ISC and OpenSSL license) | * license/LICENSE.boringssl.txt (Combination ISC and OpenSSL license) | * HOMEPAGE: @@ -86,36 +101,36 @@ This product bundles Netty with the following in its NOTICE file: This product bundles gRPC with the following in its NOTICE file: | Copyright 2014 The gRPC Authors -| +| | Licensed under the Apache License, Version 2.0 (the "License"); | you may not use this file except in compliance with the License. | You may obtain a copy of the License at -| +| | http://www.apache.org/licenses/LICENSE-2.0 -| +| | Unless required by applicable law or agreed to in writing, software | distributed under the License is distributed on an "AS IS" BASIS, | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | See the License for the specific language governing permissions and | limitations under the License. -| +| | ----------------------------------------------------------------------- -| +| | This product contains a modified portion of 'OkHttp', an open source | HTTP & SPDY client for Android and Java applications, which can be obtained | at: -| +| | * LICENSE: | * okhttp/third_party/okhttp/LICENSE (Apache License 2.0) | * HOMEPAGE: | * https://github.com/square/okhttp | * LOCATION_IN_GRPC: | * okhttp/third_party/okhttp -| +| | This product contains a modified portion of 'Envoy', an open source | cloud-native high-performance edge/middle/service proxy, which can be | obtained at: -| +| | * LICENSE: | * xds/third_party/envoy/LICENSE (Apache License 2.0) | * NOTICE: @@ -124,11 +139,11 @@ This product bundles gRPC with the following in its NOTICE file: | * https://www.envoyproxy.io | * LOCATION_IN_GRPC: | * xds/third_party/envoy -| +| | This product contains a modified portion of 'protoc-gen-validate (PGV)', | an open source protoc plugin to generate polyglot message validators, | which can be obtained at: -| +| | * LICENSE: | * xds/third_party/protoc-gen-validate/LICENSE (Apache License 2.0) | * NOTICE: 
@@ -137,10 +152,10 @@ This product bundles gRPC with the following in its NOTICE file: | * https://github.com/envoyproxy/protoc-gen-validate | * LOCATION_IN_GRPC: | * xds/third_party/protoc-gen-validate -| +| | This product contains a modified portion of 'udpa', | an open source universal data plane API, which can be obtained at: -| +| | * LICENSE: | * xds/third_party/udpa/LICENSE (Apache License 2.0) | * HOMEPAGE: @@ -152,41 +167,41 @@ This product bundles gRPC with the following in its NOTICE file: This product bundles Perfmark with the following in its NOTICE file: | Copyright 2019 Google LLC -| +| | Licensed under the Apache License, Version 2.0 (the "License"); | you may not use this file except in compliance with the License. | You may obtain a copy of the License at -| +| | http://www.apache.org/licenses/LICENSE-2.0 -| +| | Unless required by applicable law or agreed to in writing, software | distributed under the License is distributed on an "AS IS" BASIS, | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | See the License for the specific language governing permissions and | limitations under the License. 
-| +| | ----------------------------------------------------------------------- -| +| | This product contains a modified portion of 'Catapult', an open source -| Trace Event viewer for Chome, Linux, and Android applications, which can +| Trace Event viewer for Chome, Linux, and Android applications, which can | be obtained at: -| +| | * LICENSE: | * traceviewer/src/main/resources/io/perfmark/traceviewer/third_party/catapult/LICENSE (New BSD License) | * HOMEPAGE: | * https://github.com/catapult-project/catapult -| +| | This product contains a modified portion of 'Polymer', a library for Web | Components, which can be obtained at: | * LICENSE: | * traceviewer/src/main/resources/io/perfmark/traceviewer/third_party/polymer/LICENSE (New BSD License) | * HOMEPAGE: | * https://github.com/Polymer/polymer -| -| +| +| | This product contains a modified portion of 'ASM', an open source | Java Bytecode library, which can be obtained at: -| +| | * LICENSE: | * agent/src/main/resources/io/perfmark/agent/third_party/asm/LICENSE (BSD style License) | * HOMEPAGE: @@ -196,61 +211,39 @@ This product bundles Perfmark with the following in its NOTICE file: This product bundles Conscrypt (openjdk-uber) with the following in its NOTICE file: | Copyright 2016 The Android Open Source Project -| +| | Licensed under the Apache License, Version 2.0 (the "License"); | you may not use this file except in compliance with the License. | You may obtain a copy of the License at -| +| | http://www.apache.org/licenses/LICENSE-2.0 -| +| | Unless required by applicable law or agreed to in writing, software | distributed under the License is distributed on an "AS IS" BASIS, | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | See the License for the specific language governing permissions and | limitations under the License. 
-| +| | ----------------------------------------------------------------------- | This product contains a modified portion of `Netty`, a configurable network | stack in Java, which can be obtained at: -| +| | * LICENSE: | * licenses/LICENSE.netty.txt (Apache License 2.0) | * HOMEPAGE: | * http://netty.io/ -| +| | This product contains a modified portion of `Apache Harmony`, modular Java runtime, | which can be obtained at: -| +| | * LICENSE: | * licenses/LICENSE.harmony.txt (Apache License 2.0) | * HOMEPAGE: | * https://harmony.apache.org/ -------------------------------------------------------------------------------- -This product bundles GCS Analytics Core with the following in its NOTICE file: -| # GCS Analytics Core -| -| GCS Analytics Core is a Java library designed to optimize analytics workloads on -| Google Cloud Storage (GCS). -| -| Copyright Google LLC -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. -| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. - --------------------------------------------------------------------------------- - This product bundles Envoy with the following in its NOTICE file: | Envoy | Copyright The Envoy Project Authors -| +| | Licensed under Apache License 2.0. See LICENSE for terms. 
diff --git a/gcp-bundle/build.gradle b/gcp-bundle/build.gradle index 6ebe05ccdbce..9c4907bcdaa5 100644 --- a/gcp-bundle/build.gradle +++ b/gcp-bundle/build.gradle @@ -23,6 +23,13 @@ project(":iceberg-gcp-bundle") { tasks.jar.dependsOn tasks.shadowJar + configurations { + implementation { + exclude group: 'com.google.code.findbugs', module: 'jsr305' + exclude group: 'org.slf4j' + } + } + dependencies { implementation platform(libs.google.libraries.bom) implementation "com.google.cloud:google-cloud-storage" @@ -42,10 +49,6 @@ project(":iceberg-gcp-bundle") { include 'NOTICE' } - dependencies { - exclude(dependency('org.slf4j:slf4j-api')) - } - // relocate GCP-specific versions relocate 'com.fasterxml.jackson', 'org.apache.iceberg.gcp.shaded.com.fasterxml.jackson' relocate 'com.google.common', 'org.apache.iceberg.gcp.shaded.com.google.common' @@ -59,4 +62,6 @@ project(":iceberg-gcp-bundle") { jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/gcp-bundle/runtime-deps.txt b/gcp-bundle/runtime-deps.txt new file mode 100644 index 000000000000..a109d4fb5676 --- /dev/null +++ b/gcp-bundle/runtime-deps.txt @@ -0,0 +1,112 @@ +com.fasterxml.jackson.core:jackson-annotations:2.18.3 +com.fasterxml.jackson.core:jackson-core:2.18.3 +com.fasterxml.jackson.core:jackson-databind:2.18.3 +com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.18.3 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.18.3 +com.fasterxml.woodstox:woodstox-core:7.0.0 +com.google.android:annotations:4.1.1.4 +com.google.api-client:google-api-client:2.7.2 +com.google.api.grpc:gapic-google-cloud-storage-v2:2.67.0 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1:3.27.0 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1beta1:0.199.0 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1beta2:0.199.0 +com.google.api.grpc:grpc-google-cloud-storage-v2:2.67.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1:3.27.0 
+com.google.api.grpc:proto-google-cloud-bigquerystorage-v1alpha:3.27.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta1:0.199.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta2:0.199.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta:3.27.0 +com.google.api.grpc:proto-google-cloud-kms-v1:0.185.0 +com.google.api.grpc:proto-google-cloud-monitoring-v3:3.92.0 +com.google.api.grpc:proto-google-cloud-storage-v2:2.67.0 +com.google.api.grpc:proto-google-common-protos:2.70.0 +com.google.api.grpc:proto-google-iam-v1:1.65.0 +com.google.api:api-common:2.62.0 +com.google.api:gax-grpc:2.79.0 +com.google.api:gax-httpjson:2.79.0 +com.google.api:gax:2.79.0 +com.google.apis:google-api-services-bigquery:v2-rev20251012-2.0.0 +com.google.apis:google-api-services-storage:v1-rev20260204-2.0.0 +com.google.auth:google-auth-library-credentials:1.46.0 +com.google.auth:google-auth-library-oauth2-http:1.46.0 +com.google.auto.value:auto-value-annotations:1.11.1 +com.google.cloud.gcs.analytics:client:1.2.3 +com.google.cloud.gcs.analytics:gcs-analytics-core:1.2.3 +com.google.cloud.opentelemetry:detector-resources-support:0.33.0 +com.google.cloud.opentelemetry:exporter-metrics:0.33.0 +com.google.cloud.opentelemetry:shared-resourcemapping:0.33.0 +com.google.cloud:google-cloud-bigquery:2.65.0 +com.google.cloud:google-cloud-bigquerystorage:3.27.0 +com.google.cloud:google-cloud-core-grpc:2.69.0 +com.google.cloud:google-cloud-core-http:2.69.0 +com.google.cloud:google-cloud-core:2.69.0 +com.google.cloud:google-cloud-kms:2.94.0 +com.google.cloud:google-cloud-monitoring:3.92.0 +com.google.cloud:google-cloud-storage:2.67.0 +com.google.code.gson:gson:2.12.1 +com.google.errorprone:error_prone_annotations:2.45.0 +com.google.flatbuffers:flatbuffers-java:24.3.25 +com.google.guava:failureaccess:1.0.3 +com.google.guava:guava:33.5.0-jre +com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava +com.google.http-client:google-http-client-apache-v2:2.1.0 
+com.google.http-client:google-http-client-appengine:2.1.0 +com.google.http-client:google-http-client-gson:2.1.0 +com.google.http-client:google-http-client-jackson2:2.1.0 +com.google.http-client:google-http-client:2.1.0 +com.google.j2objc:j2objc-annotations:3.1 +com.google.oauth-client:google-oauth-client:1.39.0 +com.google.protobuf:protobuf-java-util:4.33.2 +com.google.protobuf:protobuf-java:4.33.2 +com.google.re2j:re2j:1.8 +commons-codec:commons-codec:1.18.0 +io.grpc:grpc-alts:1.80.0 +io.grpc:grpc-api:1.80.0 +io.grpc:grpc-auth:1.80.0 +io.grpc:grpc-context:1.80.0 +io.grpc:grpc-core:1.80.0 +io.grpc:grpc-googleapis:1.80.0 +io.grpc:grpc-grpclb:1.80.0 +io.grpc:grpc-inprocess:1.80.0 +io.grpc:grpc-netty-shaded:1.80.0 +io.grpc:grpc-opentelemetry:1.80.0 +io.grpc:grpc-protobuf-lite:1.80.0 +io.grpc:grpc-protobuf:1.80.0 +io.grpc:grpc-rls:1.80.0 +io.grpc:grpc-services:1.80.0 +io.grpc:grpc-stub:1.80.0 +io.grpc:grpc-util:1.80.0 +io.grpc:grpc-xds:1.80.0 +io.netty:netty-buffer:4.1.110.Final +io.netty:netty-common:4.1.110.Final +io.opencensus:opencensus-api:0.31.1 +io.opencensus:opencensus-contrib-http-util:0.31.1 +io.opentelemetry.contrib:opentelemetry-gcp-resources:1.37.0-alpha +io.opentelemetry.semconv:opentelemetry-semconv:1.29.0-alpha +io.opentelemetry:opentelemetry-api:1.51.0 +io.opentelemetry:opentelemetry-context:1.51.0 +io.opentelemetry:opentelemetry-sdk-common:1.51.0 +io.opentelemetry:opentelemetry-sdk-extension-autoconfigure-spi:1.51.0 +io.opentelemetry:opentelemetry-sdk-logs:1.51.0 +io.opentelemetry:opentelemetry-sdk-metrics:1.51.0 +io.opentelemetry:opentelemetry-sdk-trace:1.51.0 +io.opentelemetry:opentelemetry-sdk:1.51.0 +io.perfmark:perfmark-api:0.27.0 +javax.annotation:javax.annotation-api:1.3.2 +org.apache.arrow:arrow-format:17.0.0 +org.apache.arrow:arrow-memory-core:17.0.0 +org.apache.arrow:arrow-memory-netty-buffer-patch:17.0.0 +org.apache.arrow:arrow-memory-netty:17.0.0 +org.apache.arrow:arrow-vector:17.0.0 +org.apache.commons:commons-lang3:3.20.0 
+org.apache.httpcomponents:httpclient:4.5.14 +org.apache.httpcomponents:httpcore:4.4.16 +org.checkerframework:checker-compat-qual:2.5.6 +org.checkerframework:checker-qual:3.49.0 +org.codehaus.mojo:animal-sniffer-annotations:1.26 +org.codehaus.woodstox:stax2-api:4.2.2 +org.conscrypt:conscrypt-openjdk-uber:2.5.2 +org.json:json:20250517 +org.jspecify:jspecify:1.0.0 +org.threeten:threeten-extra:1.8.0 +org.threeten:threetenbp:1.7.0 diff --git a/gcp/src/integration/java/org/apache/iceberg/gcp/gcs/TestGcsFileIO.java b/gcp/src/integration/java/org/apache/iceberg/gcp/gcs/TestGcsFileIO.java index b377d24c6042..626aacd17d33 100644 --- a/gcp/src/integration/java/org/apache/iceberg/gcp/gcs/TestGcsFileIO.java +++ b/gcp/src/integration/java/org/apache/iceberg/gcp/gcs/TestGcsFileIO.java @@ -31,7 +31,6 @@ import com.google.cloud.storage.BlobInfo; import com.google.cloud.storage.BucketInfo; import com.google.cloud.storage.Storage; -import com.google.cloud.storage.StorageException; import com.google.cloud.storage.StorageOptions; import java.io.IOException; import java.io.InputStream; @@ -39,10 +38,12 @@ import java.util.List; import java.util.Random; import java.util.stream.Collectors; +import org.apache.iceberg.exceptions.NotFoundException; import org.apache.iceberg.gcp.GCPProperties; import org.apache.iceberg.io.FileInfo; import org.apache.iceberg.io.IOUtil; import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.SeekableInputStream; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.junit.jupiter.api.AfterAll; @@ -222,14 +223,40 @@ public void deletePrefix() { } @Test - public void readMissingLocation() { + public void readMissingLocation() throws IOException { String location = String.format("gs://%s/path/to/data.parquet", BUCKET); + InputFile input = fileIO.newInputFile(location); + + // Creating an input stream or changing the read position in it are local 
operations + try (SeekableInputStream in = input.newStream()) { + in.seek(1); + } + + try (SeekableInputStream in = input.newStream()) { + assertThatThrownBy(in::read) + .isInstanceOf(NotFoundException.class) + .hasCauseInstanceOf(IOException.class) + .hasMessage("Location does not exist: gs://test-bucket/path/to/data.parquet"); + } + } + + @Test + public void readMissingLocationGcsAnalyticsCoreEnabled() throws IOException { + String location = String.format("gs://%s/path/to/data.parquet", BUCKET); + fileIO.initialize( + ImmutableMap.of( + GCPProperties.GCS_ANALYTICS_CORE_ENABLED, + "true", + GCPProperties.GCS_NO_AUTH, + "true", + GCPProperties.GCS_SERVICE_HOST, + String.format("http://localhost:%d", GCS_EMULATOR_PORT))); InputFile in = fileIO.newInputFile(location); assertThatThrownBy(() -> in.newStream().read()) - .isInstanceOf(IOException.class) - .hasCauseInstanceOf(StorageException.class) - .hasMessageContaining("404 Not Found"); + .isInstanceOf(NotFoundException.class) + .hasCauseInstanceOf(IOException.class) + .hasMessage("Location does not exist: gs://test-bucket/path/to/data.parquet"); } @Test diff --git a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSExceptionUtil.java b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSExceptionUtil.java new file mode 100644 index 000000000000..681a2436e622 --- /dev/null +++ b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSExceptionUtil.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.gcp.gcs; + +import com.google.cloud.storage.BlobId; +import com.google.cloud.storage.StorageException; +import java.io.IOException; +import org.apache.iceberg.exceptions.NotFoundException; + +final class GCSExceptionUtil { + private GCSExceptionUtil() {} + + static void throwNotFoundIfNotPresent(IOException ioException, BlobId blobId) { + if (ioException.getCause() instanceof StorageException storageException + && storageException.getCode() == 404) { + throw new NotFoundException(ioException, "Location does not exist: %s", blobId.toGsUtilUri()); + } + } +} diff --git a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputFile.java b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputFile.java index 497af03bcdaa..12dc71b5a181 100644 --- a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputFile.java +++ b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputFile.java @@ -94,11 +94,11 @@ public SeekableInputStream newStream() { private SeekableInputStream newGoogleCloudStorageInputStream() throws IOException { if (null == blobSize) { return new GcsInputStreamWrapper( - GoogleCloudStorageInputStream.create(gcsFileSystem(), gcsItemId()), metrics()); + GoogleCloudStorageInputStream.create(gcsFileSystem(), gcsItemId()), blobId(), metrics()); } return new GcsInputStreamWrapper( - GoogleCloudStorageInputStream.create(gcsFileSystem(), gcsFileInfo()), metrics()); + GoogleCloudStorageInputStream.create(gcsFileSystem(), gcsFileInfo()), blobId(), metrics()); } private GcsItemId gcsItemId() { diff --git 
a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputStream.java b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputStream.java index 3b41ae21d34e..910e97e0c178 100644 --- a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputStream.java +++ b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputStream.java @@ -127,7 +127,12 @@ public int read() throws IOException { singleByteBuffer.position(0); pos += 1; - channel.read(singleByteBuffer); + try { + channel.read(singleByteBuffer); + } catch (IOException e) { + GCSExceptionUtil.throwNotFoundIfNotPresent(e, blobId); + throw e; + } readBytes.increment(); readOperations.increment(); @@ -174,7 +179,12 @@ private int read(ReadChannel readChannel, ByteBuffer buffer, int off, int len) throws IOException { buffer.position(off); buffer.limit(Math.min(off + len, buffer.capacity())); - return readChannel.read(buffer); + try { + return readChannel.read(buffer); + } catch (IOException e) { + GCSExceptionUtil.throwNotFoundIfNotPresent(e, blobId); + throw e; + } } @Override diff --git a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GcsInputStreamWrapper.java b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GcsInputStreamWrapper.java index 2e1dfdd73c08..25ba7662dd55 100644 --- a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GcsInputStreamWrapper.java +++ b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GcsInputStreamWrapper.java @@ -21,6 +21,7 @@ import com.google.api.client.util.Preconditions; import com.google.cloud.gcs.analyticscore.client.GcsObjectRange; import com.google.cloud.gcs.analyticscore.core.GoogleCloudStorageInputStream; +import com.google.cloud.storage.BlobId; import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; @@ -37,10 +38,14 @@ class GcsInputStreamWrapper extends SeekableInputStream implements RangeReadable private final Counter readBytes; private final Counter readOperations; private final GoogleCloudStorageInputStream stream; + private final BlobId blobId; - 
GcsInputStreamWrapper(GoogleCloudStorageInputStream stream, MetricsContext metrics) { + GcsInputStreamWrapper( + GoogleCloudStorageInputStream stream, BlobId blobId, MetricsContext metrics) { Preconditions.checkArgument(null != stream, "Invalid input stream : null"); + Preconditions.checkArgument(null != blobId, "Invalid blobId : null"); this.stream = stream; + this.blobId = blobId; this.readBytes = metrics.counter(FileIOMetricsContext.READ_BYTES, MetricsContext.Unit.BYTES); this.readOperations = metrics.counter(FileIOMetricsContext.READ_OPERATIONS); } @@ -57,7 +62,13 @@ public void seek(long newPos) throws IOException { @Override public int read() throws IOException { - int readByte = stream.read(); + int readByte; + try { + readByte = stream.read(); + } catch (IOException e) { + GCSExceptionUtil.throwNotFoundIfNotPresent(e, blobId); + throw e; + } readBytes.increment(); readOperations.increment(); return readByte; @@ -70,7 +81,13 @@ public int read(byte[] b) throws IOException { @Override public int read(byte[] b, int off, int len) throws IOException { - int bytesRead = stream.read(b, off, len); + int bytesRead; + try { + bytesRead = stream.read(b, off, len); + } catch (IOException e) { + GCSExceptionUtil.throwNotFoundIfNotPresent(e, blobId); + throw e; + } if (bytesRead > 0) { readBytes.increment(bytesRead); } @@ -80,12 +97,22 @@ public int read(byte[] b, int off, int len) throws IOException { @Override public void readFully(long position, byte[] buffer, int offset, int length) throws IOException { - stream.readFully(position, buffer, offset, length); + try { + stream.readFully(position, buffer, offset, length); + } catch (IOException e) { + GCSExceptionUtil.throwNotFoundIfNotPresent(e, blobId); + throw e; + } } @Override public int readTail(byte[] buffer, int offset, int length) throws IOException { - return stream.readTail(buffer, offset, length); + try { + return stream.readTail(buffer, offset, length); + } catch (IOException e) { + 
GCSExceptionUtil.throwNotFoundIfNotPresent(e, blobId); + throw e; + } } @Override @@ -101,8 +128,12 @@ public void readVectored(List ranges, IntFunction allocat .setByteBufferFuture(fileRange.byteBuffer()) .build()) .collect(Collectors.toList()); - - stream.readVectored(objectRanges, allocate); + try { + stream.readVectored(objectRanges, allocate); + } catch (IOException e) { + GCSExceptionUtil.throwNotFoundIfNotPresent(e, blobId); + throw e; + } } @Override diff --git a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGCSFileIOCredentialRefresh.java b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGCSFileIOCredentialRefresh.java index d0c05483add3..0b9bb37f5f90 100644 --- a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGCSFileIOCredentialRefresh.java +++ b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGCSFileIOCredentialRefresh.java @@ -29,6 +29,7 @@ import java.util.Map; import java.util.concurrent.TimeUnit; import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.TestHelpers; import org.apache.iceberg.gcp.GCPProperties; import org.apache.iceberg.io.StorageCredential; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -226,4 +227,70 @@ void credentialRefreshWithinFiveMinuteWindow() { }); } } + + @Test + void refreshedCredentialsAreKryoSerializable() throws Exception { + // Verify that a GCSFileIO whose credentials have been refreshed at runtime can still be + // round-tripped through Kryo. The internal storageCredentials list must be backed by a + // collection that Kryo can serialize and deserialize. 
+ String nearExpiryMs = Long.toString(Instant.now().plus(3, ChronoUnit.MINUTES).toEpochMilli()); + + StorageCredential initialCredential = + StorageCredential.create( + "gs://bucket/path", + ImmutableMap.of( + GCPProperties.GCS_OAUTH2_TOKEN, + "initialToken", + GCPProperties.GCS_OAUTH2_TOKEN_EXPIRES_AT, + nearExpiryMs)); + + String refreshedExpiryMs = + Long.toString(Instant.now().plus(1, ChronoUnit.HOURS).toEpochMilli()); + LoadCredentialsResponse refreshResponse = + ImmutableLoadCredentialsResponse.builder() + .addCredentials( + ImmutableCredential.builder() + .prefix("gs://bucket/path") + .config( + ImmutableMap.of( + GCPProperties.GCS_OAUTH2_TOKEN, + "refreshedToken", + GCPProperties.GCS_OAUTH2_TOKEN_EXPIRES_AT, + refreshedExpiryMs)) + .build()) + .build(); + + HttpRequest mockRequest = request("/v1/credentials").withMethod(HttpMethod.GET.name()); + mockServer + .when(mockRequest) + .respond( + response(LoadCredentialsResponseParser.toJson(refreshResponse)).withStatusCode(200)); + + Map properties = + ImmutableMap.of( + GCPProperties.GCS_OAUTH2_REFRESH_CREDENTIALS_ENDPOINT, + credentialsUri, + CatalogProperties.URI, + catalogUri); + + try (GCSFileIO fileIO = new GCSFileIO()) { + fileIO.initialize(properties); + fileIO.setCredentials(List.of(initialCredential)); + + fileIO.client(); + + // Wait for the refresh to update the in-memory credentials + Awaitility.await() + .atMost(10, TimeUnit.SECONDS) + .untilAsserted( + () -> + assertThat(fileIO.credentials().get(0).config()) + .containsEntry(GCPProperties.GCS_OAUTH2_TOKEN, "refreshedToken")); + + // Round-trip through Kryo and verify the credentials still match + try (GCSFileIO deserialized = TestHelpers.KryoHelpers.roundTripSerialize(fileIO)) { + assertThat(deserialized.credentials()).isEqualTo(fileIO.credentials()); + } + } + } } diff --git a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGCSInputStream.java b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGCSInputStream.java index f367db94264a..8cc85fad72fd 
100644 --- a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGCSInputStream.java +++ b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGCSInputStream.java @@ -163,6 +163,9 @@ private void readAndCheckRanges( @Test public void testClose() throws Exception { BlobId blobId = BlobId.fromGsUtilUri("gs://bucket/path/to/closed.dat"); + byte[] data = randomData(1024 * 1024); + writeGCSData(blobId, data); + SeekableInputStream closed = new GCSInputStream(storage, blobId, null, gcpProperties, MetricsContext.nullMetrics()); closed.close(); diff --git a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGcsInputStreamWrapper.java b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGcsInputStreamWrapper.java index 2320037bd017..c6eae113d52d 100644 --- a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGcsInputStreamWrapper.java +++ b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGcsInputStreamWrapper.java @@ -20,6 +20,7 @@ import com.google.cloud.gcs.analyticscore.client.GcsObjectRange; import com.google.cloud.gcs.analyticscore.core.GoogleCloudStorageInputStream; +import com.google.cloud.storage.BlobId; import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; @@ -44,7 +45,10 @@ public class TestGcsInputStreamWrapper { @BeforeEach public void before() { inputStreamWrapper = - new GcsInputStreamWrapper(googleCloudStorageInputStream, MetricsContext.nullMetrics()); + new GcsInputStreamWrapper( + googleCloudStorageInputStream, + BlobId.of("mockbucket", "mockname"), + MetricsContext.nullMetrics()); } @Test diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 2c1b7df56e9c..b16aa98c76bf 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -33,9 +33,10 @@ arrow = "15.0.2" avro = "1.12.1" assertj-core = "3.27.7" awaitility = "4.3.0" -awssdk-bom = "2.42.18" -azuresdk-bom = "1.3.5" +awssdk-bom = "2.42.41" +azuresdk-bom = "1.3.6" awssdk-s3accessgrants = "2.4.1" +bouncycastle = "1.84" bson-ver = "4.11.5" caffeine = "2.9.3" 
calcite = "1.41.0" @@ -44,43 +45,44 @@ delta-standalone = "3.3.2" delta-spark = "3.3.2" derby = "10.15.2.0" esotericsoftware-kryo = "4.0.3" -errorprone-annotations = "2.48.0" +errorprone-annotations = "2.49.0" failsafe = "3.3.2" findbugs-jsr305 = "3.0.2" flink120 = { strictly = "1.20.1"} flink20 = { strictly = "2.0.0"} flink21 = { strictly = "2.1.0"} -google-libraries-bom = "26.78.0" +google-libraries-bom = "26.80.0" gcs-analytics-core = "1.2.3" -guava = "33.5.0-jre" +guava = "33.6.0-jre" hadoop3 = "3.4.3" -httpcomponents-httpclient5 = "5.6" +httpcomponents-httpclient5 = "5.6.1" hive2 = { strictly = "2.3.10"} # see rich version usage explanation above immutables-value = "2.12.1" jackson-annotations = "2.21" -jackson-bom = "2.21.2" +jackson-bom = "2.21.3" jackson214 = { strictly = "2.14.2"} jackson215 = { strictly = "2.15.2"} # see rich version usage explanation above jakarta-el-api = "3.0.3" jakarta-servlet-api = "6.1.0" jaxb-api = "2.3.1" jaxb-runtime = "2.3.9" -jetty = "11.0.26" -junit = "5.14.3" -junit-platform = "1.14.3" +jetty = "12.1.8" +joda = "2.14.2" +junit = "5.14.4" +junit-platform = "1.14.4" junit-pioneer = "2.3.0" kafka = "3.9.2" kryo-shaded = "4.0.3" -lz4Java = "1.10.4" +lz4Java = "1.11.0" microprofile-openapi-api = "3.1.2" mockito = "4.11.0" mockserver = "5.15.0" -nessie = "0.107.4" -netty-buffer = "4.2.10.Final" +nessie = "0.107.5" +netty-buffer = "4.2.12.Final" object-client-bundle = "3.3.2" orc = "1.9.8" parquet = "1.17.0" -roaringbitmap = "1.6.13" +roaringbitmap = "1.6.14" scala-collection-compat = "2.14.0" slf4j = "2.0.17" snowflake-jdbc = "3.28.0" @@ -88,8 +90,8 @@ spark34 = "3.4.4" spark35 = "3.5.8" spark40 = "4.0.2" spark41 = "4.1.1" -sqlite-jdbc = "3.51.3.0" -testcontainers = "2.0.4" +sqlite-jdbc = "3.53.0.0" +testcontainers = "2.0.5" tez08 = { strictly = "0.8.4"} # see rich version usage explanation above vortex = "0.67.0" @@ -111,6 +113,9 @@ awssdk-bom = { module = "software.amazon.awssdk:bom", version.ref = "awssdk-bom" 
awssdk-s3accessgrants = { module = "software.amazon.s3.accessgrants:aws-s3-accessgrants-java-plugin", version.ref = "awssdk-s3accessgrants" } azuresdk-bom = { module = "com.azure:azure-sdk-bom", version.ref = "azuresdk-bom" } bson = { module = "org.mongodb:bson", version.ref = "bson-ver"} +bouncycastle-bcpkix = { module = "org.bouncycastle:bcpkix-jdk18on", version.ref = "bouncycastle" } +bouncycastle-bcprov = { module = "org.bouncycastle:bcprov-jdk18on", version.ref = "bouncycastle" } +bouncycastle-bcutil = { module = "org.bouncycastle:bcutil-jdk18on", version.ref = "bouncycastle" } caffeine = { module = "com.github.ben-manes.caffeine:caffeine", version.ref = "caffeine" } calcite-core = { module = "org.apache.calcite:calcite-core", version.ref = "calcite" } calcite-druid = { module = "org.apache.calcite:calcite-druid", version.ref = "calcite" } @@ -208,8 +213,10 @@ flink21-test-utilsjunit = { module = "org.apache.flink:flink-test-utils-junit", guava-testlib = { module = "com.google.guava:guava-testlib", version.ref = "guava" } jakarta-el-api = { module = "jakarta.el:jakarta.el-api", version.ref = "jakarta-el-api" } jakarta-servlet = {module = "jakarta.servlet:jakarta.servlet-api", version.ref = "jakarta-servlet-api"} -jetty-server = { module = "org.eclipse.jetty:jetty-server", version.ref = "jetty" } -jetty-servlet = { module = "org.eclipse.jetty:jetty-servlet", version.ref = "jetty" } +jetty-compression-server = { module = "org.eclipse.jetty.compression:jetty-compression-server", version.ref = "jetty" } +jetty-compression-gzip = { module = "org.eclipse.jetty.compression:jetty-compression-gzip", version.ref = "jetty" } +jetty-servlet = { module = "org.eclipse.jetty.ee10:jetty-ee10-servlet", version.ref = "jetty" } +joda-time = { module = "joda-time:joda-time", version.ref = "joda" } junit-jupiter = { module = "org.junit.jupiter:junit-jupiter", version.ref = "junit" } junit-jupiter-engine = { module = "org.junit.jupiter:junit-jupiter-engine", version.ref = "junit" } 
junit-pioneer = { module = "org.junit-pioneer:junit-pioneer", version.ref = "junit-pioneer" } diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java index 93267716db66..4d881c515d48 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java @@ -65,6 +65,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.util.LocationUtil; +import org.apache.iceberg.util.PropertyUtil; import org.apache.iceberg.view.BaseMetastoreViewCatalog; import org.apache.iceberg.view.View; import org.apache.iceberg.view.ViewBuilder; @@ -94,6 +95,7 @@ public class HiveCatalog extends BaseMetastoreViewCatalog private KeyManagementClient keyManagementClient; private ClientPool clients; private boolean listAllTables = false; + private boolean uniqueTableLocation; private Map catalogProperties; public HiveCatalog() {} @@ -131,6 +133,12 @@ public void initialize(String inputName, Map properties) { this.keyManagementClient = EncryptionUtil.createKmsClient(properties); } + this.uniqueTableLocation = + PropertyUtil.propertyAsBoolean( + properties, + CatalogProperties.UNIQUE_TABLE_LOCATION, + CatalogProperties.UNIQUE_TABLE_LOCATION_DEFAULT); + this.clients = new CachedClientPool(conf, properties); } @@ -708,13 +716,14 @@ protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { // - Create the metadata in HMS, and this way committing the changes // Create a new location based on the namespace / database if it is set on database level + String tableLocation = LocationUtil.tableLocation(tableIdentifier, uniqueTableLocation); try { Database databaseData = clients.run(client -> client.getDatabase(tableIdentifier.namespace().levels()[0])); if (databaseData.getLocationUri() != 
null) { // If the database location is set use it as a base. String databaseLocation = LocationUtil.stripTrailingSlash(databaseData.getLocationUri()); - return String.format("%s/%s", databaseLocation, tableIdentifier.name()); + return String.format("%s/%s", databaseLocation, tableLocation); } } catch (NoSuchObjectException e) { @@ -731,7 +740,7 @@ protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { // Otherwise, stick to the {WAREHOUSE_DIR}/{DB_NAME}.db/{TABLE_NAME} path String databaseLocation = databaseLocation(tableIdentifier.namespace().levels()[0]); - return String.format("%s/%s", databaseLocation, tableIdentifier.name()); + return String.format("%s/%s", databaseLocation, tableLocation); } private String databaseLocation(String databaseName) { diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java index 20f9eb7f616e..d1ff5db66ad4 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java @@ -167,6 +167,8 @@ private static String convertToTypeString(Type type) { case FIXED: case BINARY: return "binary"; + case VARIANT: + return "unknown"; case DECIMAL: final Types.DecimalType decimalType = (Types.DecimalType) type; return String.format("decimal(%s,%s)", decimalType.precision(), decimalType.scale()); diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java index 68aedebf4771..1038d907e718 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java @@ -35,6 +35,7 @@ import org.apache.hadoop.hive.metastore.api.Table; import org.apache.iceberg.BaseMetastoreOperations; import 
org.apache.iceberg.BaseMetastoreTableOperations; +import org.apache.iceberg.CatalogProperties; import org.apache.iceberg.ClientPool; import org.apache.iceberg.LocationProviders; import org.apache.iceberg.TableMetadata; @@ -56,8 +57,9 @@ import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.LocationProvider; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.util.PropertyUtil; import org.apache.thrift.TException; import org.slf4j.Logger; @@ -142,15 +144,17 @@ public EncryptionManager encryption() { } if (tableKeyId != null) { - if (keyManagementClient == null) { - throw new RuntimeException( - "Can't create encryption manager, because key management client is not set"); - } - - Map encryptionProperties = Maps.newHashMap(); - encryptionProperties.put(TableProperties.ENCRYPTION_TABLE_KEY, tableKeyId); - encryptionProperties.put( - TableProperties.ENCRYPTION_DEK_LENGTH, String.valueOf(encryptionDekLength)); + Preconditions.checkArgument( + keyManagementClient != null, + "Cannot create encryption manager without a key management client. 
Consider setting the '%s' catalog property", + CatalogProperties.ENCRYPTION_KMS_IMPL); + + Map encryptionProperties = + ImmutableMap.of( + TableProperties.ENCRYPTION_TABLE_KEY, + tableKeyId, + TableProperties.ENCRYPTION_DEK_LENGTH, + String.valueOf(encryptionDekLength)); encryptionManager = EncryptionUtil.createEncryptionManager( @@ -312,17 +316,16 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { base.properties().keySet().stream() .filter(key -> !tableMetadata.properties().containsKey(key)) .collect(Collectors.toSet()); - } - if (removedProps.contains(TableProperties.ENCRYPTION_TABLE_KEY)) { - throw new IllegalArgumentException("Cannot remove key in encrypted table"); - } + Preconditions.checkArgument( + !removedProps.contains(TableProperties.ENCRYPTION_TABLE_KEY), + "Cannot remove key ID from an encrypted table"); - if (base != null - && !Objects.equals( - base.properties().get(TableProperties.ENCRYPTION_TABLE_KEY), - metadata.properties().get(TableProperties.ENCRYPTION_TABLE_KEY))) { - throw new IllegalArgumentException("Cannot modify key in encrypted table"); + Preconditions.checkArgument( + Objects.equals( + base.properties().get(TableProperties.ENCRYPTION_TABLE_KEY), + metadata.properties().get(TableProperties.ENCRYPTION_TABLE_KEY)), + "Cannot modify key ID of an encrypted table"); } HMSTablePropertyHelper.updateHmsTableForIcebergTable( diff --git a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java index 1592a3461b40..59c19a5d095d 100644 --- a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java +++ b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java @@ -205,6 +205,13 @@ public void testConversionWithoutLastComment() { assertThat(schema.asStruct()).isEqualTo(expected.asStruct()); } + @Test + public void testVariantTypeConvertToHiveSchema() { + Schema schema = new Schema(optional(0, 
"variant_field", Types.VariantType.get())); + List hiveSchema = HiveSchemaUtil.convert(schema); + assertThat(hiveSchema).containsExactly(new FieldSchema("variant_field", "unknown", null)); + } + protected List getSupportedFieldSchemas() { List fields = Lists.newArrayListWithCapacity(10); fields.add(new FieldSchema("c_float", serdeConstants.FLOAT_TYPE_NAME, "float comment")); diff --git a/kafka-connect/build.gradle b/kafka-connect/build.gradle index af1f9cbbb62b..9f99277d3450 100644 --- a/kafka-connect/build.gradle +++ b/kafka-connect/build.gradle @@ -82,7 +82,7 @@ project(':iceberg-kafka-connect:iceberg-kafka-connect-runtime') { force 'org.apache.hadoop.thirdparty:hadoop-shaded-guava:1.5.0' force 'com.fasterxml.woodstox:woodstox-core:6.7.0' force 'commons-beanutils:commons-beanutils:1.11.0' - force 'io.grpc:grpc-netty-shaded:1.80.0' + force 'io.grpc:grpc-netty-shaded:1.81.0' } } } @@ -255,6 +255,8 @@ project(':iceberg-kafka-connect:iceberg-kafka-connect-runtime') { check.dependsOn integrationTest assemble.dependsOn distZip, hiveDistZip + + apply from: "${rootDir}/runtime-deps.gradle" } project(':iceberg-kafka-connect:iceberg-kafka-connect-transforms') { diff --git a/kafka-connect/kafka-connect-runtime/runtime-deps.txt b/kafka-connect/kafka-connect-runtime/runtime-deps.txt new file mode 100644 index 000000000000..4d61894086be --- /dev/null +++ b/kafka-connect/kafka-connect-runtime/runtime-deps.txt @@ -0,0 +1,239 @@ +com.azure:azure-core-http-netty:1.16.3 +com.azure:azure-core:1.57.1 +com.azure:azure-identity:1.18.2 +com.azure:azure-json:1.5.1 +com.azure:azure-storage-blob:12.33.3 +com.azure:azure-storage-common:12.32.2 +com.azure:azure-storage-file-datalake:12.26.3 +com.azure:azure-storage-internal-avro:12.18.2 +com.azure:azure-xml:1.2.1 +com.fasterxml.jackson.core:jackson-annotations:2.21 +com.fasterxml.jackson.core:jackson-core:2.21.3 +com.fasterxml.jackson.core:jackson-databind:2.21.3 +com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.21.3 
+com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.21.3 +com.fasterxml.woodstox:woodstox-core:6.7.0 +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.github.luben:zstd-jni:1.5.7-3 +com.github.pjfanning:jersey-json:1.22.0 +com.google.android:annotations:4.1.1.4 +com.google.api-client:google-api-client:2.7.2 +com.google.api.grpc:gapic-google-cloud-storage-v2:2.67.0 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1:3.27.0 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1beta1:0.199.0 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1beta2:0.199.0 +com.google.api.grpc:grpc-google-cloud-storage-v2:2.67.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1:3.27.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1alpha:3.27.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta1:0.199.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta2:0.199.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta:3.27.0 +com.google.api.grpc:proto-google-cloud-monitoring-v3:3.92.0 +com.google.api.grpc:proto-google-cloud-storage-v2:2.67.0 +com.google.api.grpc:proto-google-common-protos:2.70.0 +com.google.api.grpc:proto-google-iam-v1:1.65.0 +com.google.api:api-common:2.62.0 +com.google.api:gax-grpc:2.79.0 +com.google.api:gax-httpjson:2.79.0 +com.google.api:gax:2.79.0 +com.google.apis:google-api-services-bigquery:v2-rev20251012-2.0.0 +com.google.apis:google-api-services-storage:v1-rev20260204-2.0.0 +com.google.auth:google-auth-library-credentials:1.46.0 +com.google.auth:google-auth-library-oauth2-http:1.46.0 +com.google.auto.value:auto-value-annotations:1.11.0 +com.google.cloud.opentelemetry:detector-resources-support:0.33.0 +com.google.cloud.opentelemetry:exporter-metrics:0.33.0 +com.google.cloud.opentelemetry:shared-resourcemapping:0.33.0 +com.google.cloud:google-cloud-bigquery:2.65.0 +com.google.cloud:google-cloud-bigquerystorage:3.27.0 +com.google.cloud:google-cloud-core-grpc:2.69.0 
+com.google.cloud:google-cloud-core-http:2.69.0 +com.google.cloud:google-cloud-core:2.69.0 +com.google.cloud:google-cloud-monitoring:3.92.0 +com.google.cloud:google-cloud-storage:2.67.0 +com.google.code.findbugs:jsr305:3.0.2 +com.google.code.gson:gson:2.12.1 +com.google.errorprone:error_prone_annotations:2.45.0 +com.google.flatbuffers:flatbuffers-java:25.2.10 +com.google.guava:failureaccess:1.0.3 +com.google.guava:guava:33.5.0-jre +com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava +com.google.http-client:google-http-client-apache-v2:2.1.0 +com.google.http-client:google-http-client-appengine:2.1.0 +com.google.http-client:google-http-client-gson:2.1.0 +com.google.http-client:google-http-client-jackson2:2.1.0 +com.google.http-client:google-http-client:2.1.0 +com.google.j2objc:j2objc-annotations:3.1 +com.google.oauth-client:google-oauth-client:1.39.0 +com.google.protobuf:protobuf-java-util:4.33.2 +com.google.protobuf:protobuf-java:4.33.2 +com.google.re2j:re2j:1.8 +com.jcraft:jsch:0.1.55 +com.microsoft.azure:msal4j-persistence-extension:1.3.0 +com.microsoft.azure:msal4j:1.23.1 +com.sun.xml.bind:jaxb-impl:2.2.3-1 +commons-cli:commons-cli:1.9.0 +commons-codec:commons-codec:1.19.0 +commons-io:commons-io:2.20.0 +commons-logging:commons-logging:1.2 +commons-net:commons-net:3.9.0 +commons-pool:commons-pool:1.6 +dev.failsafe:failsafe:3.3.2 +dev.vortex:vortex-jni:0.67.0 +dnsjava:dnsjava:3.6.1 +io.airlift:aircompressor:2.0.3 +io.dropwizard.metrics:metrics-core:3.2.4 +io.grpc:grpc-alts:1.80.0 +io.grpc:grpc-api:1.80.0 +io.grpc:grpc-auth:1.80.0 +io.grpc:grpc-context:1.80.0 +io.grpc:grpc-core:1.80.0 +io.grpc:grpc-googleapis:1.80.0 +io.grpc:grpc-grpclb:1.80.0 +io.grpc:grpc-inprocess:1.80.0 +io.grpc:grpc-netty-shaded:1.81.0 +io.grpc:grpc-opentelemetry:1.80.0 +io.grpc:grpc-protobuf-lite:1.80.0 +io.grpc:grpc-protobuf:1.80.0 +io.grpc:grpc-rls:1.80.0 +io.grpc:grpc-services:1.80.0 +io.grpc:grpc-stub:1.80.0 +io.grpc:grpc-util:1.80.0 +io.grpc:grpc-xds:1.80.0 
+io.netty:netty-buffer:4.2.7.Final +io.netty:netty-codec-base:4.2.7.Final +io.netty:netty-codec-compression:4.2.7.Final +io.netty:netty-codec-dns:4.2.7.Final +io.netty:netty-codec-http2:4.2.7.Final +io.netty:netty-codec-http:4.2.7.Final +io.netty:netty-codec-marshalling:4.2.7.Final +io.netty:netty-codec-protobuf:4.2.7.Final +io.netty:netty-codec-socks:4.2.7.Final +io.netty:netty-codec:4.2.7.Final +io.netty:netty-common:4.2.7.Final +io.netty:netty-handler-proxy:4.2.7.Final +io.netty:netty-handler:4.2.7.Final +io.netty:netty-resolver-dns-classes-macos:4.2.7.Final +io.netty:netty-resolver-dns-native-macos:4.2.7.Final +io.netty:netty-resolver-dns:4.2.7.Final +io.netty:netty-resolver:4.2.7.Final +io.netty:netty-tcnative-boringssl-static:2.0.74.Final +io.netty:netty-tcnative-classes:2.0.74.Final +io.netty:netty-transport-classes-epoll:4.2.7.Final +io.netty:netty-transport-classes-kqueue:4.2.7.Final +io.netty:netty-transport-native-epoll:4.2.7.Final +io.netty:netty-transport-native-kqueue:4.2.7.Final +io.netty:netty-transport-native-unix-common:4.2.7.Final +io.netty:netty-transport:4.2.7.Final +io.opencensus:opencensus-api:0.31.1 +io.opencensus:opencensus-contrib-http-util:0.31.1 +io.opentelemetry.contrib:opentelemetry-gcp-resources:1.37.0-alpha +io.opentelemetry.semconv:opentelemetry-semconv:1.29.0-alpha +io.opentelemetry:opentelemetry-api:1.51.0 +io.opentelemetry:opentelemetry-context:1.51.0 +io.opentelemetry:opentelemetry-sdk-common:1.51.0 +io.opentelemetry:opentelemetry-sdk-extension-autoconfigure-spi:1.51.0 +io.opentelemetry:opentelemetry-sdk-logs:1.51.0 +io.opentelemetry:opentelemetry-sdk-metrics:1.51.0 +io.opentelemetry:opentelemetry-sdk-trace:1.51.0 +io.opentelemetry:opentelemetry-sdk:1.51.0 +io.perfmark:perfmark-api:0.27.0 +io.projectreactor.netty:reactor-netty-core:1.2.13 +io.projectreactor.netty:reactor-netty-http:1.2.13 +io.projectreactor:reactor-core:3.7.14 +jakarta.activation:jakarta.activation-api:1.2.1 +javax.annotation:javax.annotation-api:1.3.2 
+javax.servlet.jsp:jsp-api:2.1 +javax.servlet:javax.servlet-api:3.1.0 +javax.xml.bind:jaxb-api:2.2.2 +javax.xml.stream:stax-api:1.0-2 +net.java.dev.jna:jna-platform:5.17.0 +net.java.dev.jna:jna:5.17.0 +org.apache.arrow:arrow-c-data:18.3.0 +org.apache.arrow:arrow-format:18.3.0 +org.apache.arrow:arrow-memory-core:18.3.0 +org.apache.arrow:arrow-memory-netty-buffer-patch:18.3.0 +org.apache.arrow:arrow-memory-netty:18.3.0 +org.apache.arrow:arrow-vector:18.3.0 +org.apache.avro:avro:1.12.1 +org.apache.commons:commons-collections4:4.4 +org.apache.commons:commons-compress:1.28.0 +org.apache.commons:commons-lang3:3.18.0 +org.apache.commons:commons-math3:3.6.1 +org.apache.commons:commons-text:1.14.0 +org.apache.hadoop.thirdparty:hadoop-shaded-guava:1.5.0 +org.apache.hadoop.thirdparty:hadoop-shaded-protobuf_3_25:1.5.0 +org.apache.hadoop:hadoop-annotations:3.4.3 +org.apache.hadoop:hadoop-common:3.4.3 +org.apache.httpcomponents.client5:httpclient5:5.6.1 +org.apache.httpcomponents.core5:httpcore5-h2:5.4 +org.apache.httpcomponents.core5:httpcore5:5.4 +org.apache.httpcomponents:httpclient:4.5.14 +org.apache.httpcomponents:httpcore:4.4.16 +org.apache.orc:orc-core:1.9.8 +org.apache.orc:orc-shims:1.9.8 +org.apache.parquet:parquet-avro:1.17.0 +org.apache.parquet:parquet-column:1.17.0 +org.apache.parquet:parquet-common:1.17.0 +org.apache.parquet:parquet-encoding:1.17.0 +org.apache.parquet:parquet-format-structures:1.17.0 +org.apache.parquet:parquet-hadoop:1.17.0 +org.apache.parquet:parquet-jackson:1.17.0 +org.apache.parquet:parquet-variant:1.17.0 +org.bouncycastle:bcprov-jdk18on:1.82 +org.checkerframework:checker-compat-qual:2.5.6 +org.checkerframework:checker-qual:3.49.0 +org.codehaus.jettison:jettison:1.5.5 +org.codehaus.mojo:animal-sniffer-annotations:1.26 +org.codehaus.woodstox:stax2-api:4.2.2 +org.conscrypt:conscrypt-openjdk-uber:2.5.2 +org.json:json:20250517 +org.locationtech.jts:jts-core:1.20.0 +org.mongodb:bson:4.11.5 +org.reactivestreams:reactive-streams:1.0.4 
+org.roaringbitmap:RoaringBitmap:1.6.14 +org.slf4j:slf4j-api:2.0.17 +org.threeten:threeten-extra:1.8.0 +org.threeten:threetenbp:1.7.0 +org.xerial.snappy:snappy-java:1.1.10.8 +software.amazon.awssdk.crt:aws-crt:0.45.1 +software.amazon.awssdk:annotations:2.42.41 +software.amazon.awssdk:apache-client:2.42.41 +software.amazon.awssdk:arns:2.42.41 +software.amazon.awssdk:auth:2.42.41 +software.amazon.awssdk:aws-core:2.42.41 +software.amazon.awssdk:aws-json-protocol:2.42.41 +software.amazon.awssdk:aws-query-protocol:2.42.41 +software.amazon.awssdk:aws-xml-protocol:2.42.41 +software.amazon.awssdk:checksums-spi:2.42.41 +software.amazon.awssdk:checksums:2.42.41 +software.amazon.awssdk:crt-core:2.42.41 +software.amazon.awssdk:dynamodb:2.42.41 +software.amazon.awssdk:endpoints-spi:2.42.41 +software.amazon.awssdk:glue:2.42.41 +software.amazon.awssdk:http-auth-aws-crt:2.42.41 +software.amazon.awssdk:http-auth-aws-eventstream:2.42.41 +software.amazon.awssdk:http-auth-aws:2.42.41 +software.amazon.awssdk:http-auth-spi:2.42.41 +software.amazon.awssdk:http-auth:2.42.41 +software.amazon.awssdk:http-client-spi:2.42.41 +software.amazon.awssdk:iam:2.42.41 +software.amazon.awssdk:identity-spi:2.42.41 +software.amazon.awssdk:json-utils:2.42.41 +software.amazon.awssdk:kms:2.42.41 +software.amazon.awssdk:lakeformation:2.42.41 +software.amazon.awssdk:metrics-spi:2.42.41 +software.amazon.awssdk:netty-nio-client:2.42.41 +software.amazon.awssdk:profiles:2.42.41 +software.amazon.awssdk:protocol-core:2.42.41 +software.amazon.awssdk:regions:2.42.41 +software.amazon.awssdk:retries-spi:2.42.41 +software.amazon.awssdk:retries:2.42.41 +software.amazon.awssdk:s3:2.42.41 +software.amazon.awssdk:sdk-core:2.42.41 +software.amazon.awssdk:sso:2.42.41 +software.amazon.awssdk:sts:2.42.41 +software.amazon.awssdk:third-party-jackson-core:2.42.41 +software.amazon.awssdk:utils-lite:2.42.41 +software.amazon.awssdk:utils:2.42.41 +software.amazon.eventstream:eventstream:1.0.1 diff --git 
a/kafka-connect/kafka-connect-runtime/src/integration/java/org/apache/iceberg/connect/TestIntegrationDynamicTable.java b/kafka-connect/kafka-connect-runtime/src/integration/java/org/apache/iceberg/connect/TestIntegrationDynamicTable.java index 65bbcde9dfed..1d3d71a54152 100644 --- a/kafka-connect/kafka-connect-runtime/src/integration/java/org/apache/iceberg/connect/TestIntegrationDynamicTable.java +++ b/kafka-connect/kafka-connect-runtime/src/integration/java/org/apache/iceberg/connect/TestIntegrationDynamicTable.java @@ -20,11 +20,14 @@ import static org.assertj.core.api.Assertions.assertThat; +import java.time.Duration; import java.time.Instant; import java.util.List; import org.apache.iceberg.DataFile; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.NullSource; import org.junit.jupiter.params.provider.ValueSource; @@ -59,6 +62,56 @@ public void testIcebergSink(String branch) { assertSnapshotProps(TABLE_IDENTIFIER2, branch); } + /** + * Verifies dynamic routing works when topic-rewriting SMTs (e.g. RegexRouter) change + * record.topic(). Before the fix, SinkWriter tracked offsets under the rewritten topic, causing a + * mismatch with context.assignment() and preventing proper offset commits. + */ + @Test + public void testDynamicRouteWithTopicRewritingSMT() { + String smtTable = "smttbl"; + TableIdentifier smtTableId = TableIdentifier.of(TEST_DB, smtTable); + catalog().createTable(smtTableId, TestEvent.TEST_SCHEMA); + + try { + // RegexRouter rewrites topic to "test.smttbl", then InsertField copies + // record.topic() (now "test.smttbl") into field "srcTopic", and dynamic + // routing uses "srcTopic" to pick the destination table. 
+ KafkaConnectUtils.Config connectorConfig = + createCommonConfig(false) + .config("iceberg.tables.dynamic-enabled", true) + .config("iceberg.tables.route-field", "srcTopic") + .config("transforms", "rewriteTopic,insertTopic") + .config( + "transforms.rewriteTopic.type", "org.apache.kafka.connect.transforms.RegexRouter") + .config("transforms.rewriteTopic.regex", ".*") + .config("transforms.rewriteTopic.replacement", TEST_DB + "." + smtTable) + .config( + "transforms.insertTopic.type", + "org.apache.kafka.connect.transforms.InsertField$Value") + .config("transforms.insertTopic.topic.field", "srcTopic"); + + context().connectorCatalogProperties().forEach(connectorConfig::config); + context().startConnector(connectorConfig); + + send(testTopic(), new TestEvent(1, "type1", Instant.now(), "hello"), false); + send(testTopic(), new TestEvent(2, "type2", Instant.now(), "world"), false); + flush(); + + Awaitility.await() + .atMost(Duration.ofSeconds(30)) + .pollInterval(Duration.ofSeconds(1)) + .untilAsserted(() -> assertSnapshotAdded(List.of(smtTableId))); + + List files = dataFiles(smtTableId, null); + assertThat(files).hasSizeBetween(1, 2); + assertThat(files.stream().mapToLong(DataFile::recordCount).sum()).isEqualTo(2); + assertSnapshotProps(smtTableId, null); + } finally { + catalog().dropTable(smtTableId); + } + } + @Override protected KafkaConnectUtils.Config createConfig(boolean useSchema) { return createCommonConfig(useSchema) diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/channel/CommitterImpl.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/channel/CommitterImpl.java index 04602a66a5e1..7b2d4a25363d 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/channel/CommitterImpl.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/channel/CommitterImpl.java @@ -30,7 +30,6 @@ import org.apache.kafka.clients.admin.ConsumerGroupDescription; import 
org.apache.kafka.clients.admin.MemberDescription; import org.apache.kafka.common.TopicPartition; -import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.sink.SinkRecord; import org.apache.kafka.connect.sink.SinkTaskContext; import org.slf4j.Logger; @@ -48,6 +47,7 @@ public class CommitterImpl implements Committer { private KafkaClientFactory clientFactory; private Collection membersWhenWorkerIsCoordinator; private final AtomicBoolean isInitialized = new AtomicBoolean(false); + private String taskId; private void initialize( Catalog icebergCatalog, @@ -58,6 +58,7 @@ private void initialize( this.config = icebergSinkConfig; this.context = sinkTaskContext; this.clientFactory = new KafkaClientFactory(config.kafkaProps()); + this.taskId = config.connectorName() + "-" + config.taskId(); } } @@ -92,16 +93,38 @@ boolean hasLeaderPartition(Collection currentAssignedPartitions) @VisibleForTesting boolean containsFirstPartition( Collection members, Collection partitions) { - // there should only be one task assigned partition 0 of the first topic, - // so elect that one the leader - TopicPartition firstTopicPartition = - members.stream() - .flatMap(member -> member.assignment().topicPartitions().stream()) - .min(new TopicPartitionComparator()) - .orElseThrow( - () -> new ConnectException("No partitions assigned, cannot determine leader")); - - return partitions.contains(firstTopicPartition); + // Determine the first partition across all members to elect the leader + TopicPartition firstTopicPartition = findFirstTopicPartition(members); + + if (firstTopicPartition == null) { + LOG.warn( + "Committer {} found no partitions assigned across all members, cannot determine leader", + taskId); + return false; + } + + boolean containsFirst = partitions.contains(firstTopicPartition); + if (containsFirst) { + LOG.info( + "Committer {} contains the first partition {}, this task is the leader", + taskId, + firstTopicPartition); + } else { + LOG.debug( + 
"Committer {} does not contain the first partition {}, not the leader", + taskId, + firstTopicPartition); + } + + return containsFirst; + } + + @VisibleForTesting + TopicPartition findFirstTopicPartition(Collection members) { + return members.stream() + .flatMap(member -> member.assignment().topicPartitions().stream()) + .min(new TopicPartitionComparator()) + .orElse(null); } @Override @@ -122,7 +145,7 @@ public void open( Collection addedPartitions) { initialize(icebergCatalog, icebergSinkConfig, sinkTaskContext); if (hasLeaderPartition(addedPartitions)) { - LOG.info("Committer received leader partition. Starting Coordinator."); + LOG.info("Committer {} received leader partition. Starting Coordinator.", taskId); startCoordinator(); } } @@ -141,31 +164,25 @@ public void close(Collection closedPartitions) { // Defensive: close called without prior initialization (should not happen). if (!isInitialized.get()) { - LOG.warn("Close unexpectedly called without partition assignment"); + LOG.warn("Close unexpectedly called on committer {} without partition assignment", taskId); return; } // Empty partitions → task was stopped explicitly. Stop coordinator if running. if (closedPartitions.isEmpty()) { - LOG.info("Task stopped. Closing coordinator."); + LOG.info("Committer {} stopped. Closing coordinator.", taskId); stopCoordinator(); return; } // Normal close: if leader partition is lost, stop coordinator. if (hasLeaderPartition(closedPartitions)) { - LOG.info( - "Committer {}-{} lost leader partition. Stopping coordinator.", - config.connectorName(), - config.taskId()); + LOG.info("Committer {} lost leader partition. Stopping coordinator.", taskId); stopCoordinator(); } // Reset offsets to last committed to avoid data loss. 
- LOG.info( - "Seeking to last committed offsets for worker {}-{}.", - config.connectorName(), - config.taskId()); + LOG.info("Seeking to last committed offsets for worker {}.", taskId); KafkaUtils.seekToLastCommittedOffsets(context); } @@ -181,9 +198,7 @@ public void save(Collection sinkRecords) { private void processControlEvents() { if (coordinatorThread != null && coordinatorThread.isTerminated()) { throw new NotRunningException( - String.format( - "Coordinator unexpectedly terminated on committer %s-%s", - config.connectorName(), config.taskId())); + String.format("Coordinator unexpectedly terminated on committer %s", taskId)); } if (worker != null) { worker.process(); @@ -192,7 +207,7 @@ private void processControlEvents() { private void startWorker() { if (null == this.worker) { - LOG.info("Starting commit worker {}-{}", config.connectorName(), config.taskId()); + LOG.info("Starting commit worker {}", taskId); SinkWriter sinkWriter = new SinkWriter(catalog, config); worker = new Worker(config, clientFactory, sinkWriter, context); worker.start(); @@ -201,10 +216,7 @@ private void startWorker() { private void startCoordinator() { if (null == this.coordinatorThread) { - LOG.info( - "Task {}-{} elected leader, starting commit coordinator", - config.connectorName(), - config.taskId()); + LOG.info("Task {} elected leader, starting commit coordinator", taskId); Coordinator coordinator = new Coordinator(catalog, config, membersWhenWorkerIsCoordinator, clientFactory, context); coordinatorThread = new CoordinatorThread(coordinator); diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/channel/Coordinator.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/channel/Coordinator.java index 068e1e1f6e9c..c986f8afc2eb 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/channel/Coordinator.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/channel/Coordinator.java 
@@ -81,6 +81,7 @@ class Coordinator extends Channel { private final ExecutorService exec; private final CommitState commitState; private volatile boolean terminated; + private final String taskId; Coordinator( Catalog catalog, @@ -110,6 +111,7 @@ class Coordinator extends Channel { .setNameFormat("iceberg-committer" + "-%d") .build()); this.commitState = new CommitState(config); + this.taskId = config.connectorName() + "-" + config.taskId(); } void process() { @@ -119,7 +121,7 @@ void process() { Event event = new Event(config.connectGroupId(), new StartCommit(commitState.currentCommitId())); send(event); - LOG.info("Commit {} initiated", commitState.currentCommitId()); + LOG.info("Coordinator {} initiated commit {}", taskId, commitState.currentCommitId()); } consumeAvailable(POLL_DURATION); @@ -149,7 +151,11 @@ private void commit(boolean partialCommit) { try { doCommit(partialCommit); } catch (Exception e) { - LOG.warn("Commit failed, will try again next cycle", e); + LOG.warn( + "Coordinator {} failed to commit for commit {}, will try again next cycle", + taskId, + commitState.currentCommitId(), + e); } finally { commitState.endCurrentCommit(); } @@ -163,10 +169,9 @@ private void doCommit(boolean partialCommit) { .executeWith(exec) .stopOnFailure() .run( - entry -> { - commitToTable( - entry.getKey(), entry.getValue(), controlTopicOffsets(), validThroughTs); - }); + entry -> + commitToTable( + entry.getKey(), entry.getValue(), controlTopicOffsets(), validThroughTs)); // we should only get here if all tables committed successfully... 
commitConsumerOffsets(); @@ -179,7 +184,8 @@ private void doCommit(boolean partialCommit) { send(event); LOG.info( - "Commit {} complete, committed to {} table(s), valid-through {}", + "Coordinator {} completed commit {}, committed to {} table(s), valid-through {}", + taskId, commitState.currentCommitId(), commitMap.size(), validThroughTs); @@ -256,13 +262,14 @@ private void commitToTable( .collect(Collectors.toList()); if (terminated) { - throw new ConnectException("Coordinator is terminated, commit aborted"); + throw new ConnectException( + String.format("Coordinator %s is terminated, commit aborted", taskId)); } if (dataFiles.isEmpty() && deleteFiles.isEmpty()) { - LOG.info("Nothing to commit to table {}, skipping", tableIdentifier); + LOG.info( + "Coordinator {} found nothing to commit to table {}, skipping", taskId, tableIdentifier); } else { - String taskId = String.format("%s-%s", config.connectorName(), config.taskId()); if (deleteFiles.isEmpty()) { AppendFiles appendOp = table.newAppend().validateWith(offsetValidator(tableIdentifier, committedOffsets)); @@ -303,7 +310,8 @@ private void commitToTable( send(event); LOG.info( - "Commit complete to table {}, snapshot {}, commit ID {}, valid-through {}", + "Coordinator {} completed commit to table {}, snapshot {}, commit ID {}, valid-through {}", + taskId, tableIdentifier, snapshotId, commitState.currentCommitId(), @@ -372,7 +380,7 @@ private Map parseOffsets(String value) { return Map.of(); } - TypeReference> typeRef = new TypeReference>() {}; + TypeReference> typeRef = new TypeReference<>() {}; try { return MAPPER.readValue(value, typeRef); } catch (IOException e) { diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java index 1a57a6444870..ab3d5aa9bb43 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java +++ 
b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java @@ -22,23 +22,29 @@ import java.io.IOException; import java.io.UncheckedIOException; import java.math.BigDecimal; +import java.math.BigInteger; import java.math.RoundingMode; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; +import java.time.Instant; import java.time.LocalDate; import java.time.LocalDateTime; import java.time.LocalTime; import java.time.OffsetDateTime; import java.time.ZoneOffset; +import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatterBuilder; import java.time.format.DateTimeParseException; import java.time.temporal.Temporal; import java.util.Base64; +import java.util.Collection; +import java.util.Collections; import java.util.Date; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Set; import java.util.UUID; import java.util.stream.Collectors; import org.apache.iceberg.FileFormat; @@ -53,6 +59,7 @@ import org.apache.iceberg.mapping.NameMappingParser; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Type.PrimitiveType; import org.apache.iceberg.types.Types.DecimalType; @@ -64,6 +71,13 @@ import org.apache.iceberg.util.ByteBuffers; import org.apache.iceberg.util.DateTimeUtil; import org.apache.iceberg.util.UUIDUtil; +import org.apache.iceberg.variants.ShreddedObject; +import org.apache.iceberg.variants.ValueArray; +import org.apache.iceberg.variants.Variant; +import org.apache.iceberg.variants.VariantMetadata; +import org.apache.iceberg.variants.VariantValue; +import org.apache.iceberg.variants.Variants; +import org.apache.kafka.connect.data.Field; import org.apache.kafka.connect.data.Struct; import 
org.apache.kafka.connect.errors.ConnectException; @@ -142,6 +156,8 @@ private Object convertValue( return convertTimeValue(value); case TIMESTAMP: return convertTimestampValue(value, (TimestampType) type); + case VARIANT: + return convertVariantValue(value); } throw new UnsupportedOperationException("Unsupported type: " + type.typeId()); } @@ -464,6 +480,234 @@ protected Temporal convertTimestampValue(Object value, TimestampType type) { return convertLocalDateTime(value); } + protected Variant convertVariantValue(Object value) { + if (value instanceof Variant variant) { + return variant; + } + + List sortedFieldNames = + collectFieldNames(value).stream().sorted().collect(Collectors.toList()); + VariantMetadata metadata = Variants.metadata(sortedFieldNames); + return Variant.of(metadata, objectToVariantValue(value, metadata, null)); + } + + /** + * Recursively collects field names from collections, maps, and structs. Returns an empty set for + * null, scalar values, and empty maps, lists, or structs. Map keys must be strings; non-string + * keys cause IllegalArgumentException. + */ + private static Set collectFieldNames(Object value) { + if (value == null) { + return Collections.emptySet(); + } + if (value instanceof Collection collection) { + if (collection.isEmpty()) { + return Collections.emptySet(); + } + Set names = Sets.newHashSet(); + collection.forEach(element -> names.addAll(collectFieldNames(element))); + return names; + } else if (value instanceof Map map) { + if (map.isEmpty()) { + return Collections.emptySet(); + } + Set names = Sets.newHashSet(); + map.forEach( + (key, val) -> { + if (key instanceof String keyStr) { + names.add(keyStr); + names.addAll(collectFieldNames(val)); + } else { + throw new IllegalArgumentException( + "Cannot convert map to variant: keys must be non-null strings, was: " + + (key == null ? 
"null" : key.getClass().getName())); + } + }); + return names; + } else if (value instanceof Struct struct) { + List fields = struct.schema().fields(); + if (fields.isEmpty()) { + return Collections.emptySet(); + } + Set names = Sets.newHashSet(); + fields.forEach( + field -> { + names.add(field.name()); + names.addAll(collectFieldNames(struct.get(field))); + }); + return names; + } + return Collections.emptySet(); + } + + /** + * Recursively converts a Java object to a VariantValue using the given shared metadata for all + * nested maps. Handles primitives, List (array), and Map (object); map keys become field names. + */ + private static VariantValue objectToVariantValue( + Object value, VariantMetadata metadata, org.apache.kafka.connect.data.Schema schema) { + if (value == null) { + return Variants.ofNull(); + } + VariantValue primitive = primitiveToVariantValue(value, schema); + if (primitive != null) { + return primitive; + } + if (value instanceof Collection collection) { + ValueArray array = Variants.array(); + org.apache.kafka.connect.data.Schema elementSchema = + schema != null ? schema.valueSchema() : null; + for (Object element : collection) { + array.add(objectToVariantValue(element, metadata, elementSchema)); + } + return array; + } + if (value instanceof Map map) { + return mapToVariantValue(map, metadata, schema); + } + if (value instanceof Struct struct) { + ShreddedObject object = Variants.object(metadata); + for (Field field : struct.schema().fields()) { + object.put(field.name(), objectToVariantValue(struct.get(field), metadata, field.schema())); + } + return object; + } + throw new IllegalArgumentException("Cannot convert to variant: " + value.getClass().getName()); + } + + /** Converts a Map to VariantValue; throw IllegalArgumentException if the key is not a string. 
*/ + private static VariantValue mapToVariantValue( + Map map, VariantMetadata metadata, org.apache.kafka.connect.data.Schema schema) { + ShreddedObject object = Variants.object(metadata); + org.apache.kafka.connect.data.Schema mapValueSchema = + schema != null ? schema.valueSchema() : null; + map.forEach( + (key, val) -> { + if (key instanceof String keyStr) { + object.put(keyStr, objectToVariantValue(val, metadata, mapValueSchema)); + } else { + throw new IllegalArgumentException( + "Cannot convert map to variant: keys must be non-null strings, was: " + + (key == null ? "null" : key.getClass().getName())); + } + }); + return object; + } + + /** + * Converts a primitive or primitive-like value to VariantValue; returns null if not supported. + * The optional schema is used to disambiguate java.util.Date which Kafka Connect uses for Date, + * Time, and Timestamp logical types. + */ + private static VariantValue primitiveToVariantValue( + Object value, org.apache.kafka.connect.data.Schema schema) { + if (value instanceof Boolean booleanValue) { + return Variants.of(booleanValue); + } + VariantValue temporal = temporalObjectToVariantValue(value, schema); + if (temporal != null) { + return temporal; + } + if (value instanceof Number number) { + return numberToVariantValue(number); + } + if (value instanceof String stringValue) { + return Variants.of(stringValue); + } + if (value instanceof ByteBuffer byteBuffer) { + return Variants.of(byteBuffer); + } + if (value instanceof byte[] byteArray) { + return Variants.of(ByteBuffer.wrap(byteArray)); + } + if (value instanceof UUID uuid) { + return Variants.ofUUID(uuid); + } + return null; + } + + /** + * Converts java.time values and java.util.Date (with Connect logical type from the optional + * schema) to VariantValue; returns null if the value is not a supported temporal representation. 
+ */ + private static VariantValue temporalObjectToVariantValue( + Object value, org.apache.kafka.connect.data.Schema schema) { + if (value instanceof Instant instant) { + return Variants.ofTimestamptz(DateTimeUtil.microsFromInstant(instant)); + } + if (value instanceof OffsetDateTime offsetDateTime) { + return Variants.ofTimestamptz(DateTimeUtil.microsFromTimestamptz(offsetDateTime)); + } + if (value instanceof ZonedDateTime zonedDateTime) { + return Variants.ofTimestamptz( + DateTimeUtil.microsFromTimestamptz(zonedDateTime.toOffsetDateTime())); + } + if (value instanceof LocalDateTime localDateTime) { + return Variants.ofTimestampntz(DateTimeUtil.microsFromTimestamp(localDateTime)); + } + if (value instanceof LocalDate localDate) { + return Variants.ofDate(DateTimeUtil.daysFromDate(localDate)); + } + if (value instanceof LocalTime localTime) { + return Variants.ofTime(DateTimeUtil.microsFromTime(localTime)); + } + if (value instanceof Date date) { + String logicalName = schema != null ? schema.name() : null; + // Connect represents Timestamp, Time, and Date logical types as java.util.Date at runtime; + // normalize to Instant once, then interpret using the schema logical type name. 
+ Instant connectInstant = date.toInstant(); + if (org.apache.kafka.connect.data.Timestamp.LOGICAL_NAME.equals(logicalName)) { + return Variants.ofTimestamptz(DateTimeUtil.microsFromInstant(connectInstant)); + } + if (org.apache.kafka.connect.data.Time.LOGICAL_NAME.equals(logicalName)) { + LocalTime utcTime = connectInstant.atZone(ZoneOffset.UTC).toLocalTime(); + return Variants.ofTime(DateTimeUtil.microsFromTime(utcTime)); + } + if (org.apache.kafka.connect.data.Date.LOGICAL_NAME.equals(logicalName)) { + return Variants.ofDate(DateTimeUtil.daysFromInstant(connectInstant)); + } + throw new IllegalArgumentException( + "Cannot convert java.util.Date to variant without a recognized logical type schema" + + " (expected Timestamp, Time, or Date but got: " + + logicalName + + ")"); + } + return null; + } + + /** + * Converts a Number to VariantValue; throw IllegalArgumentException if the value is not a + * supported number representation. + */ + private static VariantValue numberToVariantValue(Number number) { + if (number instanceof BigDecimal bigDecimal) { + return Variants.of(bigDecimal); + } + if (number instanceof BigInteger bigInteger) { + return Variants.of(new BigDecimal(bigInteger)); + } + if (number instanceof Integer integer) { + return Variants.of(integer); + } + if (number instanceof Long longValue) { + return Variants.of(longValue); + } + if (number instanceof Float floatValue) { + return Variants.of(floatValue); + } + if (number instanceof Double doubleValue) { + return Variants.of(doubleValue); + } + if (number instanceof Byte byteValue) { + return Variants.of(byteValue); + } + if (number instanceof Short shortValue) { + return Variants.of(shortValue); + } + throw new IllegalArgumentException( + "Cannot convert Number to variant (unknown type): " + number.getClass().getName()); + } + @SuppressWarnings("JavaUtilDate") private OffsetDateTime convertOffsetDateTime(Object value) { if (value instanceof Number) { @@ -524,10 +768,19 @@ private String 
ensureTimestampFormat(String str) { if (result.charAt(10) == ' ') { result = result.substring(0, 10) + 'T' + result.substring(11); } - if (result.length() > 22 - && (result.charAt(19) == '+' || result.charAt(19) == '-') - && result.charAt(22) == ':') { - result = result.substring(0, 19) + result.substring(19).replace(":", ""); + // Search for the timezone offset sign starting after the seconds portion (index 19+). + // With fractional seconds (e.g. "...T03:17:37.260514+00:00") the sign appears later + // than index 19, so we must locate it dynamically rather than assuming a fixed position. + int signIdx = -1; + for (int i = 19; i < result.length(); i++) { + char ch = result.charAt(i); + if (ch == '+' || ch == '-') { + signIdx = i; + break; + } + } + if (signIdx != -1 && signIdx + 3 < result.length() && result.charAt(signIdx + 3) == ':') { + result = result.substring(0, signIdx + 3) + result.substring(signIdx + 4); } return result; } diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SinkWriter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SinkWriter.java index f81155e13777..48a01881935b 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SinkWriter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SinkWriter.java @@ -75,9 +75,12 @@ private void save(SinkRecord record) { record.timestamp() == null ? null : OffsetDateTime.ofInstant(Instant.ofEpochMilli(record.timestamp()), ZoneOffset.UTC); + // use the original topic and partition to track offsets, as SMTs may have changed + // record.topic() and record.kafkaPartition() (e.g. RegexRouter). The framework's + // context.assignment() and consumer offset management use the original values. 
sourceOffsets.put( - new TopicPartition(record.topic(), record.kafkaPartition()), - new Offset(record.kafkaOffset() + 1, timestamp)); + new TopicPartition(record.originalTopic(), record.originalKafkaPartition()), + new Offset(record.originalKafkaOffset() + 1, timestamp)); if (config.dynamicTablesEnabled()) { routeRecordDynamically(record); diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestRecordConverter.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestRecordConverter.java index 45d07f69591b..9b91ba61c167 100644 --- a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestRecordConverter.java +++ b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestRecordConverter.java @@ -32,6 +32,7 @@ import java.time.LocalTime; import java.time.OffsetDateTime; import java.time.ZoneOffset; +import java.time.ZonedDateTime; import java.time.temporal.Temporal; import java.util.Base64; import java.util.Collection; @@ -74,7 +75,12 @@ import org.apache.iceberg.types.Types.TimeType; import org.apache.iceberg.types.Types.TimestampType; import org.apache.iceberg.types.Types.UUIDType; +import org.apache.iceberg.types.Types.VariantType; +import org.apache.iceberg.util.DateTimeUtil; import org.apache.iceberg.util.UUIDUtil; +import org.apache.iceberg.variants.PhysicalType; +import org.apache.iceberg.variants.Variant; +import org.apache.iceberg.variants.VariantValue; import org.apache.kafka.connect.data.Decimal; import org.apache.kafka.connect.data.Schema; import org.apache.kafka.connect.data.SchemaBuilder; @@ -152,6 +158,9 @@ public class TestRecordConverter { NestedField.required( 100, "stma", MapType.ofRequired(101, 102, StringType.get(), ID_SCHEMA.asStruct()))); + private static final org.apache.iceberg.Schema VARIANT_SCHEMA = + new org.apache.iceberg.Schema(NestedField.required(1, "v", VariantType.get())); + private static final Schema CONNECT_SCHEMA = 
SchemaBuilder.struct() .field("i", Schema.INT32_SCHEMA) @@ -569,6 +578,22 @@ public void testTimestampWithZoneConversion() { assertTimestampConvert(expected, additionalInput, TimestampType.withZone()); } + @Test + public void testTimestampWithZoneAndFractionalSecondsConversion() { + // Timestamps with sub-second precision and a colon-separated UTC offset (e.g. +00:00) + // were previously mis-parsed because ensureTimestampFormat only checked for the timezone + // sign at the fixed index 19, which is only valid when there are no fractional seconds. + OffsetDateTime expected = OffsetDateTime.parse("2026-03-31T03:17:37.260514+00:00"); + List inputs = + ImmutableList.of( + "2026-03-31T03:17:37.260514+00:00", + "2026-03-31T03:17:37.260514+0000", + "2026-03-31T03:17:37.260514Z", + "2026-03-31 03:17:37.260514+00:00", + "2026-03-31 03:17:37.260514+0000"); + assertTimestampConvert(expected, inputs, TimestampType.withZone()); + } + @Test public void testTimestampWithoutZoneConversion() { LocalDateTime expected = LocalDateTime.parse("2023-05-18T11:22:33"); @@ -587,6 +612,23 @@ public void testTimestampWithoutZoneConversion() { assertTimestampConvert(expected, additionalInput, TimestampType.withoutZone()); } + @Test + public void testTimestampWithoutZoneAndFractionalSecondsConversion() { + // Fractional seconds with a colon-separated offset: timezone must be stripped and + // the colon in +HH:MM must be normalized before OFFSET_TIMESTAMP_FORMAT can parse it. 
+ LocalDateTime expected = LocalDateTime.parse("2026-03-31T03:17:37.260514"); + List inputs = + ImmutableList.of( + "2026-03-31T03:17:37.260514", + "2026-03-31 03:17:37.260514", + "2026-03-31T03:17:37.260514+00:00", + "2026-03-31 03:17:37.260514+00:00", + "2026-03-31T03:17:37.260514+0000", + "2026-03-31 03:17:37.260514+0000", + "2026-03-31T03:17:37.260514Z"); + assertTimestampConvert(expected, inputs, TimestampType.withoutZone()); + } + private void assertTimestampConvert(Temporal expected, long expectedMillis, TimestampType type) { List inputList = Lists.newArrayList( @@ -881,6 +923,364 @@ public void testEvolveTypeDetectionStructNested() { assertThat(updateMap.get("st.ff").type()).isInstanceOf(DoubleType.class); } + private RecordConverter variantConverter() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(VARIANT_SCHEMA); + return new RecordConverter(table, config); + } + + @Test + public void testConvertVariantValueFromNull() { + Variant variant = variantConverter().convertVariantValue(null); + assertThat(variant).isNotNull(); + assertThat(variant.value().type()).isEqualTo(PhysicalType.NULL); + } + + @Test + public void testConvertVariantValuePassThrough() { + Variant original = variantConverter().convertVariantValue("hello"); + assertThat(variantConverter().convertVariantValue(original)).isSameAs(original); + } + + @Test + public void testConvertVariantValueFromPrimitiveString() { + Variant variant = variantConverter().convertVariantValue("hello"); + assertThat(variant).isNotNull(); + assertThat(variant.metadata()).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(0); + assertThat(variant.value().type()).isEqualTo(PhysicalType.STRING); + assertThat(variant.value().asPrimitive().get()).isEqualTo("hello"); + } + + @Test + public void testConvertVariantValueFromPrimitiveNumber() { + Variant variant = variantConverter().convertVariantValue(123); + assertThat(variant).isNotNull(); + 
assertThat(variant.metadata().dictionarySize()).isEqualTo(0); + assertThat(variant.value().type()).isEqualTo(PhysicalType.INT32); + assertThat(variant.value().asPrimitive().get()).isEqualTo(123); + } + + @Test + public void testConvertVariantValueFromBoolean() { + Variant variant = variantConverter().convertVariantValue(true); + assertThat(variant).isNotNull(); + assertThat(variant.value().type()).isEqualTo(PhysicalType.BOOLEAN_TRUE); + assertThat(variant.value().asPrimitive().get()).isEqualTo(true); + } + + @Test + public void testConvertVariantValueFromInstant() { + Instant instant = Instant.parse("2025-04-04T12:34:56.789Z"); + Variant variant = variantConverter().convertVariantValue(instant); + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(0); + assertThat(variant.value().type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromInstant(instant)); + } + + @Test + public void testConvertVariantValueFromOffsetDateTime() { + OffsetDateTime odt = OffsetDateTime.parse("2025-04-04T12:34:56.789+09:00"); + Variant variant = variantConverter().convertVariantValue(odt); + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(0); + assertThat(variant.value().type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamptz(odt)); + } + + @Test + public void testConvertVariantValueFromZonedDateTime() { + ZonedDateTime zdt = ZonedDateTime.parse("2025-04-04T12:34:56.789-05:00[America/New_York]"); + Variant variant = variantConverter().convertVariantValue(zdt); + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(0); + assertThat(variant.value().type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asPrimitive().get()) + 
.isEqualTo(DateTimeUtil.microsFromTimestamptz(zdt.toOffsetDateTime())); + } + + @Test + public void testConvertVariantValueFromLocalDateTime() { + LocalDateTime ldt = LocalDateTime.parse("2025-04-04T12:34:56.789"); + Variant variant = variantConverter().convertVariantValue(ldt); + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(0); + assertThat(variant.value().type()).isEqualTo(PhysicalType.TIMESTAMPNTZ); + assertThat(variant.value().asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamp(ldt)); + } + + @Test + public void testConvertVariantValueFromLocalDate() { + LocalDate date = LocalDate.of(2025, 4, 4); + Variant variant = variantConverter().convertVariantValue(date); + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(0); + assertThat(variant.value().type()).isEqualTo(PhysicalType.DATE); + assertThat(variant.value().asPrimitive().get()).isEqualTo(DateTimeUtil.daysFromDate(date)); + } + + @Test + public void testConvertVariantValueFromLocalTime() { + LocalTime time = LocalTime.of(12, 34, 56, 789_000_000); + Variant variant = variantConverter().convertVariantValue(time); + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(0); + assertThat(variant.value().type()).isEqualTo(PhysicalType.TIME); + assertThat(variant.value().asPrimitive().get()).isEqualTo(DateTimeUtil.microsFromTime(time)); + } + + @Test + public void testConvertVariantValueFromList() { + // array with heterogeneous element types (string, int, boolean, double, null, nested array/map, + // java.time primitives). Note: java.util.Date is not supported without Connect logical schema. 
+ Instant instant = Instant.parse("2025-04-04T12:34:56.789Z"); + OffsetDateTime offsetTs = OffsetDateTime.parse("2025-04-04T12:34:56.789+09:00"); + ZonedDateTime zonedTs = ZonedDateTime.parse("2025-04-04T12:34:56.789-05:00[America/New_York]"); + LocalDateTime localTs = LocalDateTime.parse("2025-04-04T12:34:56.789"); + LocalDate localDate = LocalDate.of(2025, 4, 4); + LocalTime localTime = LocalTime.of(12, 34, 56, 789_000_000); + + List input = + Lists.newArrayList( + "a", + 1, + true, + 2.5, + null, + ImmutableList.of("a", "b"), + ImmutableMap.of("key1", "value1", "key2", "value2"), + instant, + offsetTs, + zonedTs, + localTs, + localDate, + localTime); + Variant variant = variantConverter().convertVariantValue(input); + + assertThat(variant).isNotNull(); + assertThat(variant.value().type()).isEqualTo(PhysicalType.ARRAY); + assertThat(variant.value().asArray().numElements()).isEqualTo(13); + + assertThat(variant.value().asArray().get(0).type()).isEqualTo(PhysicalType.STRING); + assertThat(variant.value().asArray().get(0).asPrimitive().get()).isEqualTo("a"); + + assertThat(variant.value().asArray().get(1).type()).isEqualTo(PhysicalType.INT32); + assertThat(variant.value().asArray().get(1).asPrimitive().get()).isEqualTo(1); + + assertThat(variant.value().asArray().get(2).type()).isEqualTo(PhysicalType.BOOLEAN_TRUE); + assertThat(variant.value().asArray().get(2).asPrimitive().get()).isEqualTo(true); + + assertThat(variant.value().asArray().get(3).type()).isEqualTo(PhysicalType.DOUBLE); + assertThat(variant.value().asArray().get(3).asPrimitive().get()).isEqualTo(2.5); + + assertThat(variant.value().asArray().get(4).type()).isEqualTo(PhysicalType.NULL); + + assertThat(variant.value().asArray().get(5).type()).isEqualTo(PhysicalType.ARRAY); + assertThat(variant.value().asArray().get(5).asArray().numElements()).isEqualTo(2); + assertThat(variant.value().asArray().get(5).asArray().get(0).asPrimitive().get()) + .isEqualTo("a"); + 
assertThat(variant.value().asArray().get(5).asArray().get(1).asPrimitive().get()) + .isEqualTo("b"); + + assertThat(variant.value().asArray().get(6).type()).isEqualTo(PhysicalType.OBJECT); + assertThat(variant.value().asArray().get(6).asObject().numFields()).isEqualTo(2); + assertThat(variant.value().asArray().get(6).asObject().get("key1").asPrimitive().get()) + .isEqualTo("value1"); + assertThat(variant.value().asArray().get(6).asObject().get("key2").asPrimitive().get()) + .isEqualTo("value2"); + + assertThat(variant.value().asArray().get(7).type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asArray().get(7).asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromInstant(instant)); + + assertThat(variant.value().asArray().get(8).type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asArray().get(8).asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamptz(offsetTs)); + + assertThat(variant.value().asArray().get(9).type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asArray().get(9).asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamptz(zonedTs.toOffsetDateTime())); + + assertThat(variant.value().asArray().get(10).type()).isEqualTo(PhysicalType.TIMESTAMPNTZ); + assertThat(variant.value().asArray().get(10).asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamp(localTs)); + + assertThat(variant.value().asArray().get(11).type()).isEqualTo(PhysicalType.DATE); + assertThat(variant.value().asArray().get(11).asPrimitive().get()) + .isEqualTo(DateTimeUtil.daysFromDate(localDate)); + + assertThat(variant.value().asArray().get(12).type()).isEqualTo(PhysicalType.TIME); + assertThat(variant.value().asArray().get(12).asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTime(localTime)); + } + + @Test + public void testConvertVariantValueFromMap() { + // heterogeneous top-level values, nested map, java.time primitives; + // metadata shares one sorted dictionary for the 
whole tree + Instant instant = Instant.parse("2025-04-04T12:34:56.789Z"); + OffsetDateTime offsetTs = OffsetDateTime.parse("2025-04-04T12:34:56.789+09:00"); + ZonedDateTime zonedTs = ZonedDateTime.parse("2025-04-04T12:34:56.789-05:00[America/New_York]"); + LocalDateTime localTs = LocalDateTime.parse("2025-04-04T12:34:56.789"); + LocalDate localDate = LocalDate.of(2025, 4, 4); + LocalTime localTime = LocalTime.of(12, 34, 56, 789_000_000); + + Map input = Maps.newLinkedHashMap(); + input.put("s", "text"); + input.put("i", 1); + input.put("bool", true); + input.put("d", 2.5); + input.put("n", null); + input.put("hello", ImmutableMap.of("world", 1)); + input.put("tags", ImmutableList.of("a", "b")); + input.put("instant", instant); + input.put("odt", offsetTs); + input.put("zdt", zonedTs); + input.put("ldt", localTs); + input.put("ldate", localDate); + input.put("ltime", localTime); + + Variant variant = variantConverter().convertVariantValue(input); + + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(14); + assertThat(variant.metadata().get(0)).isEqualTo("bool"); + assertThat(variant.metadata().get(1)).isEqualTo("d"); + assertThat(variant.metadata().get(2)).isEqualTo("hello"); + assertThat(variant.metadata().get(3)).isEqualTo("i"); + assertThat(variant.metadata().get(4)).isEqualTo("instant"); + assertThat(variant.metadata().get(5)).isEqualTo("ldate"); + assertThat(variant.metadata().get(6)).isEqualTo("ldt"); + assertThat(variant.metadata().get(7)).isEqualTo("ltime"); + assertThat(variant.metadata().get(8)).isEqualTo("n"); + assertThat(variant.metadata().get(9)).isEqualTo("odt"); + assertThat(variant.metadata().get(10)).isEqualTo("s"); + assertThat(variant.metadata().get(11)).isEqualTo("tags"); + assertThat(variant.metadata().get(12)).isEqualTo("world"); + assertThat(variant.metadata().get(13)).isEqualTo("zdt"); + + assertThat(variant.value().type()).isEqualTo(PhysicalType.OBJECT); + 
assertThat(variant.value().asObject().numFields()).isEqualTo(13); + + assertThat(variant.value().asObject().get("bool").type()).isEqualTo(PhysicalType.BOOLEAN_TRUE); + assertThat(variant.value().asObject().get("bool").asPrimitive().get()).isEqualTo(true); + + assertThat(variant.value().asObject().get("d").type()).isEqualTo(PhysicalType.DOUBLE); + assertThat(variant.value().asObject().get("d").asPrimitive().get()).isEqualTo(2.5); + + assertThat(variant.value().asObject().get("i").type()).isEqualTo(PhysicalType.INT32); + assertThat(variant.value().asObject().get("i").asPrimitive().get()).isEqualTo(1); + + assertThat(variant.value().asObject().get("n").type()).isEqualTo(PhysicalType.NULL); + + assertThat(variant.value().asObject().get("s").type()).isEqualTo(PhysicalType.STRING); + assertThat(variant.value().asObject().get("s").asPrimitive().get()).isEqualTo("text"); + + VariantValue tags = variant.value().asObject().get("tags"); + assertThat(tags.type()).isEqualTo(PhysicalType.ARRAY); + assertThat(tags.asArray().numElements()).isEqualTo(2); + assertThat(tags.asArray().get(0).asPrimitive().get()).isEqualTo("a"); + assertThat(tags.asArray().get(1).asPrimitive().get()).isEqualTo("b"); + + assertThat(variant.value().asObject().get("instant").type()) + .isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asObject().get("instant").asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromInstant(instant)); + + assertThat(variant.value().asObject().get("odt").type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asObject().get("odt").asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamptz(offsetTs)); + + assertThat(variant.value().asObject().get("zdt").type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asObject().get("zdt").asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamptz(zonedTs.toOffsetDateTime())); + + 
assertThat(variant.value().asObject().get("ldt").type()).isEqualTo(PhysicalType.TIMESTAMPNTZ); + assertThat(variant.value().asObject().get("ldt").asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamp(localTs)); + + assertThat(variant.value().asObject().get("ldate").type()).isEqualTo(PhysicalType.DATE); + assertThat(variant.value().asObject().get("ldate").asPrimitive().get()) + .isEqualTo(DateTimeUtil.daysFromDate(localDate)); + + assertThat(variant.value().asObject().get("ltime").type()).isEqualTo(PhysicalType.TIME); + assertThat(variant.value().asObject().get("ltime").asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTime(localTime)); + + VariantValue nested = variant.value().asObject().get("hello"); + assertThat(nested.type()).isEqualTo(PhysicalType.OBJECT); + assertThat(nested.asObject().get("world").asPrimitive().get()).isEqualTo(1); + } + + @Test + public void testConvertVariantValueFromStruct() { + // Nested Connect struct: primitives, array, and Timestamp / Time / Date (java.util.Date + + // logical types) + // 2025-04-04 12:34:56.789 UTC (aligned with java.time variant tests) + long tsMillis = 1743770096789L; + long timeMillis = 45296789L; + long dateMillis = 20182L * 86_400_000; + + Schema innerSchema = + SchemaBuilder.struct() + .field("i", Schema.INT32_SCHEMA) + .field("str", Schema.STRING_SCHEMA) + .field("tags", SchemaBuilder.array(Schema.STRING_SCHEMA).build()) + .field("ts", Timestamp.SCHEMA) + .field("t", Time.SCHEMA) + .field("d", org.apache.kafka.connect.data.Date.SCHEMA) + .build(); + Schema outerSchema = + SchemaBuilder.struct().field("inner", innerSchema).field("id", Schema.INT64_SCHEMA).build(); + Struct inner = + new Struct(innerSchema) + .put("i", 1) + .put("str", "world") + .put("tags", ImmutableList.of("a", "b")) + .put("ts", new Date(tsMillis)) + .put("t", new Date(timeMillis)) + .put("d", new Date(dateMillis)); + Struct outer = new Struct(outerSchema).put("inner", inner).put("id", 100L); + + Variant variant = 
variantConverter().convertVariantValue(outer); + + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(8); + assertThat(variant.metadata().get(0)).isEqualTo("d"); + assertThat(variant.metadata().get(1)).isEqualTo("i"); + assertThat(variant.metadata().get(2)).isEqualTo("id"); + assertThat(variant.metadata().get(3)).isEqualTo("inner"); + assertThat(variant.metadata().get(4)).isEqualTo("str"); + assertThat(variant.metadata().get(5)).isEqualTo("t"); + assertThat(variant.metadata().get(6)).isEqualTo("tags"); + assertThat(variant.metadata().get(7)).isEqualTo("ts"); + + assertThat(variant.value().type()).isEqualTo(PhysicalType.OBJECT); + assertThat(variant.value().asObject().get("id").asPrimitive().get()).isEqualTo(100L); + + VariantValue innerVal = variant.value().asObject().get("inner"); + assertThat(innerVal.type()).isEqualTo(PhysicalType.OBJECT); + assertThat(innerVal.asObject().get("i").asPrimitive().get()).isEqualTo(1); + assertThat(innerVal.asObject().get("str").asPrimitive().get()).isEqualTo("world"); + assertThat(innerVal.asObject().get("tags").type()).isEqualTo(PhysicalType.ARRAY); + assertThat(innerVal.asObject().get("tags").asArray().numElements()).isEqualTo(2); + assertThat(innerVal.asObject().get("tags").asArray().get(0).asPrimitive().get()).isEqualTo("a"); + assertThat(innerVal.asObject().get("tags").asArray().get(1).asPrimitive().get()).isEqualTo("b"); + + assertThat(innerVal.asObject().get("ts").type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(innerVal.asObject().get("ts").asPrimitive().get()).isEqualTo(tsMillis * 1000); + + assertThat(innerVal.asObject().get("t").type()).isEqualTo(PhysicalType.TIME); + assertThat(innerVal.asObject().get("t").asPrimitive().get()).isEqualTo(timeMillis * 1000); + + assertThat(innerVal.asObject().get("d").type()).isEqualTo(PhysicalType.DATE); + assertThat(innerVal.asObject().get("d").asPrimitive().get()).isEqualTo(20182); + } + public static Map createMapData() { return 
ImmutableMap.builder() .put("i", 1) diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestSinkWriter.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestSinkWriter.java index 6baf72117d04..09f7a373d5f2 100644 --- a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestSinkWriter.java +++ b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestSinkWriter.java @@ -153,6 +153,74 @@ public void testDynamicRoute() { assertThat(writerResult.tableReference().identifier()).isEqualTo(TABLE_IDENTIFIER); } + @Test + public void testOffsetTrackedByOriginalTopicPartition() { + IcebergSinkConfig config = mock(IcebergSinkConfig.class); + when(config.tableConfig(any())).thenReturn(mock(TableSinkConfig.class)); + when(config.tables()).thenReturn(ImmutableList.of(TABLE_IDENTIFIER.toString())); + when(config.dynamicTablesEnabled()).thenReturn(true); + when(config.tablesRouteField()).thenReturn(ROUTE_FIELD); + + IcebergWriterResult writeResult = + new IcebergWriterResult( + TableIdentifier.parse(TABLE_NAME), + ImmutableList.of(mock(DataFile.class)), + ImmutableList.of(), + Types.StructType.of()); + IcebergWriter writer = mock(IcebergWriter.class); + when(writer.complete()).thenReturn(ImmutableList.of(writeResult)); + + IcebergWriterFactory writerFactory = mock(IcebergWriterFactory.class); + when(writerFactory.createWriter(any(), any(), anyBoolean())).thenReturn(writer); + + SinkWriter sinkWriter = new SinkWriter(catalog, config); + + // simulate a record that has been transformed by RegexRouter (topic changed) + String originalTopic = "orders"; + int originalPartition = 0; + long originalOffset = 42L; + Instant now = Instant.now().truncatedTo(ChronoUnit.MILLIS); + + SinkRecord original = + new SinkRecord( + originalTopic, + originalPartition, + null, + "key", + null, + ImmutableMap.of(ROUTE_FIELD, TABLE_IDENTIFIER.toString()), + originalOffset, + now.toEpochMilli(), + 
TimestampType.LOG_APPEND_TIME); + + // RegexRouter changes the topic via newRecord + String transformedTopic = "tmp.dynamic_orders"; + SinkRecord transformed = + original.newRecord( + transformedTopic, + originalPartition, + original.keySchema(), + original.key(), + original.valueSchema(), + original.value(), + original.timestamp()); + + sinkWriter.save(ImmutableList.of(transformed)); + SinkWriterResult result = sinkWriter.completeWrite(); + + // offsets must be keyed by the ORIGINAL topic, not the transformed one + Offset offset = + result.sourceOffsets().get(new TopicPartition(originalTopic, originalPartition)); + assertThat(offset).isNotNull(); + assertThat(offset.offset()).isEqualTo(originalOffset + 1); + assertThat(offset.timestamp()).isEqualTo(now.atOffset(ZoneOffset.UTC)); + + // the transformed topic key should NOT be present + Offset wrongOffset = + result.sourceOffsets().get(new TopicPartition(transformedTopic, originalPartition)); + assertThat(wrongOffset).isNull(); + } + @Test public void testDynamicNoRoute() { IcebergSinkConfig config = mock(IcebergSinkConfig.class); diff --git a/open-api/Makefile b/open-api/Makefile index 3c2c07936e41..797a2abd9293 100644 --- a/open-api/Makefile +++ b/open-api/Makefile @@ -21,10 +21,12 @@ install: validate-spec: uv run openapi-spec-validator --errors all rest-catalog-open-api.yaml + # TODO remove when s3-signer-open-api.yaml is removed uv run openapi-spec-validator --errors all ../aws/src/main/resources/s3-signer-open-api.yaml lint-spec: uv run yamllint --strict rest-catalog-open-api.yaml + # TODO remove when s3-signer-open-api.yaml is removed uv run yamllint --strict ../aws/src/main/resources/s3-signer-open-api.yaml lint: validate-spec lint-spec diff --git a/open-api/requirements.txt b/open-api/requirements.txt index 4076246c83cf..4e75e426b537 100644 --- a/open-api/requirements.txt +++ b/open-api/requirements.txt @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. 
-openapi-spec-validator==0.8.4 -datamodel-code-generator==0.55.0 +openapi-spec-validator==0.8.5 +datamodel-code-generator==0.56.1 yamllint==1.38.0 diff --git a/open-api/rest-catalog-open-api.py b/open-api/rest-catalog-open-api.py index 32cf975cf5b6..f8b3f5bd3771 100644 --- a/open-api/rest-catalog-open-api.py +++ b/open-api/rest-catalog-open-api.py @@ -1045,6 +1045,43 @@ class PlanTask(RootModel[str]): ) +class MultiValuedMap(RootModel[dict[str, list[str]]]): + """ + A map of string keys where each key can map to multiple string values. + """ + + root: dict[str, list[str]] + + +class RemoteSignRequest(BaseModel): + """ + The request to be signed remotely. + """ + + region: str + uri: str + method: Literal['PUT', 'GET', 'HEAD', 'POST', 'DELETE', 'PATCH', 'OPTIONS'] + headers: MultiValuedMap + properties: dict[str, str] | None = None + body: str | None = Field( + None, + description='Optional body of the request to send to the signing API. This should only be populated for requests where the body of the message contains content which must be validated before a request is signed, such as the S3 DeleteObjects call.', + ) + provider: str | None = Field( + None, + description='The storage provider for which the request is to be signed. The provider should correspond to the scheme used for a storage native URI. For example `s3` for AWS S3 paths. For backwards compatibility, if this is not specified, the provider is assumed to be `s3`.', + ) + + +class RemoteSignResult(BaseModel): + """ + The result of a remote request signing operation. 
+ """ + + uri: str + headers: MultiValuedMap + + class CreateNamespaceRequest(BaseModel): namespace: Namespace properties: dict[str, str] | None = Field( @@ -1435,7 +1472,7 @@ class LoadTableResult(BaseModel): - `s3.access-key-id`: id for credentials that provide access to the data in S3 - `s3.secret-access-key`: secret for credentials that provide access to data in S3 - `s3.session-token`: if present, this value should be used for as the session token - - `s3.remote-signing-enabled`: if `true` remote signing should be performed as described in the `s3-signer-open-api.yaml` specification + - `s3.remote-signing-enabled`: if `true` remote signing should be performed as described in the `RemoteSignRequest` schema section of this spec document. - `s3.cross-region-access-enabled`: if `true`, S3 Cross-Region bucket access is enabled ## Storage Credentials @@ -1443,6 +1480,12 @@ class LoadTableResult(BaseModel): Credentials for ADLS / GCS / S3 / ... are provided through the `storage-credentials` field. Clients must first check whether the respective credentials exist in the `storage-credentials` field before checking the `config` for credentials. + ## Remote Signing + + If remote signing for a specific storage provider is enabled, clients must respect the following configurations when creating a remote signer client: + - `signer.endpoint`: the remote signer endpoint. Required. Can either be a relative path (to be resolved against `signer.uri`) or an absolute URI. + - `signer.uri`: the base URI to resolve `signer.endpoint` against. Optional. Only meaningful if `signer.endpoint` is a relative path. Defaults to the catalog's base URI if not set. 
+ """ metadata_location: str | None = Field( diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index ee0097042534..06d13ec133d9 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -162,6 +162,15 @@ paths: $ref: '#/components/responses/UnauthorizedResponse' 403: $ref: '#/components/responses/ForbiddenResponse' + 404: + description: Not Found - Warehouse provided in the `warehouse` query parameter is not found. + content: + application/json: + schema: + $ref: '#/components/schemas/IcebergErrorResponse' + examples: + NoSuchWarehouseExample: + $ref: '#/components/examples/NoSuchWarehouseError' 419: $ref: '#/components/responses/AuthenticationTimeoutResponse' 503: @@ -598,7 +607,7 @@ paths: NamespaceNotFound: $ref: '#/components/examples/NoSuchNamespaceError' 409: - description: Conflict - The table already exists + description: Conflict - The identifier already exists as a table or view content: application/json: schema: @@ -918,7 +927,7 @@ paths: NamespaceNotFound: $ref: '#/components/examples/NoSuchNamespaceError' 409: - description: Conflict - The table already exists + description: Conflict - The identifier already exists as a table or view content: application/json: schema: @@ -974,9 +983,9 @@ paths: - in: query name: snapshots description: - The snapshots to return in the body of the metadata. Setting the value to `all` would - return the full set of snapshots currently valid for the table. Setting the value to - `refs` would load all snapshots referenced by branches or tags. + The snapshots to return in the body of the metadata via the `snapshots` field. Setting + the value to `all` would return the full set of snapshots currently valid for the table. + Setting the value to `refs` would load all snapshots referenced by branches or tags. Default if no param is provided is `all`. 
required: false @@ -1254,6 +1263,40 @@ paths: 5XX: $ref: '#/components/responses/ServerErrorResponse' + /v1/{prefix}/namespaces/{namespace}/tables/{table}/sign: + parameters: + - $ref: '#/components/parameters/prefix' + - $ref: '#/components/parameters/namespace' + - $ref: '#/components/parameters/table' + + post: + tags: + - Catalog API + summary: Remotely signs requests to object storage + operationId: signRequest + requestBody: + description: The request to be signed + content: + application/json: + schema: + $ref: '#/components/schemas/RemoteSignRequest' + required: true + responses: + 200: + $ref: '#/components/responses/RemoteSignResponse' + 400: + $ref: '#/components/responses/BadRequestErrorResponse' + 401: + $ref: '#/components/responses/UnauthorizedResponse' + 403: + $ref: '#/components/responses/ForbiddenResponse' + 419: + $ref: '#/components/responses/AuthenticationTimeoutResponse' + 503: + $ref: '#/components/responses/ServiceUnavailableResponse' + 5XX: + $ref: '#/components/responses/ServerErrorResponse' + /v1/{prefix}/tables/rename: parameters: - $ref: '#/components/parameters/prefix' @@ -1550,7 +1593,7 @@ paths: NamespaceNotFound: $ref: '#/components/examples/NoSuchNamespaceError' 409: - description: Conflict - The view already exists + description: Conflict - The identifier already exists as a table or view content: application/json: schema: @@ -1963,12 +2006,8 @@ components: to supply access via any or none of the requested mechanisms. - Specific properties and handling for `vended-credentials` is documented - in the `LoadTableResult` schema section of this spec document. - - - The protocol and specification for `remote-signing` is documented in - the `s3-signer-open-api.yaml` OpenApi spec in the `aws` module. + Specific properties and handling for `vended-credentials` and `remote-signing` + are documented in the `LoadTableResult` schema section of this spec document. 
required: false schema: @@ -3479,13 +3518,19 @@ components: - `s3.access-key-id`: id for credentials that provide access to the data in S3 - `s3.secret-access-key`: secret for credentials that provide access to data in S3 - `s3.session-token`: if present, this value should be used for as the session token - - `s3.remote-signing-enabled`: if `true` remote signing should be performed as described in the `s3-signer-open-api.yaml` specification + - `s3.remote-signing-enabled`: if `true` remote signing should be performed as described in the `RemoteSignRequest` schema section of this spec document. - `s3.cross-region-access-enabled`: if `true`, S3 Cross-Region bucket access is enabled ## Storage Credentials Credentials for ADLS / GCS / S3 / ... are provided through the `storage-credentials` field. Clients must first check whether the respective credentials exist in the `storage-credentials` field before checking the `config` for credentials. + + ## Remote Signing + + If remote signing for a specific storage provider is enabled, clients must respect the following configurations when creating a remote signer client: + - `signer.endpoint`: the remote signer endpoint. Required. Can either be a relative path (to be resolved against `signer.uri`) or an absolute URI. + - `signer.uri`: the base URI to resolve `signer.endpoint` against. Optional. Only meaningful if `signer.endpoint` is a relative path. Defaults to the catalog's base URI if not set. type: object required: - metadata @@ -4696,6 +4741,59 @@ components: allOf: - $ref: '#/components/schemas/Expression' + MultiValuedMap: + description: A map of string keys where each key can map to multiple string values. + type: object + additionalProperties: + type: array + items: + type: string + + RemoteSignRequest: + description: The request to be signed remotely. 
+ type: object + required: + - region + - uri + - method + - headers + properties: + region: + type: string + uri: + type: string + method: + type: string + enum: ["PUT", "GET", "HEAD", "POST", "DELETE", "PATCH", "OPTIONS"] + headers: + $ref: '#/components/schemas/MultiValuedMap' + properties: + type: object + additionalProperties: + type: string + body: + type: string + description: Optional body of the request to send to the signing API. This should only be populated + for requests where the body of the message contains content which must be validated before a request is + signed, such as the S3 DeleteObjects call. + provider: + type: string + description: The storage provider for which the request is to be signed. The provider should correspond to + the scheme used for a storage native URI. For example `s3` for AWS S3 paths. For backwards compatibility, + if this is not specified, the provider is assumed to be `s3`. + + RemoteSignResult: + description: The result of a remote request signing operation. + type: object + required: + - uri + - headers + properties: + uri: + type: string + headers: + $ref: '#/components/schemas/MultiValuedMap' + ############################# # Reusable Response Objects # ############################# @@ -4977,6 +5075,15 @@ components: schema: $ref: '#/components/schemas/LoadCredentialsResponse' + RemoteSignResponse: + description: The response containing signed & unsigned headers. The server will also send + a Cache-Control header, indicating whether the response can be cached (Cache-Control = ["private"]) + or not (Cache-Control = ["no-cache"]). 
+ content: + application/json: + schema: + $ref: '#/components/schemas/RemoteSignResult' + ####################################### # Common examples of different values # ####################################### @@ -5070,6 +5177,16 @@ components: } } + NoSuchWarehouseError: + summary: The requested warehouse does not exist + value: { + "error": { + "message": "The given warehouse does not exist", + "type": "NoSuchWarehouseException", + "code": 404 + } + } + NoSuchNamespaceError: summary: The requested namespace does not exist value: { @@ -5108,7 +5225,7 @@ components: summary: The requested table identifier already exists value: { "error": { - "message": "The given table already exists", + "message": "The requested table identifier already exists", "type": "AlreadyExistsException", "code": 409 } @@ -5118,7 +5235,7 @@ components: summary: The requested view identifier already exists value: { "error": { - "message": "The given view already exists", + "message": "The requested view identifier already exists", "type": "AlreadyExistsException", "code": 409 } diff --git a/open-api/src/test/java/org/apache/iceberg/rest/RESTCompatibilityKitCatalogTests.java b/open-api/src/test/java/org/apache/iceberg/rest/RESTCompatibilityKitCatalogTests.java index 87ec90663db2..cbf752e484bd 100644 --- a/open-api/src/test/java/org/apache/iceberg/rest/RESTCompatibilityKitCatalogTests.java +++ b/open-api/src/test/java/org/apache/iceberg/rest/RESTCompatibilityKitCatalogTests.java @@ -26,6 +26,8 @@ import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -97,4 +99,19 @@ protected boolean supportsNamesWithDot() { return PropertyUtil.propertyAsBoolean( restCatalog.properties(), RESTCompatibilityKitSuite.RCK_SUPPORTS_NAMES_WITH_DOT, false); } + + 
@Override + protected boolean supportsNamesWithSlashes() { + // names with slashes are rejected and considered as suspicious characters after upgrading Jetty + // and the Servlet API. See also + // https://jakarta.ee/specifications/servlet/6.0/jakarta-servlet-spec-6.0.html#uri-path-canonicalization + // for additional details + return false; + } + + @Disabled("RESTServerExtension isn’t configurable per test") + @Test + public void createTableInUniqueLocation() { + super.createTableInUniqueLocation(); + } } diff --git a/open-api/src/testFixtures/java/org/apache/iceberg/rest/RCKUtils.java b/open-api/src/testFixtures/java/org/apache/iceberg/rest/RCKUtils.java index 4bd060d788a7..bfdcfc8a4bd9 100644 --- a/open-api/src/testFixtures/java/org/apache/iceberg/rest/RCKUtils.java +++ b/open-api/src/testFixtures/java/org/apache/iceberg/rest/RCKUtils.java @@ -37,7 +37,13 @@ class RCKUtils { static final String RCK_LOCAL = "rck.local"; static final String RCK_PURGE_TEST_NAMESPACES = "rck.purge-test-namespaces"; - static final List TEST_NAMESPACES = List.of(Namespace.of("ns"), Namespace.of("newdb")); + static final List TEST_NAMESPACES = + List.of( + Namespace.of("ns"), + Namespace.of("newdb"), + Namespace.of("ns1"), + Namespace.of("ns2"), + Namespace.of("other_ns")); private RCKUtils() {} diff --git a/open-api/src/testFixtures/java/org/apache/iceberg/rest/RESTCatalogServer.java b/open-api/src/testFixtures/java/org/apache/iceberg/rest/RESTCatalogServer.java index 5f0f89d92646..2e4541b50b33 100644 --- a/open-api/src/testFixtures/java/org/apache/iceberg/rest/RESTCatalogServer.java +++ b/open-api/src/testFixtures/java/org/apache/iceberg/rest/RESTCatalogServer.java @@ -28,12 +28,13 @@ import org.apache.iceberg.jdbc.JdbcCatalog; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.util.PropertyUtil; +import org.eclipse.jetty.compression.gzip.GzipCompression; +import org.eclipse.jetty.compression.server.CompressionHandler; +import 
org.eclipse.jetty.ee10.servlet.ServletContextHandler; +import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Connector; import org.eclipse.jetty.server.Server; import org.eclipse.jetty.server.ServerConnector; -import org.eclipse.jetty.server.handler.gzip.GzipHandler; -import org.eclipse.jetty.servlet.ServletContextHandler; -import org.eclipse.jetty.servlet.ServletHolder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -115,7 +116,9 @@ public void start(boolean join) throws Exception { ServletContextHandler context = new ServletContextHandler(ServletContextHandler.NO_SESSIONS); ServletHolder servletHolder = new ServletHolder(servlet); context.addServlet(servletHolder, "/*"); - context.insertHandler(new GzipHandler()); + CompressionHandler compressionHandler = new CompressionHandler(); + compressionHandler.putCompression(new GzipCompression()); + context.insertHandler(compressionHandler); this.httpServer = new Server( diff --git a/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReader.java b/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReader.java index 74c1303dfeda..3bd8bfbfd6b7 100644 --- a/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReader.java +++ b/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReader.java @@ -76,7 +76,7 @@ public OrcValueReader record( TypeDescription record, List names, List> fields) { - return GenericOrcReaders.struct(fields, expected, idToConstant); + return GenericOrcReaders.struct(record, fields, expected, idToConstant); } @Override diff --git a/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReaders.java b/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReaders.java index ba8cbbb749a7..faa62f770e4d 100644 --- a/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReaders.java +++ b/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReaders.java @@ -43,6 +43,7 @@ import org.apache.iceberg.variants.Variant; import 
org.apache.iceberg.variants.VariantMetadata; import org.apache.iceberg.variants.VariantValue; +import org.apache.orc.TypeDescription; import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; import org.apache.orc.storage.ql.exec.vector.ColumnVector; import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; @@ -56,11 +57,25 @@ public class GenericOrcReaders { private GenericOrcReaders() {} + /** + * @deprecated Use {@link #struct(TypeDescription, List, Types.StructType, Map)} instead. This + * method uses position-based binding which may cause field misalignment in MOR and lineage + * scenarios. + */ + @Deprecated public static OrcValueReader struct( List> readers, Types.StructType struct, Map idToConstant) { return new StructReader(readers, struct, idToConstant); } + public static OrcValueReader struct( + TypeDescription orcType, + List> readers, + Types.StructType struct, + Map idToConstant) { + return new StructReader(orcType, readers, struct, idToConstant); + } + public static OrcValueReader> array(OrcValueReader elementReader) { return new ListReader(elementReader); } @@ -231,6 +246,12 @@ public Variant nonNullRead(ColumnVector vector, int row) { private static class StructReader extends OrcValueReaders.StructReader { private final GenericRecord template; + /** + * @deprecated Use {@link #StructReader(TypeDescription, List, Types.StructType, Map)} instead. + * This constructor uses position-based binding which may cause field misalignment in MOR + * and lineage scenarios. 
+ */ + @Deprecated protected StructReader( List> readers, Types.StructType structType, @@ -239,6 +260,15 @@ protected StructReader( this.template = GenericRecord.create(structType); } + protected StructReader( + TypeDescription orcType, + List> readers, + Types.StructType structType, + Map idToConstant) { + super(orcType, readers, structType, idToConstant); + this.template = GenericRecord.create(structType); + } + @Override protected Record create() { // GenericRecord.copy() is more performant then GenericRecord.create(StructType) since diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORC.java b/orc/src/main/java/org/apache/iceberg/orc/ORC.java index 2c8fd6e436b2..9fb805246962 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ORC.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ORC.java @@ -787,11 +787,17 @@ ReadBuilder constantFieldIds(Set newConstantFieldIds) { public CloseableIterable build() { Preconditions.checkNotNull(schema, "Schema is required"); + Set idsToExclude = + Sets.difference( + Sets.union(constantFieldIds, MetadataColumns.metadataFieldIds()), + ImmutableSet.of( + MetadataColumns.ROW_ID.fieldId(), + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId())); + return new OrcIterable<>( file, conf, - TypeUtil.selectNot( - schema, Sets.union(constantFieldIds, MetadataColumns.metadataFieldIds())), + TypeUtil.selectNot(schema, idsToExclude), nameMapping, start, length, diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java b/orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java index 119b3c54f278..0f65f1b65d9c 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java @@ -103,6 +103,8 @@ public CloseableIterator iterator() { VectorizedRowBatchIterator rowBatchIterator = newOrcIterator(file, readOrcSchema, start, length, orcFileReader, sarg, recordsPerBatch); + addCloseable(rowBatchIterator); + if (batchReaderFunction != null) { OrcBatchReader 
batchReader = (OrcBatchReader) batchReaderFunction.apply(readOrcSchema); return CloseableIterator.transform( diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcValueReaders.java b/orc/src/main/java/org/apache/iceberg/orc/OrcValueReaders.java index b6d40a3d7d00..c1fba3f15add 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcValueReaders.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcValueReaders.java @@ -22,8 +22,11 @@ import java.util.List; import java.util.Map; import org.apache.iceberg.MetadataColumns; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; +import org.apache.orc.TypeDescription; import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; import org.apache.orc.storage.ql.exec.vector.ColumnVector; import org.apache.orc.storage.ql.exec.vector.DoubleColumnVector; @@ -135,12 +138,27 @@ public byte[] nonNullRead(ColumnVector vector, int row) { public abstract static class StructReader implements OrcValueReader { private final OrcValueReader[] readers; private final boolean[] isConstantOrMetadataField; - + // Maps each projected struct field position to the matching child index in the ORC schema. + // This allows fields to be read by Iceberg field ID when the projected struct order differs + // from the file schema. + private final int[] orcFieldIndex; + + /** + * @param readers readers for each field + * @param struct struct type + * @param idToConstant constant values by field id + * @deprecated Use {@link #StructReader(TypeDescription, List, Types.StructType, Map)} instead. + * This constructor uses position-based binding which may cause field misalignment in MOR + * scenarios. This doesn't work lineage scenarios. 
+ */ + @Deprecated protected StructReader( List> readers, Types.StructType struct, Map idToConstant) { List fields = struct.fields(); this.readers = new OrcValueReader[fields.size()]; this.isConstantOrMetadataField = new boolean[fields.size()]; + this.orcFieldIndex = null; + for (int pos = 0, readerIndex = 0; pos < fields.size(); pos += 1) { Types.NestedField field = fields.get(pos); if (idToConstant.containsKey(field.fieldId())) { @@ -154,7 +172,6 @@ protected StructReader( this.readers[pos] = constants(false); } else if (MetadataColumns.isMetadataColumn(field.name()) || field.type().typeId() == Type.TypeID.UNKNOWN) { - // in case of any other metadata field, fill with nulls this.isConstantOrMetadataField[pos] = true; this.readers[pos] = constants(null); } else { @@ -163,6 +180,122 @@ protected StructReader( } } + protected StructReader( + TypeDescription orcType, + List> readers, + Types.StructType struct, + Map idToConstant) { + List fields = struct.fields(); + this.readers = new OrcValueReader[fields.size()]; + this.isConstantOrMetadataField = new boolean[fields.size()]; + this.orcFieldIndex = new int[fields.size()]; + + Map> readersById = readersByFieldId(orcType, readers); + Map fieldIdToOrcIndex = buildFieldIdToOrcIndex(orcType); + + for (int pos = 0; pos < fields.size(); pos += 1) { + Types.NestedField field = fields.get(pos); + OrcValueReader fileReader = readersById.get(field.fieldId()); + int orcIndex = fieldIdToOrcIndex.getOrDefault(field.fieldId(), -1); + + if (field.equals(MetadataColumns.ROW_ID)) { + handleRowIdField(pos, field, fileReader, idToConstant, orcIndex); + } else if (field.equals(MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER)) { + handleLastUpdatedSeqField(pos, field, fileReader, idToConstant, orcIndex); + } else if (idToConstant.containsKey(field.fieldId())) { + this.isConstantOrMetadataField[pos] = true; + this.readers[pos] = constants(idToConstant.get(field.fieldId())); + } else if (field.equals(MetadataColumns.ROW_POSITION)) { + 
this.isConstantOrMetadataField[pos] = true; + this.readers[pos] = new RowPositionReader(); + } else if (field.equals(MetadataColumns.IS_DELETED)) { + this.isConstantOrMetadataField[pos] = true; + this.readers[pos] = constants(false); + } else if (fileReader != null) { + this.isConstantOrMetadataField[pos] = false; + this.orcFieldIndex[pos] = fieldIdToOrcIndex.getOrDefault(field.fieldId(), -1); + this.readers[pos] = fileReader; + } else if (MetadataColumns.isMetadataColumn(field.name()) + || field.type().typeId() == Type.TypeID.UNKNOWN) { + this.isConstantOrMetadataField[pos] = true; + this.readers[pos] = constants(null); + } else { + throw new IllegalArgumentException( + String.format("Missing ORC reader for field %s (%s)", field.name(), field.fieldId())); + } + } + } + + private Map buildFieldIdToOrcIndex(TypeDescription orcType) { + List children = orcType.getChildren(); + Map mapping = Maps.newHashMap(); + for (int i = 0; i < children.size(); i++) { + mapping.put(ORCSchemaUtil.fieldId(children.get(i)), i); + } + + return mapping; + } + + private Map> readersByFieldId( + TypeDescription orcType, List> readerList) { + List children = orcType.getChildren(); + Preconditions.checkState( + children.size() == readerList.size(), + "Invalid ORC reader binding: children=%s readers=%s", + children.size(), + readerList.size()); + + Map> readersById = Maps.newHashMap(); + for (int i = 0; i < children.size(); i += 1) { + readersById.put(ORCSchemaUtil.fieldId(children.get(i)), readerList.get(i)); + } + + return readersById; + } + + @SuppressWarnings("unchecked") + private void handleRowIdField( + int pos, + Types.NestedField field, + OrcValueReader fileReader, + Map idToConstant, + int orcIndex) { + Long firstRowId = (Long) idToConstant.get(field.fieldId()); + if (firstRowId != null) { + OrcValueReader fileIdReader = (OrcValueReader) fileReader; + this.readers[pos] = new RowIdReader(firstRowId, fileIdReader); + this.isConstantOrMetadataField[pos] = fileIdReader == null; + if 
(fileIdReader != null) { + this.orcFieldIndex[pos] = orcIndex; + } + } else { + this.isConstantOrMetadataField[pos] = true; + this.readers[pos] = constants(null); + } + } + + @SuppressWarnings("unchecked") + private void handleLastUpdatedSeqField( + int pos, + Types.NestedField field, + OrcValueReader fileReader, + Map idToConstant, + int orcIndex) { + Long fileLastUpdated = (Long) idToConstant.get(field.fieldId()); + Long firstRowId = (Long) idToConstant.get(MetadataColumns.ROW_ID.fieldId()); + if (fileLastUpdated != null && firstRowId != null) { + OrcValueReader fileSeqReader = (OrcValueReader) fileReader; + this.readers[pos] = new LastUpdatedSeqReader(fileLastUpdated, fileSeqReader); + this.isConstantOrMetadataField[pos] = fileSeqReader == null; + if (fileSeqReader != null) { + this.orcFieldIndex[pos] = orcIndex; + } + } else { + this.isConstantOrMetadataField[pos] = true; + this.readers[pos] = constants(null); + } + } + protected abstract T create(); protected abstract void set(T struct, int pos, Object value); @@ -178,14 +311,17 @@ public T nonNullRead(ColumnVector vector, int row) { } private T readInternal(T struct, ColumnVector[] columnVectors, int row) { - for (int c = 0, vectorIndex = 0; c < readers.length; ++c) { + int vectorIndex = 0; + for (int c = 0; c < readers.length; ++c) { ColumnVector vector; if (isConstantOrMetadataField[c]) { vector = null; + } else if (orcFieldIndex != null) { + vector = columnVectors[orcFieldIndex[c]]; } else { - vector = columnVectors[vectorIndex]; - vectorIndex++; + vector = columnVectors[vectorIndex++]; } + set(struct, c, reader(c).read(vector, row)); } return struct; @@ -235,4 +371,76 @@ public void setBatchContext(long newBatchOffsetInFile) { this.batchOffsetInFile = newBatchOffsetInFile; } } + + private static class RowIdReader implements OrcValueReader { + private final long firstRowId; + private final OrcValueReader fileIdReader; + private final RowPositionReader posReader; + + RowIdReader(long firstRowId, 
OrcValueReader fileIdReader) { + this.firstRowId = firstRowId; + this.fileIdReader = fileIdReader; + this.posReader = new RowPositionReader(); + } + + @Override + public Long read(ColumnVector vector, int row) { + if (fileIdReader != null) { + Long idFromFile = fileIdReader.read(vector, row); + if (idFromFile != null) { + return idFromFile; + } + } + + long pos = posReader.read(null, row); + return firstRowId + pos; + } + + @Override + public Long nonNullRead(ColumnVector vector, int row) { + return read(vector, row); + } + + @Override + public void setBatchContext(long batchOffsetInFile) { + posReader.setBatchContext(batchOffsetInFile); + if (fileIdReader != null) { + fileIdReader.setBatchContext(batchOffsetInFile); + } + } + } + + private static class LastUpdatedSeqReader implements OrcValueReader { + private final long fileLastUpdated; + private final OrcValueReader fileSeqReader; + + LastUpdatedSeqReader(long fileLastUpdated, OrcValueReader fileSeqReader) { + this.fileLastUpdated = fileLastUpdated; + this.fileSeqReader = fileSeqReader; + } + + @Override + public Long read(ColumnVector vector, int row) { + if (fileSeqReader != null) { + Long seqFromFile = fileSeqReader.read(vector, row); + if (seqFromFile != null) { + return seqFromFile; + } + } + + return fileLastUpdated; + } + + @Override + public Long nonNullRead(ColumnVector vector, int row) { + return read(vector, row); + } + + @Override + public void setBatchContext(long batchOffsetInFile) { + if (fileSeqReader != null) { + fileSeqReader.setBatchContext(batchOffsetInFile); + } + } + } } diff --git a/orc/src/test/java/org/apache/iceberg/orc/OrcWritingTestUtils.java b/orc/src/test/java/org/apache/iceberg/orc/OrcWritingTestUtils.java new file mode 100644 index 000000000000..72ed03ce2c80 --- /dev/null +++ b/orc/src/test/java/org/apache/iceberg/orc/OrcWritingTestUtils.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.orc; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; + +public class OrcWritingTestUtils { + private OrcWritingTestUtils() {} + + public static FileSystem outputFileSystem(OutputFile file) { + return new FileIOFSUtil.OutputFileSystem(file); + } + + public static FileSystem inputFileSystem(InputFile file) { + return new FileIOFSUtil.InputFileSystem(file); + } +} diff --git a/orc/src/test/java/org/apache/iceberg/orc/TestORCSchemaUtil.java b/orc/src/test/java/org/apache/iceberg/orc/TestORCSchemaUtil.java index c19e36be3ac1..e331ca94a211 100644 --- a/orc/src/test/java/org/apache/iceberg/orc/TestORCSchemaUtil.java +++ b/orc/src/test/java/org/apache/iceberg/orc/TestORCSchemaUtil.java @@ -560,4 +560,12 @@ private static boolean equalsWithIds(TypeDescription first, TypeDescription seco return true; } + + public static TypeDescription removeIds(TypeDescription type) { + return ORCSchemaUtil.removeIds(type); + } + + public static boolean hasIds(TypeDescription orcSchema) { + return ORCSchemaUtil.hasIds(orcSchema); + } } diff --git a/orc/src/test/java/org/apache/iceberg/orc/TestOrcIterableResourceCleanup.java 
b/orc/src/test/java/org/apache/iceberg/orc/TestOrcIterableResourceCleanup.java new file mode 100644 index 000000000000..6e819af5574b --- /dev/null +++ b/orc/src/test/java/org/apache/iceberg/orc/TestOrcIterableResourceCleanup.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.orc; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import org.apache.iceberg.Files; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.orc.GenericOrcReader; +import org.apache.iceberg.data.orc.GenericOrcWriter; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.io.SeekableInputStream; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.Mockito; + +public class TestOrcIterableResourceCleanup { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); + + @TempDir private File temp; + + @Test + public void testClosingIterableClosesAllStreams() throws IOException { + List inputStreams = Lists.newArrayList(); + InputFile inputFile = spyOnStreams(writeTestOrcFile(), inputStreams); + + try (CloseableIterable iterable = newOrcIterable(inputFile)) { + try (CloseableIterator iterator = iterable.iterator()) { + drain(iterator); + } + } + + verifyAllStreamsClosed(inputStreams); + } + + @Test + public void testClosingIterableClosesIteratorResources() throws IOException { + List inputStreams = Lists.newArrayList(); + InputFile inputFile = spyOnStreams(writeTestOrcFile(), inputStreams); + + // Without addCloseable(rowBatchIterator) in OrcIterable, the VectorizedRowBatchIterator + // and its RecordReader are never closed, 
leaking ORC input streams / file handles. + for (int round = 0; round < 5; round++) { + try (CloseableIterable iterable = newOrcIterable(inputFile)) { + drain(iterable.iterator()); + } + } + + verifyAllStreamsClosed(inputStreams); + } + + private static void drain(CloseableIterator iterator) { + while (iterator.hasNext()) { + iterator.next(); + } + } + + private InputFile writeTestOrcFile() throws IOException { + OutputFile outputFile = Files.localOutput(File.createTempFile("test", ".orc", temp)); + try (DataWriter writer = + ORC.writeData(outputFile) + .schema(SCHEMA) + .createWriterFunc(GenericOrcWriter::buildWriter) + .overwrite() + .withSpec(PartitionSpec.unpartitioned()) + .build()) { + GenericRecord record = GenericRecord.create(SCHEMA); + for (int i = 0; i < 10; i++) { + writer.write(record.copy(ImmutableMap.of("id", (long) i, "data", "val" + i))); + } + } + + return outputFile.toInputFile(); + } + + private static CloseableIterable newOrcIterable(InputFile input) { + return ORC.read(input) + .project(SCHEMA) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(SCHEMA, fileSchema)) + .build(); + } + + private static void verifyAllStreamsClosed(List streams) throws IOException { + for (SeekableInputStream stream : streams) { + Mockito.verify(stream, Mockito.times(1)).close(); + } + } + + private static InputFile spyOnStreams(InputFile delegate, List streams) { + InputFile inputFile = Mockito.spy(delegate); + Mockito.doAnswer( + invocation -> { + SeekableInputStream real = (SeekableInputStream) invocation.callRealMethod(); + SeekableInputStream inputStream = Mockito.spy(real); + streams.add(inputStream); + return inputStream; + }) + .when(inputFile) + .newStream(); + return inputFile; + } +} diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java index 2387d52edf2f..f02974d6e79c 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java +++ 
b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java @@ -23,6 +23,7 @@ import static org.apache.iceberg.TableProperties.DELETE_PARQUET_DICT_SIZE_BYTES; import static org.apache.iceberg.TableProperties.DELETE_PARQUET_PAGE_ROW_LIMIT; import static org.apache.iceberg.TableProperties.DELETE_PARQUET_PAGE_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.DELETE_PARQUET_PAGE_VERSION; import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT; import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT; import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_SIZE_BYTES; @@ -42,6 +43,8 @@ import static org.apache.iceberg.TableProperties.PARQUET_PAGE_ROW_LIMIT_DEFAULT; import static org.apache.iceberg.TableProperties.PARQUET_PAGE_SIZE_BYTES; import static org.apache.iceberg.TableProperties.PARQUET_PAGE_SIZE_BYTES_DEFAULT; +import static org.apache.iceberg.TableProperties.PARQUET_PAGE_VERSION; +import static org.apache.iceberg.TableProperties.PARQUET_PAGE_VERSION_DEFAULT; import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT; import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT_DEFAULT; import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT; @@ -95,7 +98,6 @@ import org.apache.iceberg.mapping.NameMapping; import org.apache.iceberg.parquet.ParquetValueWriters.PositionDeleteStructWriter; import org.apache.iceberg.parquet.ParquetValueWriters.StructWriter; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -168,7 +170,6 @@ public static class WriteBuilder implements InternalData.WriteBuilder { private 
BiFunction> createWriterFunc = null; private MetricsConfig metricsConfig = MetricsConfig.getDefault(); private ParquetFileWriter.Mode writeMode = ParquetFileWriter.Mode.CREATE; - private WriterVersion writerVersion = WriterVersion.PARQUET_1_0; private Function, Context> createContextFunc = Context::dataContext; private ByteBuffer fileEncryptionKey = null; private ByteBuffer fileAADPrefix = null; @@ -266,7 +267,12 @@ public WriteBuilder overwrite(boolean enabled) { } public WriteBuilder writerVersion(WriterVersion version) { - this.writerVersion = version; + Preconditions.checkNotNull(version, "Writer version cannot be null"); + Preconditions.checkArgument( + version == WriterVersion.PARQUET_1_0 || version == WriterVersion.PARQUET_2_0, + "Unsupported writer version: %s", + version); + config.put(PARQUET_PAGE_VERSION, version.name()); return this; } @@ -292,15 +298,6 @@ private WriteSupport getWriteSupport(MessageType type) { } } - /* - * Sets the writer version. Default value is PARQUET_1_0 (v1). 
- */ - @VisibleForTesting - WriteBuilder withWriterVersion(WriterVersion version) { - this.writerVersion = version; - return this; - } - // supposed to always be a private method used strictly by data and delete write builders WriteBuilder createContextFunc(Function, Context> newCreateContextFunc) { this.createContextFunc = newCreateContextFunc; @@ -433,7 +430,7 @@ public FileAppender build() throws IOException { ParquetProperties.Builder propsBuilder = ParquetProperties.builder() - .withWriterVersion(writerVersion) + .withWriterVersion(context.writerVersion()) .withPageSize(pageSize) .withPageRowCountLimit(pageRowLimit) .withDictionaryEncoding(dictionaryEnabled) @@ -469,7 +466,7 @@ public FileAppender build() throws IOException { } else { ParquetWriteBuilder parquetWriteBuilder = new ParquetWriteBuilder(ParquetIO.file(file)) - .withWriterVersion(writerVersion) + .withWriterVersion(context.writerVersion()) .setType(type) .setConfig(config) .setKeyValueMetadata(metadata) @@ -502,6 +499,7 @@ static class Context { private final int pageSize; private final int pageRowLimit; private final int dictionaryPageSize; + private final WriterVersion writerVersion; private final CompressionCodecName codec; private final String compressionLevel; private final int rowGroupCheckMinRecordCount; @@ -518,6 +516,7 @@ private Context( int pageSize, int pageRowLimit, int dictionaryPageSize, + WriterVersion writerVersion, CompressionCodecName codec, String compressionLevel, int rowGroupCheckMinRecordCount, @@ -532,6 +531,7 @@ private Context( this.pageSize = pageSize; this.pageRowLimit = pageRowLimit; this.dictionaryPageSize = dictionaryPageSize; + this.writerVersion = writerVersion; this.codec = codec; this.compressionLevel = compressionLevel; this.rowGroupCheckMinRecordCount = rowGroupCheckMinRecordCount; @@ -565,6 +565,10 @@ static Context dataContext(Map config) { config, PARQUET_DICT_SIZE_BYTES, PARQUET_DICT_SIZE_BYTES_DEFAULT); Preconditions.checkArgument(dictionaryPageSize > 0, 
"Dictionary page size must be > 0"); + WriterVersion writerVersion = + toWriterVersion( + config.getOrDefault(PARQUET_PAGE_VERSION, PARQUET_PAGE_VERSION_DEFAULT)); + String codecAsString = config.getOrDefault(PARQUET_COMPRESSION, PARQUET_COMPRESSION_DEFAULT); CompressionCodecName codec = toCodec(codecAsString); @@ -616,6 +620,7 @@ static Context dataContext(Map config) { pageSize, pageRowLimit, dictionaryPageSize, + writerVersion, codec, compressionLevel, rowGroupCheckMinRecordCount, @@ -652,6 +657,12 @@ static Context deleteContext(Map config) { config, DELETE_PARQUET_DICT_SIZE_BYTES, dataContext.dictionaryPageSize()); Preconditions.checkArgument(dictionaryPageSize > 0, "Dictionary page size must be > 0"); + String deletePageVersion = config.get(DELETE_PARQUET_PAGE_VERSION); + WriterVersion writerVersion = + deletePageVersion != null + ? toWriterVersion(deletePageVersion) + : dataContext.writerVersion(); + String codecAsString = config.get(DELETE_PARQUET_COMPRESSION); CompressionCodecName codec = codecAsString != null ? 
toCodec(codecAsString) : dataContext.codec(); @@ -686,6 +697,7 @@ static Context deleteContext(Map config) { pageSize, pageRowLimit, dictionaryPageSize, + writerVersion, codec, compressionLevel, rowGroupCheckMinRecordCount, @@ -706,6 +718,15 @@ private static CompressionCodecName toCodec(String codecAsString) { } } + private static WriterVersion toWriterVersion(String pageVersion) { + try { + return WriterVersion.fromString(pageVersion); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException( + "Unsupported Parquet page version: " + pageVersion + " (must be v1 or v2)"); + } + } + int rowGroupSize() { return rowGroupSize; } @@ -722,6 +743,10 @@ int dictionaryPageSize() { return dictionaryPageSize; } + WriterVersion writerVersion() { + return writerVersion; + } + CompressionCodecName codec() { return codec; } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroWriter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroWriter.java index 114e5fe27545..75b3a6604084 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroWriter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroWriter.java @@ -103,6 +103,11 @@ public ParquetValueWriter map( ParquetValueWriters.option(valueType, valueD, valueWriter)); } + @Override + public ParquetValueWriter variant(GroupType variant) { + throw new UnsupportedOperationException("Avro writer does not support variant types"); + } + @Override public ParquetValueWriter primitive(PrimitiveType primitive) { ColumnDescriptor desc = type.getColumnDescription(currentPath()); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java index fbd7a6e97fe2..90dd6e117ba8 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java @@ -19,13 +19,16 @@ 
package org.apache.iceberg.parquet; import java.io.IOException; +import java.io.UncheckedIOException; import java.nio.ByteBuffer; import java.util.Map; import java.util.function.Function; +import java.util.function.UnaryOperator; import org.apache.iceberg.FileContent; import org.apache.iceberg.FileFormat; import org.apache.iceberg.MetricsConfig; import org.apache.iceberg.Schema; +import org.apache.iceberg.TableProperties; import org.apache.iceberg.data.parquet.GenericParquetWriter; import org.apache.iceberg.deletes.PositionDelete; import org.apache.iceberg.encryption.EncryptedOutputFile; @@ -33,6 +36,7 @@ import org.apache.iceberg.formats.BaseFormatModel; import org.apache.iceberg.formats.ModelWriteBuilder; import org.apache.iceberg.formats.ReadBuilder; +import org.apache.iceberg.io.BufferedFileAppender; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.DeleteSchemaUtil; import org.apache.iceberg.io.FileAppender; @@ -40,16 +44,18 @@ import org.apache.iceberg.mapping.NameMapping; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; public class ParquetFormatModel extends BaseFormatModel, R, MessageType> { - public static final String WRITER_VERSION_KEY = "parquet.writer.version"; private final boolean isBatchReader; + private final VariantShreddingAnalyzer variantAnalyzer; + private final UnaryOperator copyFunc; public static ParquetFormatModel, Void, Object> forPositionDeletes() { - return new ParquetFormatModel<>(PositionDelete.deleteClass(), Void.class, null, null, false); + return new ParquetFormatModel<>( + PositionDelete.deleteClass(), Void.class, null, null, false, null, null); } public static ParquetFormatModel> create( @@ -57,14 +63,26 @@ public static ParquetFormatModel> create( Class schemaType, 
WriterFunction, S, MessageType> writerFunction, ReaderFunction, S, MessageType> readerFunction) { - return new ParquetFormatModel<>(type, schemaType, writerFunction, readerFunction, false); + return new ParquetFormatModel<>( + type, schemaType, writerFunction, readerFunction, false, null, null); + } + + public static ParquetFormatModel> create( + Class type, + Class schemaType, + WriterFunction, S, MessageType> writerFunction, + ReaderFunction, S, MessageType> readerFunction, + VariantShreddingAnalyzer variantAnalyzer, + UnaryOperator copyFunc) { + return new ParquetFormatModel<>( + type, schemaType, writerFunction, readerFunction, false, variantAnalyzer, copyFunc); } public static ParquetFormatModel> create( Class type, Class schemaType, ReaderFunction, S, MessageType> batchReaderFunction) { - return new ParquetFormatModel<>(type, schemaType, null, batchReaderFunction, true); + return new ParquetFormatModel<>(type, schemaType, null, batchReaderFunction, true, null, null); } private ParquetFormatModel( @@ -72,9 +90,13 @@ private ParquetFormatModel( Class schemaType, WriterFunction, S, MessageType> writerFunction, ReaderFunction readerFunction, - boolean isBatchReader) { + boolean isBatchReader, + VariantShreddingAnalyzer variantAnalyzer, + UnaryOperator copyFunc) { super(type, schemaType, writerFunction, readerFunction); this.isBatchReader = isBatchReader; + this.variantAnalyzer = variantAnalyzer; + this.copyFunc = copyFunc; } @Override @@ -84,7 +106,7 @@ public FileFormat format() { @Override public ModelWriteBuilder writeBuilder(EncryptedOutputFile outputFile) { - return new WriteBuilderWrapper<>(outputFile, writerFunction()); + return new WriteBuilderWrapper<>(outputFile, writerFunction(), variantAnalyzer, copyFunc); } @Override @@ -95,15 +117,23 @@ public ReadBuilder readBuilder(InputFile inputFile) { private static class WriteBuilderWrapper implements ModelWriteBuilder { private final Parquet.WriteBuilder internal; private final WriterFunction, S, MessageType> 
writerFunction; + private final VariantShreddingAnalyzer variantAnalyzer; + private final UnaryOperator copyFunc; private Schema schema; private S engineSchema; private FileContent content; + private boolean shreddingEnabled = false; + private int bufferSize = TableProperties.PARQUET_VARIANT_BUFFER_SIZE_DEFAULT; private WriteBuilderWrapper( EncryptedOutputFile outputFile, - WriterFunction, S, MessageType> writerFunction) { + WriterFunction, S, MessageType> writerFunction, + VariantShreddingAnalyzer variantAnalyzer, + UnaryOperator copyFunc) { this.internal = Parquet.write(outputFile); this.writerFunction = writerFunction; + this.variantAnalyzer = variantAnalyzer; + this.copyFunc = copyFunc; } @Override @@ -121,8 +151,12 @@ public ModelWriteBuilder engineSchema(S newSchema) { @Override public ModelWriteBuilder set(String property, String value) { - if (WRITER_VERSION_KEY.equals(property)) { - internal.writerVersion(ParquetProperties.WriterVersion.valueOf(value)); + if (TableProperties.PARQUET_SHRED_VARIANTS.equals(property)) { + shreddingEnabled = Boolean.parseBoolean(value); + } + + if (TableProperties.PARQUET_VARIANT_BUFFER_SIZE.equals(property)) { + bufferSize = Integer.parseInt(value); } internal.set(property, value); @@ -131,7 +165,7 @@ public ModelWriteBuilder set(String property, String value) { @Override public ModelWriteBuilder setAll(Map properties) { - internal.setAll(properties); + properties.forEach(this::set); return this; } @@ -179,12 +213,14 @@ public ModelWriteBuilder withAADPrefix(ByteBuffer aadPrefix) { @Override public FileAppender build() throws IOException { + boolean shredVariants = false; switch (content) { case DATA: internal.createContextFunc(Parquet.WriteBuilder.Context::dataContext); internal.createWriterFunc( (icebergSchema, messageType) -> writerFunction.write(icebergSchema, messageType, engineSchema)); + shredVariants = shreddingEnabled && variantAnalyzer != null && hasVariantColumns(schema); break; case EQUALITY_DELETES: 
internal.createContextFunc(Parquet.WriteBuilder.Context::deleteContext); @@ -215,8 +251,45 @@ public FileAppender build() throws IOException { throw new IllegalArgumentException("Unknown file content: " + content); } + if (shredVariants) { + return buildShreddedAppender(); + } + return internal.build(); } + + /** + * Creates a {@link BufferedFileAppender} that buffers the first N rows, runs variant shredding + * analysis on them, then creates the real Parquet appender with a shredded schema. + * + *

    Only top-level variant columns are shredded. Nested variants (inside structs/lists/maps) + * fall through to unshredded 2-field layout because column index resolution only applies to + * top-level fields. + */ + private FileAppender buildShreddedAppender() { + return new BufferedFileAppender<>( + bufferSize, + bufferedRows -> { + Map shreddedTypes = + variantAnalyzer.analyzeVariantColumns(bufferedRows, schema, engineSchema); + + if (!shreddedTypes.isEmpty()) { + internal.variantShreddingFunc((fieldId, name) -> shreddedTypes.get(fieldId)); + } + + try { + return internal.build(); + } catch (IOException e) { + throw new UncheckedIOException("Failed to create shredded variant writer", e); + } + }, + copyFunc); + } + + private static boolean hasVariantColumns(Schema schema) { + return schema != null + && schema.columns().stream().anyMatch(field -> field.type().isVariantType()); + } } private static class ReadBuilderWrapper implements ReadBuilder { diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetVariantWriters.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetVariantWriters.java index 9e94b1bbd6cd..e5c56da166f4 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetVariantWriters.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetVariantWriters.java @@ -18,6 +18,8 @@ */ package org.apache.iceberg.parquet; +import java.math.BigDecimal; +import java.math.RoundingMode; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.Arrays; @@ -99,6 +101,16 @@ static ParquetValueWriter objects( builder.build()); } + @SuppressWarnings("unchecked") + static ParquetValueWriter decimal( + ParquetValueWriter writer, int precision, int scale, PhysicalType... 
types) { + return new DecimalPrimitiveWriter( + (ParquetValueWriter) writer, + Sets.immutableEnumSet(Arrays.asList(types)), + precision, + scale); + } + @SuppressWarnings("unchecked") public static ParquetValueWriter array( int repeatedDefinitionLevel, @@ -220,6 +232,10 @@ protected int writeTo(ByteBuffer buffer, int offset, VariantValue value) { private interface TypedWriter extends ParquetValueWriter { Set types(); + + default boolean canWrite(VariantValue value) { + return true; + } } private static class PrimitiveWriter implements TypedWriter { @@ -274,7 +290,7 @@ private ShreddedVariantWriter( @Override public void write(int repetitionLevel, VariantValue value) { - if (typedWriter.types().contains(value.type())) { + if (typedWriter.types().contains(value.type()) && typedWriter.canWrite(value)) { typedWriter.write(repetitionLevel, value); writeNull(valueWriter, repetitionLevel, valueDefinitionLevel); } else { @@ -372,6 +388,49 @@ public void setColumnStore(ColumnWriteStore columnStore) { } } + private static class DecimalPrimitiveWriter implements TypedWriter { + private final Set types; + private final ParquetValueWriter writer; + private final int precision; + private final int scale; + + private DecimalPrimitiveWriter( + ParquetValueWriter writer, Set types, int precision, int scale) { + this.types = types; + this.writer = writer; + this.precision = precision; + this.scale = scale; + } + + @Override + public Set types() { + return types; + } + + @Override + public boolean canWrite(VariantValue value) { + BigDecimal decimal = (BigDecimal) value.asPrimitive().get(); + int integerDigits = decimal.precision() - decimal.scale(); + return decimal.scale() <= scale && integerDigits + scale <= precision; + } + + @Override + public void write(int repetitionLevel, VariantValue value) { + BigDecimal decimal = (BigDecimal) value.asPrimitive().get(); + writer.write(repetitionLevel, decimal.setScale(scale, RoundingMode.UNNECESSARY)); + } + + @Override + public List> 
columns() { + return writer.columns(); + } + + @Override + public void setColumnStore(ColumnWriteStore columnStore) { + writer.setColumnStore(columnStore); + } + } + private static class ArrayWriter implements TypedWriter { private final int definitionLevel; private final int repetitionLevel; diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/VariantShreddingAnalyzer.java b/parquet/src/main/java/org/apache/iceberg/parquet/VariantShreddingAnalyzer.java new file mode 100644 index 000000000000..d2a058c1128a --- /dev/null +++ b/parquet/src/main/java/org/apache/iceberg/parquet/VariantShreddingAnalyzer.java @@ -0,0 +1,532 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.parquet; + +import java.math.BigDecimal; +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.apache.iceberg.Schema; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.variants.PhysicalType; +import org.apache.iceberg.variants.VariantArray; +import org.apache.iceberg.variants.VariantObject; +import org.apache.iceberg.variants.VariantPrimitive; +import org.apache.iceberg.variants.VariantValue; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +/** + * Analyzes variant data across buffered rows to determine an optimal shredding schema. + * + *

    Determinism contract: for a given set of variant values (regardless of row arrival order), + * this analyzer produces the same shredded schema. When the number of distinct fields at any level + * exceeds {@code MAX_INTERMEDIATE_FIELDS}, field tracking becomes insertion-order dependent and + * determinism is not guaranteed. + * + *

      + *
    • Object fields use a TreeMap, so field ordering is alphabetical and deterministic. + *
    • Type selection picks the most common type with explicit tie-break priority (see + * TIE_BREAK_PRIORITY), not enum ordinal. + *
    • Integer types (INT8/16/32/64) and decimal types (DECIMAL4/8/16) are each promoted to the + * widest observed before competing with other types. + *
    • Fields below {@code MIN_FIELD_FREQUENCY} are pruned. Above {@code MAX_SHREDDED_FIELDS}, the + * most frequent are kept with alphabetical tie-breaking. + *
    • Recursion into nested objects/arrays stops at {@code MAX_SHREDDING_DEPTH} (default 50). + *
    • New struct fields are not tracked once a node reaches {@code MAX_INTERMEDIATE_FIELDS} + * (default 1000) to bound memory during inference. + *
    + * + *

    This contract holds within a single batch. Different batches with different distributions may + * produce different layouts; cross-batch stability requires schema pinning (not yet implemented). + * + *

    Subclasses implement {@link #extractVariantValues} to convert engine-specific row types into + * {@link VariantValue} instances. + * + * @param the engine-specific row type (e.g., Spark InternalRow, Flink RowData) + * @param the engine-specific schema type (e.g., Spark StructType, Flink RowType) + */ +public abstract class VariantShreddingAnalyzer { + private static final String TYPED_VALUE = "typed_value"; + private static final String VALUE = "value"; + private static final String ELEMENT = "element"; + private static final double MIN_FIELD_FREQUENCY = 0.10; + private static final int MAX_SHREDDED_FIELDS = 300; + private static final int MAX_SHREDDING_DEPTH = 50; + private static final int MAX_INTERMEDIATE_FIELDS = 1000; + + protected VariantShreddingAnalyzer() {} + + /** + * Analyzes buffered variant values to determine the optimal shredding schema. + * + * @param bufferedRows the buffered rows to analyze + * @param variantFieldIndex the index of the variant field in the rows + * @return the shredded schema type, or null if no shredding should be performed + */ + public Type analyzeAndCreateSchema(List bufferedRows, int variantFieldIndex) { + List variantValues = extractVariantValues(bufferedRows, variantFieldIndex); + if (variantValues.isEmpty()) { + return null; + } + + PathNode root = buildPathTree(variantValues); + PhysicalType rootType = root.info.getMostCommonType(); + if (rootType == null) { + return null; + } + + pruneInfrequentFields(root, root.info.observationCount); + + return buildTypedValue(root, rootType); + } + + protected abstract List extractVariantValues( + List bufferedRows, int variantFieldIndex); + + /** + * Resolves a column name to its index in the engine-specific schema. Returns -1 if the column is + * not found. 
+ */ + protected abstract int resolveColumnIndex(S engineSchema, String columnName); + + /** + * Analyzes all variant columns in the schema, resolving column indices via the engine-specific + * {@link #resolveColumnIndex} method. + * + * @param bufferedRows the buffered rows to analyze + * @param icebergSchema the Iceberg table schema + * @param engineSchema the engine-specific schema used to resolve column indices + * @return a map from Iceberg field ID to the shredded Parquet type for each variant column + */ + public Map analyzeVariantColumns( + List bufferedRows, Schema icebergSchema, S engineSchema) { + Map shreddedTypes = Maps.newHashMap(); + for (NestedField col : icebergSchema.columns()) { + if (col.type().isVariantType()) { + int rowIndex = resolveColumnIndex(engineSchema, col.name()); + if (rowIndex >= 0) { + Type typed = analyzeAndCreateSchema(bufferedRows, rowIndex); + if (typed != null) { + shreddedTypes.put(col.fieldId(), typed); + } + } + } + } + + return shreddedTypes; + } + + private static PathNode buildPathTree(List variantValues) { + PathNode root = new PathNode(null); + root.info = new FieldInfo(); + + for (VariantValue value : variantValues) { + traverse(root, value, 0); + } + + return root; + } + + private static void pruneInfrequentFields(PathNode node, int totalRows) { + if (node.objectChildren.isEmpty() && node.arrayElement == null) { + return; + } + + // Remove fields below frequency threshold + node.objectChildren + .entrySet() + .removeIf( + entry -> { + FieldInfo info = entry.getValue().info; + return info != null + && ((double) info.observationCount / totalRows) < MIN_FIELD_FREQUENCY; + }); + + // Cap at MAX_SHREDDED_FIELDS, keep the most frequently observed + if (node.objectChildren.size() > MAX_SHREDDED_FIELDS) { + List> sorted = Lists.newArrayList(node.objectChildren.entrySet()); + sorted.sort( + (a, b) -> { + int cmp = + Integer.compare( + b.getValue().info.observationCount, a.getValue().info.observationCount); + return cmp != 0 ? 
cmp : a.getKey().compareTo(b.getKey()); + }); + Set keep = Sets.newHashSet(); + for (int i = 0; i < MAX_SHREDDED_FIELDS; i++) { + keep.add(sorted.get(i).getKey()); + } + node.objectChildren.entrySet().removeIf(entry -> !keep.contains(entry.getKey())); + } + + // Recurse into remaining object children + for (PathNode child : node.objectChildren.values()) { + pruneInfrequentFields(child, totalRows); + } + + // Recurse into array elements (arrays of objects need pruning too) + if (node.arrayElement != null) { + pruneInfrequentFields(node.arrayElement, totalRows); + } + } + + private static void traverse(PathNode node, VariantValue value, int depth) { + if (value == null || value.type() == PhysicalType.NULL) { + return; + } + + node.info.observe(value); + + if (value.type() == PhysicalType.OBJECT && depth < MAX_SHREDDING_DEPTH) { + traverseObject(node, value.asObject(), depth); + } else if (value.type() == PhysicalType.ARRAY && depth < MAX_SHREDDING_DEPTH) { + traverseArray(node, value.asArray(), depth); + } + } + + private static void traverseObject(PathNode node, VariantObject obj, int depth) { + for (String fieldName : obj.fieldNames()) { + VariantValue fieldValue = obj.get(fieldName); + if (fieldValue != null) { + PathNode childNode = node.objectChildren.get(fieldName); + if (childNode == null) { + if (node.objectChildren.size() >= MAX_INTERMEDIATE_FIELDS) { + continue; + } + childNode = new PathNode(fieldName); + childNode.info = new FieldInfo(); + node.objectChildren.put(fieldName, childNode); + } + traverse(childNode, fieldValue, depth + 1); + } + } + } + + // observationCount inside arrays counts per-element, not per-row, so fields in long arrays + // have inflated frequency and resist pruning. 
+ private static void traverseArray(PathNode node, VariantArray array, int depth) { + int numElements = array.numElements(); + if (node.arrayElement == null) { + node.arrayElement = new PathNode(null); + node.arrayElement.info = new FieldInfo(); + } + for (int i = 0; i < numElements; i++) { + VariantValue element = array.get(i); + if (element != null) { + traverse(node.arrayElement, element, depth + 1); + } + } + } + + private static Type buildFieldGroup(PathNode node) { + PhysicalType commonType = node.info.getMostCommonType(); + if (commonType == null) { + return null; + } + + Type typedValue = buildTypedValue(node, commonType); + if (typedValue == null) { + return null; + } + + return Types.buildGroup(Type.Repetition.REQUIRED) + .optional(PrimitiveType.PrimitiveTypeName.BINARY) + .named(VALUE) + .addField(typedValue) + .named(node.fieldName); + } + + private static Type buildTypedValue(PathNode node, PhysicalType physicalType) { + return switch (physicalType) { + case ARRAY -> createArrayTypedValue(node); + case OBJECT -> createObjectTypedValue(node); + default -> createPrimitiveTypedValue(node.info, physicalType); + }; + } + + private static Type createObjectTypedValue(PathNode node) { + if (node.objectChildren.isEmpty()) { + return null; + } + + Types.GroupBuilder builder = Types.buildGroup(Type.Repetition.OPTIONAL); + boolean hasFields = false; + for (PathNode child : node.objectChildren.values()) { + Type fieldType = buildFieldGroup(child); + if (fieldType != null) { + builder.addField(fieldType); + hasFields = true; + } + } + + return hasFields ? 
builder.named(TYPED_VALUE) : null; + } + + private static Type createArrayTypedValue(PathNode node) { + PathNode elementNode = node.arrayElement; + if (elementNode == null) { + return null; + } + PhysicalType elementType = elementNode.info.getMostCommonType(); + if (elementType == null) { + return null; + } + Type elementTypedValue = buildTypedValue(elementNode, elementType); + if (elementTypedValue == null) { + return null; + } + + GroupType elementGroup = + Types.buildGroup(Type.Repetition.REQUIRED) + .optional(PrimitiveType.PrimitiveTypeName.BINARY) + .named(VALUE) + .addField(elementTypedValue) + .named(ELEMENT); + + return Types.optionalList().element(elementGroup).named(TYPED_VALUE); + } + + private static class PathNode { + private final String fieldName; + private final Map objectChildren = Maps.newTreeMap(); + private PathNode arrayElement = null; + private FieldInfo info = null; + + private PathNode(String fieldName) { + this.fieldName = fieldName; + } + } + + /** Use DECIMAL with maximum precision and scale as the shredding type */ + private static Type createDecimalTypedValue(FieldInfo info) { + int maxPrecision = Math.min(info.maxDecimalIntegerDigits + info.maxDecimalScale, 38); + int maxScale = Math.min(info.maxDecimalScale, Math.max(0, 38 - info.maxDecimalIntegerDigits)); + + if (maxPrecision <= 9) { + return Types.optional(PrimitiveType.PrimitiveTypeName.INT32) + .as(LogicalTypeAnnotation.decimalType(maxScale, maxPrecision)) + .named(TYPED_VALUE); + } else if (maxPrecision <= 18) { + return Types.optional(PrimitiveType.PrimitiveTypeName.INT64) + .as(LogicalTypeAnnotation.decimalType(maxScale, maxPrecision)) + .named(TYPED_VALUE); + } else { + return Types.optional(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) + .length(16) + .as(LogicalTypeAnnotation.decimalType(maxScale, maxPrecision)) + .named(TYPED_VALUE); + } + } + + private static Type createPrimitiveTypedValue(FieldInfo info, PhysicalType primitiveType) { + return switch 
(primitiveType) { + case BOOLEAN_TRUE, BOOLEAN_FALSE -> + Types.optional(PrimitiveType.PrimitiveTypeName.BOOLEAN).named(TYPED_VALUE); + case INT8 -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT32) + .as(LogicalTypeAnnotation.intType(8, true)) + .named(TYPED_VALUE); + case INT16 -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT32) + .as(LogicalTypeAnnotation.intType(16, true)) + .named(TYPED_VALUE); + case INT32 -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT32) + .as(LogicalTypeAnnotation.intType(32, true)) + .named(TYPED_VALUE); + case INT64 -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT64) + .as(LogicalTypeAnnotation.intType(64, true)) + .named(TYPED_VALUE); + case FLOAT -> Types.optional(PrimitiveType.PrimitiveTypeName.FLOAT).named(TYPED_VALUE); + case DOUBLE -> Types.optional(PrimitiveType.PrimitiveTypeName.DOUBLE).named(TYPED_VALUE); + case STRING -> + Types.optional(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.stringType()) + .named(TYPED_VALUE); + case BINARY -> Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).named(TYPED_VALUE); + case TIME -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT64) + .as(LogicalTypeAnnotation.timeType(false, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named(TYPED_VALUE); + case DATE -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT32) + .as(LogicalTypeAnnotation.dateType()) + .named(TYPED_VALUE); + case TIMESTAMPTZ -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT64) + .as(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named(TYPED_VALUE); + case TIMESTAMPNTZ -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT64) + .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named(TYPED_VALUE); + case TIMESTAMPTZ_NANOS -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT64) + .as(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.NANOS)) + .named(TYPED_VALUE); 
+ case TIMESTAMPNTZ_NANOS -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT64) + .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.NANOS)) + .named(TYPED_VALUE); + case DECIMAL4, DECIMAL8, DECIMAL16 -> createDecimalTypedValue(info); + case UUID -> + Types.optional(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) + .length(16) + .as(LogicalTypeAnnotation.uuidType()) + .named(TYPED_VALUE); + default -> + throw new UnsupportedOperationException( + "Unknown primitive physical type: " + primitiveType); + }; + } + + /** Tracks occurrence count and types for a single field. */ + private static class FieldInfo { + private final Map typeCounts = Maps.newHashMap(); + private int maxDecimalScale = 0; + private int maxDecimalIntegerDigits = 0; + private int observationCount = 0; + + private static final Map INTEGER_PRIORITY = + ImmutableMap.of( + PhysicalType.INT8, 0, + PhysicalType.INT16, 1, + PhysicalType.INT32, 2, + PhysicalType.INT64, 3); + + private static final Map DECIMAL_PRIORITY = + ImmutableMap.of( + PhysicalType.DECIMAL4, 0, + PhysicalType.DECIMAL8, 1, + PhysicalType.DECIMAL16, 2); + + private static final Map TIE_BREAK_PRIORITY = + ImmutableMap.builder() + .put(PhysicalType.BOOLEAN_TRUE, 0) + .put(PhysicalType.INT8, 1) + .put(PhysicalType.INT16, 2) + .put(PhysicalType.INT32, 3) + .put(PhysicalType.INT64, 4) + .put(PhysicalType.FLOAT, 5) + .put(PhysicalType.DOUBLE, 6) + .put(PhysicalType.DECIMAL4, 7) + .put(PhysicalType.DECIMAL8, 8) + .put(PhysicalType.DECIMAL16, 9) + .put(PhysicalType.DATE, 10) + .put(PhysicalType.TIME, 11) + .put(PhysicalType.TIMESTAMPTZ, 12) + .put(PhysicalType.TIMESTAMPNTZ, 13) + .put(PhysicalType.BINARY, 14) + .put(PhysicalType.STRING, 15) + .put(PhysicalType.TIMESTAMPTZ_NANOS, 16) + .put(PhysicalType.TIMESTAMPNTZ_NANOS, 17) + .put(PhysicalType.UUID, 18) + .buildOrThrow(); + + void observe(VariantValue value) { + observationCount++; + // Use BOOLEAN_TRUE for both TRUE/FALSE values + PhysicalType type = + 
value.type() == PhysicalType.BOOLEAN_FALSE ? PhysicalType.BOOLEAN_TRUE : value.type(); + + typeCounts.compute(type, (k, v) -> (v == null) ? 1 : v + 1); + + // Track max precision and scale for decimal types + if (type == PhysicalType.DECIMAL4 + || type == PhysicalType.DECIMAL8 + || type == PhysicalType.DECIMAL16) { + VariantPrimitive primitive = value.asPrimitive(); + Object decimalValue = primitive.get(); + if (decimalValue instanceof BigDecimal bd) { + maxDecimalIntegerDigits = Math.max(maxDecimalIntegerDigits, bd.precision() - bd.scale()); + maxDecimalScale = Math.max(maxDecimalScale, bd.scale()); + } + } + } + + PhysicalType getMostCommonType() { + Map combinedCounts = Maps.newHashMap(); + + int integerTotalCount = 0; + PhysicalType mostCapableInteger = null; + + int decimalTotalCount = 0; + PhysicalType mostCapableDecimal = null; + + for (Map.Entry entry : typeCounts.entrySet()) { + PhysicalType type = entry.getKey(); + int count = entry.getValue(); + + if (isIntegerType(type)) { + integerTotalCount += count; + if (mostCapableInteger == null + || INTEGER_PRIORITY.get(type) > INTEGER_PRIORITY.get(mostCapableInteger)) { + mostCapableInteger = type; + } + } else if (isDecimalType(type)) { + decimalTotalCount += count; + if (mostCapableDecimal == null + || DECIMAL_PRIORITY.get(type) > DECIMAL_PRIORITY.get(mostCapableDecimal)) { + mostCapableDecimal = type; + } + } else { + combinedCounts.put(type, count); + } + } + + if (mostCapableInteger != null) { + combinedCounts.put(mostCapableInteger, integerTotalCount); + } + + if (mostCapableDecimal != null) { + combinedCounts.put(mostCapableDecimal, decimalTotalCount); + } + + // Pick the most common type with tie-breaking + return combinedCounts.entrySet().stream() + .max( + Map.Entry.comparingByValue() + .thenComparingInt(entry -> TIE_BREAK_PRIORITY.getOrDefault(entry.getKey(), -1))) + .map(Map.Entry::getKey) + .orElse(null); + } + + private static boolean isIntegerType(PhysicalType type) { + return type == 
PhysicalType.INT8 + || type == PhysicalType.INT16 + || type == PhysicalType.INT32 + || type == PhysicalType.INT64; + } + + private static boolean isDecimalType(PhysicalType type) { + return type == PhysicalType.DECIMAL4 + || type == PhysicalType.DECIMAL8 + || type == PhysicalType.DECIMAL16; + } + } +} diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/VariantWriterBuilder.java b/parquet/src/main/java/org/apache/iceberg/parquet/VariantWriterBuilder.java index a447a102690a..da409c92f113 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/VariantWriterBuilder.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/VariantWriterBuilder.java @@ -202,23 +202,29 @@ public Optional> visit(DecimalLogicalTypeAnnotation decima case FIXED_LEN_BYTE_ARRAY: case BINARY: writer = - ParquetVariantWriters.primitive( + ParquetVariantWriters.decimal( ParquetValueWriters.decimalAsFixed( desc, decimal.getPrecision(), decimal.getScale()), + decimal.getPrecision(), + decimal.getScale(), PhysicalType.DECIMAL16); return Optional.of(writer); case INT64: writer = - ParquetVariantWriters.primitive( + ParquetVariantWriters.decimal( ParquetValueWriters.decimalAsLong( desc, decimal.getPrecision(), decimal.getScale()), + decimal.getPrecision(), + decimal.getScale(), PhysicalType.DECIMAL8); return Optional.of(writer); case INT32: writer = - ParquetVariantWriters.primitive( + ParquetVariantWriters.decimal( ParquetValueWriters.decimalAsInteger( desc, decimal.getPrecision(), decimal.getScale()), + decimal.getPrecision(), + decimal.getScale(), PhysicalType.DECIMAL4); return Optional.of(writer); } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/ParquetWritingTestUtils.java b/parquet/src/test/java/org/apache/iceberg/parquet/ParquetWritingTestUtils.java index b8cd38f56dfe..441073d34a4e 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/ParquetWritingTestUtils.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/ParquetWritingTestUtils.java @@ -35,7 
+35,7 @@ import org.apache.parquet.schema.MessageType; /** Utilities for tests that need to write Parquet files. */ -class ParquetWritingTestUtils { +public class ParquetWritingTestUtils { private ParquetWritingTestUtils() {} diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java index 7a15f8609823..22f8068c0fa3 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java @@ -179,7 +179,7 @@ public void createInputFile() throws IOException { OutputFile outFile = Files.localOutput(parquetFile); try (FileAppender appender = - Parquet.write(outFile).schema(FILE_SCHEMA).withWriterVersion(writerVersion).build()) { + Parquet.write(outFile).schema(FILE_SCHEMA).writerVersion(writerVersion).build()) { GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table")); // create 20 copies of each record to ensure dictionary-encoding for (int copy = 0; copy < 20; copy += 1) { diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java index 58850ec7c9f4..5f1e0c83cc0f 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java @@ -30,6 +30,7 @@ import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; import java.io.IOException; @@ -57,6 +58,7 @@ import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.IntegerType; import org.apache.iceberg.util.Pair; +import org.apache.iceberg.variants.Variant; 
import org.apache.parquet.avro.AvroParquetWriter; import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.hadoop.ParquetFileReader; @@ -64,7 +66,9 @@ import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.io.LocalOutputFile; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; @@ -314,6 +318,26 @@ public void testFooterMetricsWithNameMappingForFileWithoutIds() throws IOExcepti } } + @Test + public void testAvroWriterRejectsVariantType() { + MessageType schema = + org.apache.parquet.schema.Types.buildMessage() + .optional(PrimitiveTypeName.INT32) + .named("id") + .optionalGroup() + .as(LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION)) + .required(PrimitiveTypeName.BINARY) + .named("metadata") + .required(PrimitiveTypeName.BINARY) + .named("value") + .named("v") + .named("table"); + + assertThatThrownBy(() -> ParquetAvroWriter.buildWriter(schema)) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessage("Avro writer does not support variant types"); + } + private Pair generateFile( Function> createWriterFunc, int desiredRecordCount, diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetDataWriter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetDataWriter.java index 3918fdc63084..36e254628a6a 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetDataWriter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetDataWriter.java @@ -42,8 +42,11 @@ import org.apache.iceberg.data.Record; import org.apache.iceberg.data.parquet.GenericParquetReaders; import org.apache.iceberg.data.parquet.GenericParquetWriter; +import org.apache.iceberg.encryption.EncryptedFiles; +import 
org.apache.iceberg.io.BufferedFileAppender; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.FileAppender; import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -52,8 +55,12 @@ import org.apache.iceberg.variants.Variant; import org.apache.iceberg.variants.VariantMetadata; import org.apache.iceberg.variants.VariantTestUtil; +import org.apache.iceberg.variants.VariantValue; import org.apache.iceberg.variants.Variants; +import org.apache.parquet.example.data.Group; import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.junit.jupiter.api.BeforeEach; @@ -331,4 +338,207 @@ public void testDataWriterWithVariantShredding() throws IOException { testDataWriter( variantSchema, (id, name) -> ParquetVariantUtil.toParquetSchema(variant.value())); } + + @Test + public void testShreddingWriteReturnsBufferedAppender() throws IOException { + Schema variantSchema = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "v", Types.VariantType.get())); + + VariantShreddingAnalyzer testAnalyzer = + new VariantShreddingAnalyzer() { + @Override + protected List extractVariantValues(List rows, int idx) { + return java.util.Collections.emptyList(); + } + + @Override + protected int resolveColumnIndex(Void engineSchema, String columnName) { + return -1; + } + }; + + OutputFile outputFile = Files.localOutput(createTempFile(temp)); + + ParquetFormatModel> model = + ParquetFormatModel.create( + Record.class, + Void.class, + (icebergSchema, messageType, engineSchema) -> + GenericParquetWriter.create(icebergSchema, messageType), + 
(icebergSchema, fileSchema, engineSchema, idToConstant) -> + GenericParquetReaders.buildReader(icebergSchema, fileSchema), + testAnalyzer, + record -> record); + + try (FileAppender appender = + model + .writeBuilder(EncryptedFiles.plainAsEncryptedOutput(outputFile)) + .schema(variantSchema) + .setAll(ImmutableMap.of(TableProperties.PARQUET_SHRED_VARIANTS, "true")) + .content(FileContent.DATA) + .build()) { + assertThat(appender).isInstanceOf(BufferedFileAppender.class); + } + } + + @Test + public void testWriteBuilderReturnsDirectAppenderWithNullAnalyzer() throws IOException { + Schema variantSchema = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "v", Types.VariantType.get())); + + OutputFile outputFile = Files.localOutput(createTempFile(temp)); + + ParquetFormatModel> model = + ParquetFormatModel.create( + Record.class, + Void.class, + (icebergSchema, messageType, engineSchema) -> + GenericParquetWriter.create(icebergSchema, messageType), + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + GenericParquetReaders.buildReader(icebergSchema, fileSchema), + null, + null); + + try (FileAppender appender = + model + .writeBuilder(EncryptedFiles.plainAsEncryptedOutput(outputFile)) + .schema(variantSchema) + .setAll(ImmutableMap.of(TableProperties.PARQUET_SHRED_VARIANTS, "true")) + .content(FileContent.DATA) + .build()) { + // Even with shredding property set, null variantAnalyzer means no BufferedFileAppender + assertThat(appender).isNotInstanceOf(BufferedFileAppender.class); + } + } + + @Test + public void testFormatModelVariantShreddingRoundTrip() throws IOException { + Schema variantSchema = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "v", Types.VariantType.get())); + + VariantShreddingAnalyzer analyzer = + new VariantShreddingAnalyzer() { + @Override + protected List extractVariantValues(List rows, int idx) { + List values = 
Lists.newArrayList(); + for (Record row : rows) { + Object obj = row.get(idx); + if (obj instanceof Variant) { + values.add(((Variant) obj).value()); + } + } + return values; + } + + @Override + protected int resolveColumnIndex(Void engineSchema, String columnName) { + // GenericRecord uses schema column order + return variantSchema.columns().indexOf(variantSchema.findField(columnName)); + } + }; + + ByteBuffer metadataBuffer = VariantTestUtil.createMetadata(ImmutableList.of("a", "b"), true); + VariantMetadata metadata = Variants.metadata(metadataBuffer); + ByteBuffer objectBuffer = + VariantTestUtil.createObject( + metadataBuffer, + ImmutableMap.of( + "a", Variants.of(42), + "b", Variants.of("hello"))); + Variant variant = Variant.of(metadata, Variants.value(metadata, objectBuffer)); + + GenericRecord record = GenericRecord.create(variantSchema); + List variantRecords = + ImmutableList.of( + record.copy(ImmutableMap.of("id", 1L, "v", variant)), + record.copy(ImmutableMap.of("id", 2L, "v", variant)), + record.copy(ImmutableMap.of("id", 3L, "v", variant))); + + OutputFile outputFile = Files.localOutput(createTempFile(temp)); + + ParquetFormatModel> model = + ParquetFormatModel.create( + Record.class, + Void.class, + (icebergSchema, messageType, engineSchema) -> + GenericParquetWriter.create(icebergSchema, messageType), + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + GenericParquetReaders.buildReader(icebergSchema, fileSchema), + analyzer, + record1 -> record1); + + try (FileAppender appender = + model + .writeBuilder(EncryptedFiles.plainAsEncryptedOutput(outputFile)) + .schema(variantSchema) + .setAll( + ImmutableMap.of( + TableProperties.PARQUET_SHRED_VARIANTS, "true", + TableProperties.PARQUET_VARIANT_BUFFER_SIZE, "2")) + .content(FileContent.DATA) + .build()) { + assertThat(appender).isInstanceOf(BufferedFileAppender.class); + for (Record rec : variantRecords) { + appender.add(rec); + } + } + + // Verify shredded Parquet schema + try 
(ParquetFileReader reader = + ParquetFileReader.open(ParquetIO.file(outputFile.toInputFile()))) { + MessageType parquetSchema = reader.getFooter().getFileMetaData().getSchema(); + GroupType variantGroup = parquetSchema.getType("v").asGroupType(); + assertThat(variantGroup.containsField("metadata")).isTrue(); + assertThat(variantGroup.containsField("value")).isTrue(); + assertThat(variantGroup.containsField("typed_value")).isTrue(); + + GroupType typedValue = variantGroup.getType("typed_value").asGroupType(); + assertThat(typedValue.containsField("a")).isTrue(); + assertThat(typedValue.containsField("b")).isTrue(); + } + + // Verify data is in typed columns by reading raw Parquet groups + try (ParquetReader rawReader = + ParquetReader.builder( + new GroupReadSupport(), new org.apache.hadoop.fs.Path(outputFile.location())) + .build()) { + Group row = rawReader.read(); + Group variantData = row.getGroup("v", 0); + + assertThat(variantData.getFieldRepetitionCount("value")) + .as("value should be absent when fully shredded") + .isEqualTo(0); + + Group typedValue = variantData.getGroup("typed_value", 0); + assertThat(typedValue.getGroup("a", 0).getInteger("typed_value", 0)) + .as("typed_value.a should contain 42") + .isEqualTo(42); + assertThat(typedValue.getGroup("b", 0).getString("typed_value", 0)) + .as("typed_value.b should contain hello") + .isEqualTo("hello"); + } + + // Verify data round-trips + List writtenRecords; + try (CloseableIterable reader = + Parquet.read(outputFile.toInputFile()) + .project(variantSchema) + .createReaderFunc( + fileSchema -> GenericParquetReaders.buildReader(variantSchema, fileSchema)) + .build()) { + writtenRecords = Lists.newArrayList(reader); + } + + assertThat(writtenRecords).hasSameSizeAs(variantRecords); + for (int i = 0; i < variantRecords.size(); i++) { + InternalTestHelpers.assertEquals( + variantSchema.asStruct(), variantRecords.get(i), writtenRecords.get(i)); + } + } } diff --git 
a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetPageVersion.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetPageVersion.java new file mode 100644 index 000000000000..3be1dce4d9ea --- /dev/null +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetPageVersion.java @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.parquet; + +import static org.apache.iceberg.parquet.ParquetWritingTestUtils.createTempFile; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import org.apache.iceberg.Files; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.parquet.GenericParquetWriter; +import org.apache.iceberg.deletes.EqualityDeleteWriter; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Types; +import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.column.page.DataPage; +import org.apache.parquet.column.page.DataPageV1; +import org.apache.parquet.column.page.DataPageV2; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +class TestParquetPageVersion { + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); + + private List records; + + @TempDir private Path temp; + + @BeforeEach + void createRecords() { + GenericRecord record = GenericRecord.create(SCHEMA); + + this.records = + ImmutableList.of( + record.copy(ImmutableMap.of("id", 1L, "data", "a")), + record.copy(ImmutableMap.of("id", 2L, "data", "b")), + record.copy(ImmutableMap.of("id", 3L, "data", "c")), + 
record.copy(ImmutableMap.of("id", 4L, "data", "d")), + record.copy(ImmutableMap.of("id", 5L, "data", "e"))); + } + + @Test + void testWriterDefaultsToPageVersion1() throws IOException { + OutputFile outputFile = newOutputFile(); + + try (FileAppender writer = + Parquet.write(outputFile) + .schema(SCHEMA) + .createWriterFunc(GenericParquetWriter::create) + .build()) { + writer.addAll(records); + } + + assertThat(firstDataPage(outputFile)).isInstanceOf(DataPageV1.class); + } + + @Test + void testWriterUsesConfiguredPageVersion() throws IOException { + OutputFile outputFile = newOutputFile(); + + try (FileAppender writer = + Parquet.write(outputFile) + .schema(SCHEMA) + .set(TableProperties.PARQUET_PAGE_VERSION, "v2") + .createWriterFunc(GenericParquetWriter::create) + .build()) { + writer.addAll(records); + } + + assertThat(firstDataPage(outputFile)).isInstanceOf(DataPageV2.class); + } + + @Test + void testDeleteWriterUsesConfiguredPageVersion() throws IOException { + OutputFile outputFile = newOutputFile(); + + EqualityDeleteWriter deleteWriter = + Parquet.writeDeletes(outputFile) + .createWriterFunc(GenericParquetWriter::create) + .set(TableProperties.PARQUET_PAGE_VERSION, "v2") + .overwrite() + .rowSchema(SCHEMA) + .withSpec(PartitionSpec.unpartitioned()) + .equalityFieldIds(1) + .buildEqualityWriter(); + + try (EqualityDeleteWriter writer = deleteWriter) { + writer.write(records); + } + + assertThat(firstDataPage(outputFile)).isInstanceOf(DataPageV2.class); + } + + @Test + void testDeleteWriterUsesDeleteSpecificPageVersion() throws IOException { + OutputFile outputFile = newOutputFile(); + + EqualityDeleteWriter deleteWriter = + Parquet.writeDeletes(outputFile) + .createWriterFunc(GenericParquetWriter::create) + .set(TableProperties.PARQUET_PAGE_VERSION, "v1") + .set(TableProperties.DELETE_PARQUET_PAGE_VERSION, "v2") + .overwrite() + .rowSchema(SCHEMA) + .withSpec(PartitionSpec.unpartitioned()) + .equalityFieldIds(1) + .buildEqualityWriter(); + + try 
(EqualityDeleteWriter writer = deleteWriter) { + writer.write(records); + } + + assertThat(firstDataPage(outputFile)).isInstanceOf(DataPageV2.class); + } + + @Test + void testExplicitWriterVersion2OverridesPageVersionProperty() throws IOException { + OutputFile outputFile = newOutputFile(); + + try (FileAppender writer = + Parquet.write(outputFile) + .schema(SCHEMA) + .set(TableProperties.PARQUET_PAGE_VERSION, "v1") + .writerVersion(WriterVersion.PARQUET_2_0) + .createWriterFunc(GenericParquetWriter::create) + .build()) { + writer.addAll(records); + } + + assertThat(firstDataPage(outputFile)).isInstanceOf(DataPageV2.class); + } + + @Test + void testExplicitWriterVersion1OverridesPageVersionProperty() throws IOException { + OutputFile outputFile = newOutputFile(); + + try (FileAppender writer = + Parquet.write(outputFile) + .schema(SCHEMA) + .set(TableProperties.PARQUET_PAGE_VERSION, "v2") + .writerVersion(WriterVersion.PARQUET_1_0) + .createWriterFunc(GenericParquetWriter::create) + .build()) { + writer.addAll(records); + } + + assertThat(firstDataPage(outputFile)).isInstanceOf(DataPageV1.class); + } + + @Test + void testPageVersionPropertyAfterWriterVersionSetsVersion() throws IOException { + OutputFile outputFile = newOutputFile(); + + try (FileAppender writer = + Parquet.write(outputFile) + .schema(SCHEMA) + .writerVersion(WriterVersion.PARQUET_1_0) + .set(TableProperties.PARQUET_PAGE_VERSION, "v2") + .createWriterFunc(GenericParquetWriter::create) + .build()) { + writer.addAll(records); + } + + assertThat(firstDataPage(outputFile)).isInstanceOf(DataPageV2.class); + } + + @Test + void testInvalidPageVersionFails() throws IOException { + OutputFile outputFile = newOutputFile(); + + assertThatThrownBy( + () -> + Parquet.write(outputFile) + .schema(SCHEMA) + .set(TableProperties.PARQUET_PAGE_VERSION, "3") + .createWriterFunc(GenericParquetWriter::create) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Unsupported Parquet page version: 3 
(must be v1 or v2)"); + } + + @Test + void testInvalidDeletePageVersionFails() throws IOException { + OutputFile outputFile = newOutputFile(); + + assertThatThrownBy( + () -> + Parquet.writeDeletes(outputFile) + .createWriterFunc(GenericParquetWriter::create) + .set(TableProperties.DELETE_PARQUET_PAGE_VERSION, "3") + .overwrite() + .rowSchema(SCHEMA) + .withSpec(PartitionSpec.unpartitioned()) + .equalityFieldIds(1) + .buildEqualityWriter()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Unsupported Parquet page version: 3 (must be v1 or v2)"); + } + + private OutputFile newOutputFile() throws IOException { + return Files.localOutput(createTempFile(temp)); + } + + private DataPage firstDataPage(OutputFile outputFile) throws IOException { + try (ParquetFileReader reader = + ParquetFileReader.open(ParquetIO.file(outputFile.toInputFile()))) { + PageReadStore rowGroup = reader.readNextRowGroup(); + assertThat(rowGroup).isNotNull(); + + DataPage dataPage = + rowGroup + .getPageReader( + reader.getFileMetaData().getSchema().getColumnDescription(new String[] {"id"})) + .readPage(); + assertThat(dataPage).isNotNull(); + return dataPage; + } + } +} diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestVariantShreddingAnalyzer.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestVariantShreddingAnalyzer.java new file mode 100644 index 000000000000..5ac10f74cc51 --- /dev/null +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestVariantShreddingAnalyzer.java @@ -0,0 +1,475 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.parquet; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.math.BigDecimal; +import java.util.List; +import java.util.Locale; +import java.util.function.Function; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.variants.ShreddedObject; +import org.apache.iceberg.variants.ValueArray; +import org.apache.iceberg.variants.VariantMetadata; +import org.apache.iceberg.variants.VariantValue; +import org.apache.iceberg.variants.Variants; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.junit.jupiter.api.Test; + +public class TestVariantShreddingAnalyzer { + + private static class DirectAnalyzer extends VariantShreddingAnalyzer { + @Override + protected List extractVariantValues(List rows, int idx) { + return rows; + } + + @Override + protected int resolveColumnIndex(Void engineSchema, String columnName) { + throw new UnsupportedOperationException("Not used in direct tests"); + } + } + + @Test + public void testDepthLimitStopsObjectRecursion() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + // Each level has {"a": , "x": 1} so objects always have a shreddable primitive + VariantMetadata meta = Variants.metadata("a", "x"); + ShreddedObject innermost = Variants.object(meta); + innermost.put("a", Variants.of(42)); + innermost.put("x", Variants.of(1)); + + for (int i = 0; i < 54; i++) { + 
ShreddedObject wrapper = Variants.object(meta); + wrapper.put("a", innermost); + wrapper.put("x", Variants.of(1)); + innermost = wrapper; + } + + Type schema = analyzer.analyzeAndCreateSchema(List.of(innermost), 0); + assertThat(schema).isNotNull(); + assertThat(schema.getName()).isEqualTo("typed_value"); + + int shreddedDepth = countObjectDepth(schema); + assertThat(shreddedDepth).isLessThanOrEqualTo(50).isGreaterThan(0); + } + + @Test + public void testDepthLimitStopsArrayRecursion() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + // 55-level nested arrays with a primitive only at the very bottom. + // Depth limit (50) prevents reaching the leaf, so schema is null (graceful degradation). + VariantValue innermost = Variants.of(42); + for (int i = 0; i < 55; i++) { + ValueArray wrapper = Variants.array(); + wrapper.add(innermost); + innermost = wrapper; + } + + Type schema = analyzer.analyzeAndCreateSchema(List.of(innermost), 0); + assertThat(schema).isNull(); + } + + @Test + public void testArrayWithinDepthLimit() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + // 5-level nested arrays + VariantValue innermost = Variants.of(42); + for (int i = 0; i < 5; i++) { + ValueArray wrapper = Variants.array(); + wrapper.add(innermost); + innermost = wrapper; + } + + Type schema = analyzer.analyzeAndCreateSchema(List.of(innermost), 0); + assertThat(schema).isNotNull(); + assertThat(schema.getName()).isEqualTo("typed_value"); + + int arrayDepth = countArrayDepth(schema); + assertThat(arrayDepth).isEqualTo(5); + } + + @Test + public void testIntermediateFieldCapLimitsTrackedFields() { + int numFields = 1500; + String[] fieldNames = new String[numFields]; + for (int i = 0; i < numFields; i++) { + fieldNames[i] = String.format(Locale.ROOT, "field_%04d", i); + } + + VariantMetadata meta = Variants.metadata(fieldNames); + ShreddedObject obj = Variants.object(meta); + for (String name : fieldNames) { + obj.put(name, Variants.of(42)); + } + + DirectAnalyzer analyzer 
= new DirectAnalyzer(); + Type schema = analyzer.analyzeAndCreateSchema(List.of(obj), 0); + + assertThat(schema).isNotNull(); + assertThat(schema).isInstanceOf(GroupType.class); + GroupType typedValue = (GroupType) schema; + assertThat(typedValue.getFieldCount()).isLessThanOrEqualTo(300).isGreaterThan(0); + } + + @Test + public void testFieldCapAllowsExistingFieldUpdates() { + int numFields = 1500; + String[] fieldNames = new String[numFields]; + for (int i = 0; i < numFields; i++) { + fieldNames[i] = String.format(Locale.ROOT, "field_%04d", i); + } + + VariantMetadata meta = Variants.metadata(fieldNames); + + ShreddedObject row1 = Variants.object(meta); + for (String name : fieldNames) { + row1.put(name, Variants.of(42)); + } + + ShreddedObject row2 = Variants.object(meta); + for (int i = 0; i < 10; i++) { + row2.put(fieldNames[i], Variants.of("text")); + } + + ShreddedObject row3 = Variants.object(meta); + for (int i = 0; i < 10; i++) { + row3.put(fieldNames[i], Variants.of(99)); + } + + DirectAnalyzer analyzer = new DirectAnalyzer(); + Type schema = analyzer.analyzeAndCreateSchema(List.of(row1, row2, row3), 0); + + assertThat(schema).isNotNull(); + assertThat(schema).isInstanceOf(GroupType.class); + GroupType typedValue = (GroupType) schema; + assertThat(typedValue.getFieldCount()).isGreaterThan(0).isLessThanOrEqualTo(300); + } + + @Test + public void testNestedObjectsWithinDepthLimit() { + VariantMetadata cityMeta = Variants.metadata("city"); + ShreddedObject city = Variants.object(cityMeta); + city.put("city", Variants.of("NYC")); + + VariantMetadata addrMeta = Variants.metadata("address"); + ShreddedObject addr = Variants.object(addrMeta); + addr.put("address", city); + + VariantMetadata rootMeta = Variants.metadata("user"); + ShreddedObject root = Variants.object(rootMeta); + root.put("user", addr); + + DirectAnalyzer analyzer = new DirectAnalyzer(); + Type schema = analyzer.analyzeAndCreateSchema(List.of(root), 0); + + assertThat(schema).isNotNull(); + 
GroupType rootTv = schema.asGroupType(); + assertThat(rootTv.getName()).isEqualTo("typed_value"); + + // user -> typed_value -> address -> typed_value -> city -> typed_value (STRING) + GroupType userGroup = rootTv.getType("user").asGroupType(); + assertThat(userGroup.containsField("value")).isTrue(); + assertThat(userGroup.containsField("typed_value")).isTrue(); + + GroupType addrTv = userGroup.getType("typed_value").asGroupType(); + GroupType addrGroup = addrTv.getType("address").asGroupType(); + assertThat(addrGroup.containsField("typed_value")).isTrue(); + + GroupType cityTv = addrGroup.getType("typed_value").asGroupType(); + GroupType cityGroup = cityTv.getType("city").asGroupType(); + assertThat(cityGroup.containsField("typed_value")).isTrue(); + + PrimitiveType cityPrimitive = cityGroup.getType("typed_value").asPrimitiveType(); + assertThat(cityPrimitive.getPrimitiveTypeName()) + .isEqualTo(PrimitiveType.PrimitiveTypeName.BINARY); + assertThat(cityPrimitive.getLogicalTypeAnnotation()) + .isEqualTo(LogicalTypeAnnotation.stringType()); + } + + @Test + public void testDecimalForExceedingPrecision() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + // Value 1: 30 integer digits, 0 fractional -> precision=30, scale=0, intDigits=30 + // Value 2: 1 integer digit, 20 fractional -> precision=21, scale=20, intDigits=1 + // Combined: maxIntDigits=30, maxScale=20, raw sum=50 -> capped to precision=38, + // scale=min(20, 38-30)=8 (integer digits get priority) + VariantMetadata meta = Variants.metadata("val"); + ShreddedObject row1 = Variants.object(meta); + row1.put("val", Variants.of(new BigDecimal("123456789012345678901234567890"))); + + ShreddedObject row2 = Variants.object(meta); + row2.put("val", Variants.of(new BigDecimal("1.23456789012345678901"))); + + Type schema = analyzer.analyzeAndCreateSchema(List.of(row1, row2), 0); + assertThat(schema).isNotNull(); + + GroupType typedValue = schema.asGroupType(); + GroupType valGroup = 
typedValue.getType("val").asGroupType(); + PrimitiveType valPrimitive = valGroup.getType("typed_value").asPrimitiveType(); + + LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimal = + (LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) + valPrimitive.getLogicalTypeAnnotation(); + assertThat(decimal).isNotNull(); + assertThat(decimal.getPrecision()).isEqualTo(38); + // With 30 integer digits, scale is capped to 38 - 30 = 8 (integer digits get priority) + assertThat(decimal.getScale()).isEqualTo(8); + assertThat(decimal.getScale()).isLessThanOrEqualTo(decimal.getPrecision()); + + // Physical type should be FIXED_LEN_BYTE_ARRAY since precision > 18 + assertThat(valPrimitive.getPrimitiveTypeName()) + .isEqualTo(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY); + } + + @Test + public void testDecimalForExactPrecision() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + // Value with exactly precision=38: 20 integer digits + 18 scale = 38 + VariantMetadata meta = Variants.metadata("val"); + ShreddedObject row = Variants.object(meta); + row.put("val", Variants.of(new BigDecimal("12345678901234567890.123456789012345678"))); + + Type schema = analyzer.analyzeAndCreateSchema(List.of(row), 0); + assertThat(schema).isNotNull(); + + GroupType typedValue = schema.asGroupType(); + GroupType valGroup = typedValue.getType("val").asGroupType(); + PrimitiveType valPrimitive = valGroup.getType("typed_value").asPrimitiveType(); + + LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimal = + (LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) + valPrimitive.getLogicalTypeAnnotation(); + assertThat(decimal.getPrecision()).isEqualTo(38); + assertThat(decimal.getScale()).isEqualTo(18); + } + + @Test + public void testInfrequentFieldsArePruned() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + // 100 rows: "common" in all, "rare" in only 5 (below MIN_FIELD_FREQUENCY = 0.10) + List rows = buildPruningTestRows(5, obj -> obj); + + Type schema = 
analyzer.analyzeAndCreateSchema(rows, 0); + assertThat(schema).isNotNull(); + + GroupType group = schema.asGroupType(); + assertThat(group.containsField("common")).isTrue(); + assertThat(group.containsField("rare")).isFalse(); + } + + @Test + public void testEmptyArrayReturnsNull() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + // All rows are empty arrays, no element type to infer + List rows = List.of(Variants.array(), Variants.array(), Variants.array()); + + Type schema = analyzer.analyzeAndCreateSchema(rows, 0); + assertThat(schema).isNull(); + } + + @Test + public void testRootPrimitiveProducesTypedValue() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + // root type is primitive + List rows = List.of(Variants.of("hello"), Variants.of("world"), Variants.of("x")); + + Type schema = analyzer.analyzeAndCreateSchema(rows, 0); + assertThat(schema).isNotNull(); + assertThat(schema.getName()).isEqualTo("typed_value"); + assertThat(schema.isPrimitive()).isTrue(); + assertThat(schema.asPrimitiveType().getLogicalTypeAnnotation()) + .isEqualTo(LogicalTypeAnnotation.stringType()); + } + + @Test + public void testRootArrayOfObjectsPrunesInfrequentFields() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + // 100 arrays: "common" in all, "rare" in only 3 (below MIN_FIELD_FREQUENCY = 0.10) + List rows = + buildPruningTestRows( + 3, + obj -> { + ValueArray arr = Variants.array(); + arr.add(obj); + return arr; + }); + + Type schema = analyzer.analyzeAndCreateSchema(rows, 0); + assertThat(schema).isNotNull(); + + GroupType listType = schema.asGroupType(); + assertThat(listType.getLogicalTypeAnnotation()) + .isInstanceOf(LogicalTypeAnnotation.ListLogicalTypeAnnotation.class); + GroupType repeatedGroup = listType.getType(0).asGroupType(); + GroupType elementGroup = repeatedGroup.getType(0).asGroupType(); + assertThat(elementGroup.containsField("typed_value")).isTrue(); + GroupType objectFields = elementGroup.getType("typed_value").asGroupType(); + 
assertThat(objectFields.containsField("common")).isTrue(); + assertThat(objectFields.containsField("rare")).isFalse(); + } + + @Test + public void testObjectWithArrayChildPrunesNestedFields() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + VariantMetadata itemMeta = Variants.metadata("name", "rare"); + VariantMetadata rootMeta = Variants.metadata("items"); + + // 100 rows, "rare" appears in only 3 rows (below MIN_FIELD_FREQUENCY = 0.10) + List rows = Lists.newArrayList(); + for (int i = 0; i < 100; i++) { + ShreddedObject item = Variants.object(itemMeta); + item.put("name", Variants.of("item_" + i)); + if (i < 3) { + item.put("rare", Variants.of(1)); + } + ValueArray arr = Variants.array(); + arr.add(item); + ShreddedObject root = Variants.object(rootMeta); + root.put("items", arr); + rows.add(root); + } + + Type schema = analyzer.analyzeAndCreateSchema(rows, 0); + assertThat(schema).isNotNull(); + + GroupType rootTv = schema.asGroupType(); + GroupType itemsGroup = rootTv.getType("items").asGroupType(); + assertThat(itemsGroup.containsField("typed_value")).isTrue(); + GroupType listType = itemsGroup.getType("typed_value").asGroupType(); + GroupType repeatedGroup = listType.getType(0).asGroupType(); + GroupType elementGroup = repeatedGroup.getType(0).asGroupType(); + assertThat(elementGroup.containsField("typed_value")).isTrue(); + GroupType elementFields = elementGroup.getType("typed_value").asGroupType(); + assertThat(elementFields.containsField("name")).isTrue(); + assertThat(elementFields.containsField("rare")).isFalse(); + } + + @Test + public void testLongArrayInFewRowsSurvivesPruning() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + VariantMetadata itemMeta = Variants.metadata("key"); + + // 2 of 100 rows have 500-element arrays with {"key": N}. Per-element counting gives + // observationCount=1000, so key survives the 10% pruning threshold. 
+ List rows = Lists.newArrayList(); + for (int i = 0; i < 100; i++) { + ValueArray arr = Variants.array(); + if (i < 2) { + for (int j = 0; j < 500; j++) { + ShreddedObject item = Variants.object(itemMeta); + item.put("key", Variants.of(j)); + arr.add(item); + } + } else { + arr.add(Variants.of("no_key")); + } + rows.add(arr); + } + + Type schema = analyzer.analyzeAndCreateSchema(rows, 0); + assertThat(schema).isNotNull(); + + GroupType listType = schema.asGroupType(); + GroupType repeatedGroup = listType.getType(0).asGroupType(); + GroupType elementGroup = repeatedGroup.getType(0).asGroupType(); + assertThat(elementGroup.containsField("typed_value")).isTrue(); + GroupType elementFields = elementGroup.getType("typed_value").asGroupType(); + assertThat(elementFields.containsField("key")).isTrue(); + } + + /** + * Builds 100 variant rows where "common" appears in every row and "rare" appears in only {@code + * rareCount} rows (below MIN_FIELD_FREQUENCY = 0.10 when rareCount < 10). + */ + private static List buildPruningTestRows( + int rareCount, Function wrap) { + VariantMetadata meta = Variants.metadata("common", "rare"); + List rows = Lists.newArrayList(); + for (int i = 0; i < 100; i++) { + ShreddedObject obj = Variants.object(meta); + obj.put("common", Variants.of(i)); + if (i < rareCount) { + obj.put("rare", Variants.of("text")); + } + rows.add(wrap.apply(obj)); + } + return rows; + } + + /** Count typed_value group nesting depth along field "a". 
*/ + private static int countObjectDepth(Type type) { + int depth = 0; + Type current = type; + while (current != null && "typed_value".equals(current.getName()) && !current.isPrimitive()) { + depth++; + GroupType group = current.asGroupType(); + if (group.containsField("a")) { + GroupType fieldGroup = group.getType("a").asGroupType(); + if (fieldGroup.containsField("typed_value")) { + current = fieldGroup.getType("typed_value"); + } else { + break; + } + } else { + break; + } + } + return depth; + } + + /** Count nested array (LIST) levels in the schema. */ + private static int countArrayDepth(Type type) { + int depth = 0; + Type current = type; + while (current != null && !current.isPrimitive()) { + if (!"typed_value".equals(current.getName())) { + break; + } + GroupType group = current.asGroupType(); + if (!(group.getLogicalTypeAnnotation() + instanceof LogicalTypeAnnotation.ListLogicalTypeAnnotation)) { + break; + } + depth++; + GroupType listGroup = group.getType(0).asGroupType(); + GroupType elementGroup = listGroup.getType(0).asGroupType(); + if (elementGroup.containsField("typed_value")) { + current = elementGroup.getType("typed_value"); + } else { + break; + } + } + return depth; + } +} diff --git a/parquet/src/testFixtures/java/org/apache/iceberg/parquet/ParquetFileTestUtils.java b/parquet/src/testFixtures/java/org/apache/iceberg/parquet/ParquetFileTestUtils.java new file mode 100644 index 000000000000..a6055424c0a6 --- /dev/null +++ b/parquet/src/testFixtures/java/org/apache/iceberg/parquet/ParquetFileTestUtils.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.parquet; + +import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.OutputFile; + +/** Test utilities that convert Iceberg input and output files to their Parquet equivalents. */ +public class ParquetFileTestUtils { + + private ParquetFileTestUtils() {} + + public static OutputFile file(org.apache.iceberg.io.OutputFile file) { + return ParquetIO.file(file); + } + + public static InputFile file(org.apache.iceberg.io.InputFile file) { + return ParquetIO.file(file); + } +} diff --git a/runtime-deps.gradle b/runtime-deps.gradle new file mode 100644 index 000000000000..e08fce3fe47a --- /dev/null +++ b/runtime-deps.gradle @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Guards the runtime dependency surface for shadow JAR modules. 
+// +// Prevents accidental transitive dependency growth in shipped shadow JARs. +// Without this guard, adding a single catalog module as 'implementation' +// instead of 'compileOnly' can silently leak dozens of transitive artifacts +// into the runtime JAR, inflating its size and introducing unlicensed code. +// +// Apply this script in any project that ships a bundled artifact: Spark and +// Flink runtime shadow JARs, cloud bundles (aws, azure, gcp), and Kafka +// Connect runtime distribution. +// +// It adds two tasks: +// +// generateRuntimeDeps - resolves runtimeClasspath and writes a sorted +// baseline of group:artifact:version coordinates +// to runtime-deps.txt in the project directory. +// +// checkRuntimeDeps - compares the resolved dependencies against the +// checked-in baseline and fails with a diff if +// they don't match. Patch-level version changes are +// ignored so that routine Dependabot bumps don't +// require a baseline update. Wired into the 'check' +// lifecycle. +// +// Workflow: +// 1. ./gradlew check -- fails if deps changed +// 2. ./gradlew generateRuntimeDeps -- auto-updates all baselines +// 3. Update LICENSE and NOTICE if dependency licenses changed -- This is a Manual Step +// 4. 
Commit + +def depsFile = file("${projectDir}/runtime-deps.txt") + +def resolveRuntimeDeps = { + configurations.runtimeClasspath.resolvedConfiguration + .resolvedArtifacts + .collect { "${it.moduleVersion.id.group}:${it.moduleVersion.id.name}:${it.moduleVersion.id.version}" } + .findAll { !it.startsWith('org.apache.iceberg:') } + .toSorted() + .toUnique() +} + +tasks.register('generateRuntimeDeps') { + group = 'verification' + description = 'Regenerate the runtime dependency baseline after intentional dependency changes' + outputs.file(depsFile) + doLast { + def deps = resolveRuntimeDeps() + depsFile.text = deps.join('\n') + '\n' + logger.lifecycle("Wrote ${deps.size()} dependencies to ${depsFile}") + logger.lifecycle("Review the diff, then update LICENSE and NOTICE if licenses changed.") + } +} + +tasks.register('checkRuntimeDeps') { + group = 'verification' + description = 'Verify runtime dependencies match the checked-in baseline' + inputs.files(configurations.runtimeClasspath) + outputs.file(depsFile) + doLast { + if (!depsFile.exists()) { + logger.warn("WARNING: Missing ${depsFile.name} in ${projectDir}. " + + "Run: ./gradlew ${project.path}:generateRuntimeDeps") + return + } + + def actual = resolveRuntimeDeps() + def expected = depsFile.readLines().findAll { it.trim() }.toSorted() + + def groupArtifact = { coord -> coord.substring(0, coord.lastIndexOf(':')) } + def majorMinor = { coord -> + def ver = coord.substring(coord.lastIndexOf(':') + 1) + def parts = ver.split('\\.') + parts.length >= 2 ? 
"${parts[0]}.${parts[1]}" : ver + } + + def actualByModule = actual.collectEntries { [(groupArtifact(it)): it] } + def expectedByModule = expected.collectEntries { [(groupArtifact(it)): it] } + + def added = actualByModule.keySet() - expectedByModule.keySet() + def removed = expectedByModule.keySet() - actualByModule.keySet() + def shared = actualByModule.keySet().intersect(expectedByModule.keySet()) + def versionChanged = shared.findAll { + majorMinor(actualByModule[it]) != majorMinor(expectedByModule[it]) + } + + if (added || removed || versionChanged) { + def msg = new StringBuilder() + msg.append("Runtime dependency baseline mismatch for ${project.name}!\n") + if (versionChanged) { + msg.append("\n Version changed (${versionChanged.size()}):\n") + versionChanged.toSorted().each { module -> + msg.append(" ~ ${expectedByModule[module]} -> ${actualByModule[module]}\n") + } + } + if (added) { + msg.append("\n Added (${added.size()}):\n") + added.toSorted().each { module -> msg.append(" + ${actualByModule[module]}\n") } + } + if (removed) { + msg.append("\n Removed (${removed.size()}):\n") + removed.toSorted().each { module -> msg.append(" - ${expectedByModule[module]}\n") } + } + msg.append("\nTo update the baseline run:\n") + msg.append(" ./gradlew ${project.path}:generateRuntimeDeps\n") + msg.append("\nThen update LICENSE and NOTICE to reflect the dependency changes.") + throw new GradleException(msg.toString()) + } + } +} + +check.dependsOn checkRuntimeDeps diff --git a/site/docs/assets/stylesheets/home.css b/site/docs/assets/stylesheets/home.css index 1c45c0e72025..98166fdfc0d2 100644 --- a/site/docs/assets/stylesheets/home.css +++ b/site/docs/assets/stylesheets/home.css @@ -30,42 +30,6 @@ text-align: center; /* Center text horizontally */ } -/* Summit Box Styles */ -.summit-box { - margin: 20px auto 15px auto; - padding: 10px 15px 15px 15px; - max-width: 600px; - background: rgba(255, 255, 255, 0.05); - border-radius: 12px; - border: 2px solid rgba(255, 255, 
255, 0.2); - box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1); - backdrop-filter: blur(10px); -} - -.summit-box h4 { - text-align: center; - font-weight: 600; - font-size: 20px; - margin-top: 0; - margin-bottom: 10px; -} - -.summit-link-item { - transition: transform 0.3s ease, box-shadow 0.3s ease; - box-shadow: 0 3px 8px rgba(0, 0, 0, 0.2); -} - -.summit-link-item:hover { - transform: translateY(-3px); - box-shadow: 0 6px 15px rgba(0, 0, 0, 0.3); -} - -.summit-link-item a { - display: block; - width: 100%; - height: 100%; -} - /* Media query for smaller screens */ @media (max-width: 767px) { .col-6 { @@ -78,18 +42,4 @@ div#termynal-expressive-sql { left: 0; } - - .summit-box { - max-width: 100%; - margin: 15px 10px; - } - - .summit-links { - flex-direction: column !important; - } - - .summit-link-item { - min-width: 100% !important; - max-width: 100% !important; - } } diff --git a/site/docs/hive-quickstart.md b/site/docs/hive-quickstart.md index 988664d9007a..fbf9dec53689 100644 --- a/site/docs/hive-quickstart.md +++ b/site/docs/hive-quickstart.md @@ -36,13 +36,12 @@ Take a look at the Tags tab in [Apache Hive docker images](https://hub.docker.co Set the version variable. ```sh -export HIVE_VERSION=4.0.0 +export HIVE_VERSION=4.2.0 ``` -To accommodate both Intel-based (x86_64) and Apple Silicon (M1, M2, M3) Macs when running your Docker container, you can use the --platform flag to specify the desired architecture. Apple Silicon Macs use the arm64 architecture, while Intel Macs use the amd64 architecture. 
-Start the container, using the option `--platform linux/arm64` for a Mac with an M-Series chip: +Start the HiveServer2 container: ```sh -docker run -d --platform linux/arm64 -p 10000:10000 -p 10002:10002 --env SERVICE_NAME=hiveserver2 --name hive4 apache/hive:${HIVE_VERSION} +docker run -d -p 10000:10000 -p 10002:10002 --env SERVICE_NAME=hiveserver2 --name hive4 apache/hive:${HIVE_VERSION} ``` The docker run command above configures Hive to use the embedded derby database for Hive Metastore. Hive Metastore functions as the Iceberg catalog to locate Iceberg files, which can be anywhere. @@ -106,8 +105,8 @@ SELECT * FROM nyc.taxis; #### Adding Iceberg to Hive -If you already have a Hive 4.0.0 or later environment, it comes with the Iceberg 1.4.3 included. No additional downloads or jars are needed. If you have a Hive 2.3.x or Hive 3.1.x environment see [Enabling Iceberg support in Hive](docs/latest/hive.md#hive-23x-hive-31x). +If you already have a Hive 4.0.0 or later environment, it comes with Iceberg included. No additional downloads or jars are needed. If you have a Hive 2.3.x or Hive 3.1.x environment, see [Enabling Iceberg support in Hive](docs/latest/hive.md#hive-23x-hive-31x). #### Learn More -To learn more about setting up a database other than Derby, see [Apache Hive Quick Start](https://hive.apache.org/developement/quickstart/). You can also [set up a standalone metastore, HS2 and Postgres](https://github.com/apache/hive/blob/master/packaging/src/docker/docker-compose.yml). Now that you're up and running with Iceberg and Hive, check out the [Iceberg-Hive docs](docs/latest/hive.md) to learn more! +To learn more about setting up a database other than Derby, see [Apache Hive Quick Start](https://hive.apache.org/developement/quickstart/). 
You can also [set up a standalone metastore, HS2 and Postgres](https://github.com/apache/hive/blob/master/packaging/src/docker/docker-compose.yml) or [use Hive Metastore as Iceberg REST Catalog](https://hive.apache.org/docs/latest/admin/iceberg-rest-catalog/). Now that you're up and running with Iceberg and Hive, check out the [Iceberg-Hive docs](docs/latest/hive.md) to learn more! diff --git a/site/docs/releases.md b/site/docs/releases.md index d3a5ed9a4904..bc8957942fc2 100644 --- a/site/docs/releases.md +++ b/site/docs/releases.md @@ -45,7 +45,7 @@ To add a dependency on Iceberg in Gradle, add the following to `build.gradle`: ``` dependencies { - compile 'org.apache.iceberg:iceberg-core:{{ icebergVersion }}' + implementation 'org.apache.iceberg:iceberg-core:{{ icebergVersion }}' } ``` diff --git a/site/docs/status.md b/site/docs/status.md index 22527a6751cf..51d2f7271561 100644 --- a/site/docs/status.md +++ b/site/docs/status.md @@ -49,11 +49,14 @@ This section lists the libraries that implement the Apache Iceberg specification | timestamptz | Y | Y | Y | Y | Y | | timestamp_ns | Y | Y | Y | Y | N | | timestamptz_ns | Y | Y | Y | Y | N | +| unknown | Y | Y | N | Y | N | | string | Y | Y | Y | Y | Y | | uuid | Y | Y | Y | Y | N | | fixed | Y | Y | Y | Y | Y | | binary | Y | Y | Y | Y | Y | | variant | Y | Y | Y | Y | N | +| geometry | Y | N | N | N | N | +| geography | Y | N | N | N | N | | list | Y | Y | Y | Y | Y | | map | Y | Y | Y | Y | Y | | struct | Y | Y | Y | Y | Y | @@ -83,29 +86,29 @@ This section lists the libraries that implement the Apache Iceberg specification | Operation | Java | PyIceberg | Rust | Go | C++ | |-----------------------------|------|-----------|------|----|-----| -| Update schema | Y | Y | Y | N | Y | -| Update partition spec | Y | Y | Y | N | Y | +| Update schema | Y | Y | Y | Y | Y | +| Update partition spec | Y | Y | Y | Y | Y | | Update table properties | Y | Y | Y | Y | Y | -| Replace sort order | Y | N | N | N | Y | -| Update 
table location | Y | Y | N | N | Y | -| Update statistics | Y | Y | N | N | Y | +| Replace sort order | Y | N | Y | Y | Y | +| Update table location | Y | Y | Y | Y | Y | +| Update statistics | Y | Y | Y | Y | Y | | Update partition statistics | Y | N | N | N | N | -| Expire snapshots | Y | N | N | N | N | -| Manage snapshots | Y | N | N | N | N | +| Expire snapshots | Y | N | N | Y | N | +| Manage snapshots | Y | N | N | Y | N | ### Table Spec V2 | Operation | Java | PyIceberg | Rust | Go | C++ | |-----------------------------|------|-----------|------|----|-----| -| Update schema | Y | Y | N | N | Y | -| Update partition spec | Y | Y | N | N | Y | +| Update schema | Y | Y | N | Y | Y | +| Update partition spec | Y | Y | N | Y | Y | | Update table properties | Y | Y | Y | Y | Y | -| Replace sort order | Y | N | N | N | Y | -| Update table location | Y | Y | N | N | Y | -| Update statistics | Y | Y | N | N | Y | +| Replace sort order | Y | N | Y | Y | Y | +| Update table location | Y | Y | Y | Y | Y | +| Update statistics | Y | Y | Y | Y | Y | | Update partition statistics | Y | N | N | N | N | -| Expire snapshots | Y | N | N | N | N | -| Manage snapshots | Y | N | N | N | N | +| Expire snapshots | Y | N | N | Y | N | +| Manage snapshots | Y | N | N | Y | N | ## Table Update Operations @@ -113,22 +116,22 @@ This section lists the libraries that implement the Apache Iceberg specification | Operation | Java | PyIceberg | Rust | Go | C++ | |-------------------|------|-----------|------|----|-----| -| Append data files | Y | Y | N | Y | Y | -| Rewrite files | Y | Y | N | N | N | +| Append data files | Y | Y | Y | Y | Y | +| Rewrite files | Y | Y | N | Y | N | | Rewrite manifests | Y | Y | N | Y | N | -| Overwrite files | Y | Y | N | N | N | -| Delete files | Y | Y | N | N | N | +| Overwrite files | Y | Y | N | Y | N | +| Delete files | Y | Y | N | Y | N | ### Table Spec V2 | Operation | Java | PyIceberg | Rust | Go | C++ | 
|-------------------|------|-----------|------|----|-----| -| Append data files | Y | Y | N | Y | Y | -| Rewrite files | Y | Y | N | N | N | +| Append data files | Y | Y | Y | Y | Y | +| Rewrite files | Y | Y | N | Y | N | | Rewrite manifests | Y | Y | N | Y | N | -| Overwrite files | Y | Y | N | N | N | -| Row delta | Y | N | N | N | N | -| Delete files | Y | Y | N | N | N | +| Overwrite files | Y | Y | N | Y | N | +| Row delta | Y | N | N | Y | N | +| Delete files | Y | Y | N | Y | N | ## Table Read Operations @@ -145,12 +148,12 @@ This section lists the libraries that implement the Apache Iceberg specification | Operation | Java | PyIceberg | Rust | Go | C++ | |-----------------------------|------|-----------|------|----|-----| | Plan with data file | Y | Y | Y | Y | Y | -| Plan with position deletes | Y | Y | N | Y | Y | -| Plan with equality deletes | Y | Y | N | N | Y | +| Plan with position deletes | Y | Y | Y | Y | Y | +| Plan with equality deletes | Y | Y | Y | Y | Y | | Plan with puffin statistics | Y | N | N | N | N | | Read data file | Y | Y | Y | Y | Y | -| Read with position deletes | Y | Y | N | Y | N | -| Read with equality deletes | Y | N | N | N | N | +| Read with position deletes | Y | Y | Y | Y | N | +| Read with equality deletes | Y | N | Y | Y | N | ## Table Write Operations @@ -165,8 +168,8 @@ This section lists the libraries that implement the Apache Iceberg specification | Operation | Java | PyIceberg | Rust | Go | C++ | |------------------------|------|-----------|------|----|-----| | Append data | Y | Y | Y | Y | N | -| Write position deletes | Y | N | N | N | N | -| Write equality deletes | Y | N | N | N | N | +| Write position deletes | Y | N | N | Y | N | +| Write equality deletes | Y | N | Y | Y | N | ## Catalogs @@ -200,10 +203,10 @@ This section lists the libraries that implement the Apache Iceberg specification | View Operation | Java | PyIceberg | Rust | Go | C++ | |----------------|------|-----------|------|----|-----| -| 
createView | Y | N | N | N | N | -| dropView | Y | Y | N | N | N | -| listView | Y | Y | N | N | N | -| viewExists | Y | Y | N | N | N | +| createView | Y | N | N | Y | N | +| dropView | Y | Y | N | Y | N | +| listView | Y | Y | N | Y | N | +| viewExists | Y | Y | N | Y | N | | replaceView | Y | N | N | N | N | | renameView | Y | N | N | N | N | @@ -256,10 +259,10 @@ The sql catalog is a catalog backed by a sql database, which is called jdbc cata | View Operation | Java | PyIceberg | Rust | Go | C++ | |----------------|------|-----------|------|----|-----| -| createView | Y | N | N | N | N | -| dropView | Y | N | N | N | N | -| listView | Y | N | N | N | N | -| viewExists | Y | N | N | N | N | +| createView | Y | N | N | Y | N | +| dropView | Y | N | N | Y | N | +| listView | Y | N | N | Y | N | +| viewExists | Y | N | N | Y | N | | replaceView | Y | N | N | N | N | | renameView | Y | N | N | N | N | @@ -267,12 +270,12 @@ The sql catalog is a catalog backed by a sql database, which is called jdbc cata | Namespace Operation | Java | PyIceberg | Rust | Go | C++ | |---------------------------|------|-----------|------|----|-----| -| listNamespaces | Y | Y | N | Y | N | -| createNamespace | Y | Y | N | Y | N | +| listNamespaces | Y | Y | Y | Y | N | +| createNamespace | Y | Y | Y | Y | N | | dropNamespace | Y | Y | Y | Y | N | -| namespaceExists | Y | N | N | Y | N | +| namespaceExists | Y | N | Y | Y | N | | updateNamespaceProperties | Y | Y | Y | Y | N | -| loadNamespaceMetadata | Y | Y | N | Y | N | +| loadNamespaceMetadata | Y | Y | Y | Y | N | ### Glue Catalog @@ -315,12 +318,12 @@ The sql catalog is a catalog backed by a sql database, which is called jdbc cata | Namespace Operation | Java | PyIceberg | Rust | Go | C++ | |---------------------------|------|-----------|------|----|-----| -| listNamespaces | Y | Y | N | Y | N | -| createNamespace | Y | Y | N | Y | N | -| dropNamespace | Y | Y | N | Y | N | -| namespaceExists | Y | N | N | Y | N | +| listNamespaces | 
Y | Y | Y | Y | N | +| createNamespace | Y | Y | Y | Y | N | +| dropNamespace | Y | Y | Y | Y | N | +| namespaceExists | Y | N | Y | Y | N | | updateNamespaceProperties | Y | Y | Y | Y | N | -| loadNamespaceMetadata | Y | Y | N | Y | N | +| loadNamespaceMetadata | Y | Y | Y | Y | N | ### Hive Metastore Catalog @@ -352,10 +355,10 @@ The sql catalog is a catalog backed by a sql database, which is called jdbc cata | View Operation | Java | PyIceberg | Rust | Go | C++ | |----------------|------|-----------|------|----|-----| -| createView | Y | N | N | N | N | -| dropView | Y | N | N | N | N | -| listView | Y | N | N | N | N | -| viewExists | Y | N | N | N | N | +| createView | Y | N | N | Y | N | +| dropView | Y | N | N | Y | N | +| listView | Y | N | N | Y | N | +| viewExists | Y | N | N | Y | N | | replaceView | Y | N | N | N | N | | renameView | Y | N | N | N | N | @@ -363,9 +366,9 @@ The sql catalog is a catalog backed by a sql database, which is called jdbc cata | Namespace Operation | Java | PyIceberg | Rust | Go | C++ | |---------------------------|------|-----------|------|----|-----| -| listNamespaces | Y | Y | N | N | N | -| createNamespace | Y | Y | N | N | N | -| dropNamespace | Y | Y | N | N | N | -| namespaceExists | Y | N | N | N | N | +| listNamespaces | Y | Y | Y | Y | N | +| createNamespace | Y | Y | Y | Y | N | +| dropNamespace | Y | Y | Y | Y | N | +| namespaceExists | Y | N | Y | Y | N | | updateNamespaceProperties | Y | Y | Y | Y | N | -| loadNamespaceMetadata | Y | Y | N | N | N | +| loadNamespaceMetadata | Y | Y | Y | Y | N | diff --git a/site/docs/vendors.md b/site/docs/vendors.md index 67a98a3c23a2..8ea8021a4d13 100644 --- a/site/docs/vendors.md +++ b/site/docs/vendors.md @@ -141,6 +141,10 @@ IOMETE is a fully-managed ready to use, batteries included Data Platform. 
IOMETE ### [Microsoft OneLake](https://learn.microsoft.com/en-us/fabric/onelake/) [Microsoft OneLake](https://learn.microsoft.com/en-us/fabric/onelake/) is a single unified data lake that brings together your entire data estate into an open, secure foundation for analytics across the organization. Built into Microsoft Fabric, OneLake delivers two powerful APIs: the Tables API and the Files API. The [OneLake Tables API](https://aka.ms/onelakeircdocs) supports the Apache Iceberg REST Catalog (IRC) specification, making it simple to create, manage, and integrate Iceberg tables with existing tools and workflows. The OneLake Files API offers full Azure Data Lake Storage (ADLS) compatibility, enabling seamless file operations and interoperability with familiar ADLS tools. Together, these APIs make OneLake a truly open and interoperable data lake, delivering flexibility and connectivity for modern analytics and AI-driven pipelines. +### [Oracle](https://oracle.com/) + +As a fully-managed Oracle AI Database service, Oracle [Autonomous AI Lakehouse](https://www.oracle.com/autonomous-database/autonomous-ai-lakehouse/) combines the openness of Apache Iceberg with the performance, automation, and security of Oracle Autonomous Database and Oracle Exadata. Available across Oracle Cloud Infrastructure (OCI), Microsoft Azure, Google Cloud, AWS, and on-premises, Oracle AI Database provides a multicloud and hybrid open lakehouse architecture with high-performance access to Iceberg tables through integration with existing catalogs and support for the Apache Iceberg REST Catalog specification. Oracle enables interoperability across engines such as Apache Spark, Trino, and Apache Flink while minimizing data movement and preserving vendor independence. 
Built-in AI, vector search, graph analytics, and JSON-relational capabilities allow organizations to run advanced analytics and AI workloads directly on Iceberg data with enterprise-grade governance, availability, and serverless scalability. + ### [PuppyGraph](https://puppygraph.com) PuppyGraph is a cloud-native graph analytics engine that enables users to query one or more relational data stores as a unified graph model. This eliminates the overhead of deploying and maintaining a siloed graph database system, with no ETL required. [PuppyGraph’s native Apache Iceberg integration](https://docs.puppygraph.com/user-manual/getting-started/iceberg) adds native graph capabilities to your existing data lake in an easy and performant way. @@ -157,6 +161,10 @@ Redpanda is both a cloud-native and self-hosted streaming platform whose [Iceber [Ryft](https://ryft.io/) is a fully automated Iceberg management platform. Ryft helps data teams create an open, automated and cost-effective Iceberg lakehouse, by maintaining and optimizing Iceberg tables in real time, based on actual usage patterns. The Ryft engine runs compaction intelligently, adapting to different use cases like streaming, batch jobs, CDC, and more. Ryft also automates compliance, disaster recovery and data lifecycle management for Iceberg tables, to ensure your lakehouse stays secure and compliant. It directly integrates with your existing catalog, storage and query engines, allowing for a very simple deployment. +### [Sail](https://lakesail.com/) + +[Sail](https://github.com/lakehq/sail) is an open-source multimodal distributed compute framework, built in Rust, unifying batch, streaming, and AI workloads. For seamless adoption, Sail offers a drop-in replacement for the Spark SQL and DataFrame APIs in both single-host and distributed settings. Learn more about using Sail with Iceberg in the [Sail Iceberg guide](https://docs.lakesail.com/sail/latest/guide/sources/iceberg). 
+ ### [SingleStore](https://singlestore.com/) SingleStore is a high‑performance, scalable, distributed SQL platform that makes real‑time analytics and transactional processing available at scale. Its native Apache Iceberg integration removes costly ETL steps and powers intelligent, millisecond‑response applications. diff --git a/site/mkdocs-dev.yml b/site/mkdocs-dev.yml index 8891eb1a951d..b4c68aacc3fc 100644 --- a/site/mkdocs-dev.yml +++ b/site/mkdocs-dev.yml @@ -30,7 +30,7 @@ nav: - Docs: - Java: - Nightly: '!include docs/docs/nightly/mkdocs.yml' - - Latest (1.10.0): '!include docs/docs/latest/mkdocs.yml' + - Latest (1.10.1): '!include docs/docs/latest/mkdocs.yml' - Other Implementations: - Python: https://py.iceberg.apache.org/ - Rust: https://rust.iceberg.apache.org/ @@ -52,6 +52,7 @@ nav: - Apache Amoro: integrations/amoro.md - Apache Doris: https://doris.apache.org/docs/dev/lakehouse/catalogs/iceberg-catalog - Apache Druid: https://druid.apache.org/docs/latest/development/extensions-contrib/iceberg/ + - Apache Fluss: https://fluss.apache.org/docs/next/streaming-lakehouse/integrate-data-lakes/iceberg/ - BladePipe: https://www.bladepipe.com/docs/dataMigrationAndSync/datasource_func/Iceberg/props_for_iceberg_ds - ClickHouse: https://clickhouse.com/docs/en/engines/table-engines/integrations/iceberg - Daft: integrations/daft.md @@ -63,12 +64,14 @@ nav: - Google BigQuery: https://cloud.google.com/bigquery/docs/iceberg-tables - Impala: https://impala.apache.org/docs/build/html/topics/impala_iceberg.html - Memiiso Debezium: https://memiiso.github.io/debezium-server-iceberg/ + - Microsoft OneLake: https://aka.ms/onelakeircdocs - Nimtable: https://github.com/nimtable/nimtable - OLake: https://olake.io/docs - Presto: https://prestodb.io/docs/current/connector/iceberg.html - Redpanda: https://docs.redpanda.com/current/manage/iceberg/about-iceberg-topics - RisingWave: integrations/risingwave.md - Ryft: https://docs.ryft.io/platform + - Sail: 
https://docs.lakesail.com/sail/latest/guide/sources/iceberg - Snowflake: https://docs.snowflake.com/en/user-guide/tables-iceberg - Starburst: https://docs.starburst.io/latest/connector/iceberg.html - Starrocks: https://docs.starrocks.io/en-us/latest/data_source/catalog/iceberg_catalog diff --git a/site/nav.yml b/site/nav.yml index de1770c7ad78..dd2b0dce474f 100644 --- a/site/nav.yml +++ b/site/nav.yml @@ -85,6 +85,7 @@ nav: - Redpanda: https://docs.redpanda.com/current/manage/iceberg/about-iceberg-topics - RisingWave: integrations/risingwave.md - Ryft: https://docs.ryft.io/platform + - Sail: https://docs.lakesail.com/sail/latest/guide/sources/iceberg - Snowflake: https://docs.snowflake.com/en/user-guide/tables-iceberg - Starburst: https://docs.starburst.io/latest/connector/iceberg.html - Starrocks: https://docs.starrocks.io/en-us/latest/data_source/catalog/iceberg_catalog diff --git a/site/overrides/home.html b/site/overrides/home.html index 8d6e49176963..65d971e0a134 100644 --- a/site/overrides/home.html +++ b/site/overrides/home.html @@ -37,27 +37,6 @@

    Apache Iceberg™

    The open table format for analytic datasets.


    - - -
      {% for social in config.extra.social %}
    • @@ -331,34 +310,6 @@

      Data Compaction

      src="assets/javascript/termynal.js" data-termynal-container="#termynal|#termynal-data-compaction|#termynal-expressive-sql|#termynal-time-travel"> - - - {% endblock %} {% block content %} diff --git a/site/requirements.txt b/site/requirements.txt index f7877c7a6985..e21e4c8f1b5c 100644 --- a/site/requirements.txt +++ b/site/requirements.txt @@ -20,6 +20,6 @@ mkdocs-macros-plugin==1.5.0 mkdocs-material==9.7.5 mkdocs-material-extensions==1.3.1 mkdocs-monorepo-plugin @ git+https://github.com/bitsondatadev/mkdocs-monorepo-plugin@url-fix -mkdocs-redirects==1.2.2 -mkdocs-rss-plugin==1.17.9 +mkdocs-redirects==1.2.3 +mkdocs-rss-plugin==1.19.0 pymarkdownlnt==0.9.36 diff --git a/spark/v3.4/build.gradle b/spark/v3.4/build.gradle index 42fccdd85dad..599157ab2309 100644 --- a/spark/v3.4/build.gradle +++ b/spark/v3.4/build.gradle @@ -112,13 +112,10 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') - testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - testImplementation libs.sqlite.jdbc + testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts') + testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.awaitility - // runtime dependencies for running REST Catalog based integration test - testRuntimeOnly libs.jetty.servlet + testImplementation(testFixtures(project(':iceberg-parquet'))) } test { @@ -179,13 +176,7 @@ project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVer testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-hive-metastore', 
configuration: 'testArtifacts') testImplementation project(path: ":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}", configuration: 'testArtifacts') - testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - // runtime dependencies for running REST Catalog based integration test - testRuntimeOnly libs.jetty.servlet - testRuntimeOnly libs.sqlite.jdbc - + testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.avro.avro testImplementation libs.parquet.hadoop testImplementation libs.awaitility @@ -276,11 +267,7 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio integrationRuntimeOnly project(':iceberg-hive-metastore') // runtime dependencies for running REST Catalog based integration test integrationRuntimeOnly project(path: ':iceberg-core', configuration: 'testArtifacts') - integrationRuntimeOnly (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - integrationRuntimeOnly libs.jetty.servlet - integrationRuntimeOnly libs.sqlite.jdbc + integrationRuntimeOnly (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) // Not allowed on our classpath, only the runtime jar is allowed integrationCompileOnly project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVersion}") @@ -343,5 +330,7 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java index f48e39e500c0..86f3f19de937 100644 --- a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java +++ 
b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java @@ -31,6 +31,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions; import org.apache.iceberg.util.ThreadPools; import org.apache.spark.sql.SparkSession; @@ -157,7 +158,7 @@ private void initDataAndDeletes() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java index dc625d240769..97e6b86dabce 100644 --- a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java +++ b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java @@ -155,7 +155,7 @@ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java 
b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java index 05aa9602a323..52884bf10308 100644 --- a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java +++ b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java @@ -157,7 +157,7 @@ public void localPlanningWithoutFilterWithStats(Blackhole blackhole) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java index 2b85a8e385ec..5e39596f6ac6 100644 --- a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java +++ b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java @@ -242,7 +242,7 @@ private Dataset randomDataDF(Schema schema, int numRows) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java index d917eae5eb0f..caa23625fc44 100644 --- 
a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java +++ b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java @@ -138,7 +138,7 @@ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java b/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java index 4c1a5095916c..834640e24328 100644 --- a/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java +++ b/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java @@ -59,6 +59,7 @@ public static void startMetastoreAndSpark() { .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") .config( SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), String.valueOf(RANDOM.nextBoolean())) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java b/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java index 1dd6db48f7d8..b106e8fc38f3 100644 --- a/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java +++ b/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java @@ -1395,6 +1395,62 @@ public void testDeleteToCustomWapBranchWithoutWhereClause() throws NoSuchTableEx }); } + @TestTemplate + 
public void testDeleteToWapBranchCanDeleteWhereScansWapBranch() throws NoSuchTableException { + assumeThat(branch).as("WAP branch only works for table identifier without branch").isNull(); + + createAndInitPartitionedTable(); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'true')", + tableName, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED); + + append(tableName, new Employee(1, "hr")); + + spark.conf().set(SparkSQLProperties.WAP_BRANCH, "wap"); + try { + append(tableName, new Employee(0, "hr"), new Employee(1, "hr"), new Employee(2, "hr")); + + sql("DELETE FROM %s WHERE id = 1", tableName); + + assertThat(sql("SELECT id, dep FROM %s.branch_wap ORDER BY id", tableName)) + .as("DELETE should remove the matching rows from the WAP branch") + .containsExactly(row(0, "hr"), row(2, "hr")); + assertThat(sql("SELECT id, dep FROM %s.branch_main", tableName)) + .as("Main branch must not be modified by a WAP-targeted DELETE") + .containsExactly(row(1, "hr")); + } finally { + spark.conf().unset(SparkSQLProperties.WAP_BRANCH); + } + } + + @TestTemplate + public void testMetadataDeleteToWapBranchCommitsToWapBranch() throws NoSuchTableException { + assumeThat(branch).as("WAP branch only works for table identifier without branch").isNull(); + + createAndInitPartitionedTable(); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'true')", + tableName, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED); + + append(tableName, new Employee(1, "hr"), new Employee(5, "eng")); + + spark.conf().set(SparkSQLProperties.WAP_BRANCH, "wap"); + try { + append(tableName, new Employee(0, "hr"), new Employee(2, "eng")); + + sql("DELETE FROM %s WHERE dep = 'hr'", tableName); + + assertThat(sql("SELECT id, dep FROM %s.branch_wap ORDER BY id", tableName)) + .as("Metadata delete should remove the hr partition on the WAP branch") + .containsExactly(row(2, "eng"), row(5, "eng")); + assertThat(sql("SELECT id, dep FROM %s.branch_main ORDER BY id", tableName)) + .as("Metadata delete must not commit to main when 
WAP is set") + .containsExactly(row(1, "hr"), row(5, "eng")); + } finally { + spark.conf().unset(SparkSQLProperties.WAP_BRANCH); + } + } + @TestTemplate public void testDeleteWithFilterOnNestedColumn() { createAndInitNestedColumnsTable(); diff --git a/spark/v3.4/spark-runtime/LICENSE b/spark/v3.4/spark-runtime/LICENSE index a67296eb412c..3aceb9b01aa9 100644 --- a/spark/v3.4/spark-runtime/LICENSE +++ b/spark/v3.4/spark-runtime/LICENSE @@ -219,6 +219,94 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). + +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. 
+ +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). + +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. 
Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles Apache Parquet. Copyright: 2014-2024 The Apache Software Foundation @@ -227,7 +315,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Thrift. +This product bundles Apache Thrift (bundled by Parquet). Copyright: 2006-2017 The Apache Software Foundation. Project URL: https://thrift.apache.org/ @@ -235,7 +323,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product includes code from Daniel Lemire's JavaFastPFOR project. +This product includes code from Daniel Lemire's JavaFastPFOR project (bundled by Parquet). 
Copyright: 2013 Daniel Lemire Project URL: https://github.com/lemire/JavaFastPFOR @@ -243,7 +331,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles fastutil. +This product bundles fastutil (bundled by Parquet). Copyright: 2002-2014 Sebastiano Vigna Project URL: http://fastutil.di.unimi.it/ @@ -251,6 +339,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles Zero-Allocation Hashing (bundled by Parquet). + +Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles Apache ORC. Copyright: 2013 and onwards The Apache Software Foundation. @@ -259,7 +354,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Hive. +This product bundles Apache Hive's Storage API (bundled by ORC). Copyright: 2008-2020 The Apache Software Foundation Project URL: https://hive.apache.org/ @@ -267,11 +362,12 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google protobuf. +This product bundles Google protobuf (bundled by ORC). Copyright: 2008 Google Inc. Project URL: https://developers.google.com/protocol-buffers License: BSD 3-Clause + | Copyright 2008 Google Inc. All rights reserved. | | Redistribution and use in source and binary forms, with or without @@ -352,6 +448,7 @@ This product bundles checkerframework checker-qual. 
Copyright: 2004-2019 the Checker Framework developers Project URL: https://github.com/typetools/checker-framework License: MIT license + | The annotations are licensed under the MIT License. (The text of this | license appears below.) More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -390,20 +487,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Perfmark. - -Project URL: https://github.com/perfmark/perfmark -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles org.json. - -Project URL: https://github.com/douglascrockford/JSON-java -License: Public Domain - https://github.com/stleary/JSON-java/blob/master/LICENSE - --------------------------------------------------------------------------------- - This product bundles Apache Arrow. Copyright: 2016-2019 The Apache Software Foundation. @@ -420,47 +503,18 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google FlatBuffers. +This product bundles JCTools (via Netty). -Copyright: 2013-2020 Google Inc. -Project URL: https://google.github.io/flatbuffers/ +Project URL: https://github.com/JCTools/JCTools License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles ThreeTen BP. +This product bundles Google FlatBuffers. -Project URL: https://www.threeten.org/threetenbp -License: BSD 3-Clause -| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| -| All rights reserved. 
-| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. -| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Copyright: 2013-2020 Google Inc. +Project URL: https://google.github.io/flatbuffers/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -469,6 +523,7 @@ This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. Project URL: https://www.threeten.org/threeten-extra/ License: BSD 3-Clause + | All rights reserved. 
| | * Redistribution and use in source and binary forms, with or without @@ -540,21 +595,394 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache HttpComponents (client and core). +This product bundles and includes code from Apache HttpComponents (core/client). + +* retry and error handling logic in ExponentialHttpRequestRetryStrategy.java -Copyright: 1999-2022 The Apache Software Foundation. +Copyright: 1999-2022 The Apache Software Foundation Project URL: https://hc.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product includes code from Apache HttpComponents Client. +This product bundles the Mozilla Public Suffix List, distributed by Apache HttpComponents. -* retry and error handling logic in ExponentialHttpRequestRetryStrategy.java +Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ -Copyright: 1999-2022 The Apache Software Foundation. -Project URL: https://hc.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. "Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. 
"Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. "Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. "Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. 
"Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. 
No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. 
You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. 
Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. 
The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. 
Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. 
Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. 
Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. -------------------------------------------------------------------------------- @@ -573,16 +1001,46 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Datasketches. +This product bundles Eclipse Collections. 
-Project URL: https://datasketches.apache.org -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +Project URL: https://github.com/eclipse-collections/eclipse-collections +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php + +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| * Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- -This product bundles Zero-Allocation Hashing. +This product bundles Apache Datasketches. -Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing +Project URL: https://datasketches.apache.org License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -590,70 +1048,37 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles JSpecify. - -Project URL: https://github.com/jspecify/jspecify -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Animal Sniffer Annotations. - -License: MIT -| The MIT License -| -| Copyright (c) 2009 codehaus.org. -| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. 
-| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This product bundles Android Annotations. - -Project URL: http://source.android.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Conscrypt (openjdk-uber). - -Project URL: https://conscrypt.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud BigQuery Client for Java. - -Project URL: https://github.com/googleapis/java-bigquery -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud Java Client Libraries. - -Project URL: https://github.com/googleapis/google-cloud-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. 
+| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| * Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- This product bundles RoaringBitmap. @@ -661,128 +1086,3 @@ This product bundles RoaringBitmap. Copyright: (c) 2013-... 
the RoaringBitmap authors Project URL: https://github.com/RoaringBitmap/RoaringBitmap License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Credentials for Java. - -Project URL: https://github.com/aliyun/credentials-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Tea for Java. - -Project URL: https://github.com/aliyun/tea-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google GAX. - -Project URL: https://github.com/googleapis/gax-java -License: BSD 3-Clause -| Copyright 2016, Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This product bundles Google Gson. - -Project URL: https://github.com/google/gson/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles gRPC. - -Project URL: https://github.com/grpc/grpc-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenCensus. - -Project URL: https://github.com/census-instrumentation/opencensus-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OkHttp. - -Project URL: https://github.com/square/okhttp -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Auto Value. - -Project URL: https://github.com/google/auto/tree/master/value -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JaCoCo. 
- -Project URL: https://github.com/jacoco/jacoco/ -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles OpenTelemetry. - -Project URL: https://opentelemetry.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Kotlin. - -Project URL: https://github.com/JetBrains/kotlin -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JAXB. - -Project URL: http://jaxb.java.net/ -License: CDDL 1.1 - https://glassfish.java.net/public/CDDL+GPL_1_1.html - --------------------------------------------------------------------------------- - -This product bundles EMMA runtime. - -Project URL: https://github.com/ehelms/Emma/ -License: Common Public License - v 1.0 - --------------------------------------------------------------------------------- - -This product bundles Google j2objc. - -Project URL: https://github.com/google/j2objc/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 diff --git a/spark/v3.4/spark-runtime/NOTICE b/spark/v3.4/spark-runtime/NOTICE index 68abd73906b1..c038e853af77 100644 --- a/spark/v3.4/spark-runtime/NOTICE +++ b/spark/v3.4/spark-runtime/NOTICE @@ -66,42 +66,6 @@ This product bundles Airlift Aircompressor with the following in its NOTICE file -------------------------------------------------------------------------------- -This product bundles Google Protobuf with the following in its NOTICE file: -| Copyright 2008 Google Inc. All rights reserved. 
-| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. 
- --------------------------------------------------------------------------------- - This product bundles Netty with the following in its NOTICE file: | The Netty Project | ================= @@ -371,6 +335,42 @@ This product bundles Project Nessie with the following in its NOTICE file: -------------------------------------------------------------------------------- +This product bundles Jackson JSON Processor with the following in its NOTICE file: +| # Jackson JSON processor +| +| Jackson is a high-performance, Free/Open Source JSON processing library. +| It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has +| been in development since 2007. +| It is currently developed by a community of developers. +| +| ## Copyright +| +| Copyright 2007-, Tatu Saloranta (tatu.saloranta@iki.fi) +| +| ## Licensing +| +| Jackson 2.x core and extension components are licensed under Apache License 2.0 +| To find the details that apply to this artifact see the accompanying LICENSE file. +| +| ## Credits +| +| A list of contributors may be found from CREDITS(-2.x) file, which is included +| in some artifacts (usually source distributions); but is always available +| from the source code management (SCM) system project uses. +| +| ## FastDoubleParser +| +| jackson-core bundles a shaded copy of FastDoubleParser . +| That code is available under an MIT license +| under the following copyright. +| +| Copyright © 2023 Werner Randelshofer, Switzerland. MIT License. +| +| See FastDoubleParser-NOTICE for details of other source code included in FastDoubleParser +| and the licenses and copyrights that apply to that code. 
+ +-------------------------------------------------------------------------------- + This product bundles Eclipse MicroProfile OpenAPI with the following in its NOTICE file: | ========================================================================= | == NOTICE file corresponding to section 4(d) of the Apache License, == @@ -391,69 +391,3 @@ This product bundles Eclipse MicroProfile OpenAPI with the following in its NOTI | PackageCopyrightText: | Arthur De Magalhaes arthurdm@ca.ibm.com | - --------------------------------------------------------------------------------- - -This product bundles gRPC with the following in its NOTICE file: -| Copyright 2014 The gRPC Authors -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. -| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. 
-| -| ----------------------------------------------------------------------- -| -| This product contains a modified portion of 'OkHttp', an open source -| HTTP & SPDY client for Android and Java applications, which can be obtained -| at: -| -| * LICENSE: -| * okhttp/third_party/okhttp/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/square/okhttp -| * LOCATION_IN_GRPC: -| * okhttp/third_party/okhttp -| -| This product contains a modified portion of 'Envoy', an open source -| cloud-native high-performance edge/middle/service proxy, which can be -| obtained at: -| -| * LICENSE: -| * xds/third_party/envoy/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/envoy/NOTICE -| * HOMEPAGE: -| * https://www.envoyproxy.io -| * LOCATION_IN_GRPC: -| * xds/third_party/envoy -| -| This product contains a modified portion of 'protoc-gen-validate (PGV)', -| an open source protoc plugin to generate polyglot message validators, -| which can be obtained at: -| -| * LICENSE: -| * xds/third_party/protoc-gen-validate/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/protoc-gen-validate/NOTICE -| * HOMEPAGE: -| * https://github.com/envoyproxy/protoc-gen-validate -| * LOCATION_IN_GRPC: -| * xds/third_party/protoc-gen-validate -| -| This product contains a modified portion of 'udpa', -| an open source universal data plane API, which can be obtained at: -| -| * LICENSE: -| * xds/third_party/udpa/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/cncf/udpa -| * LOCATION_IN_GRPC: -| * xds/third_party/udpa diff --git a/spark/v3.4/spark-runtime/baseline-class-uniqueness.lock b/spark/v3.4/spark-runtime/baseline-class-uniqueness.lock index 35cad90d888f..6197975f3900 100644 --- a/spark/v3.4/spark-runtime/baseline-class-uniqueness.lock +++ b/spark/v3.4/spark-runtime/baseline-class-uniqueness.lock @@ -77,21 +77,21 @@ - io.netty.buffer.AdaptivePoolingAllocator$SizeClassChunkController - 
io.netty.buffer.AdaptivePoolingAllocator$SizeClassedChunk - io.netty.buffer.ByteBufUtil - - io.netty.buffer.ByteBufUtil$2 - - io.netty.buffer.ByteBufUtil$HexUtil - - io.netty.buffer.ByteBufUtil$ThreadLocalDirectByteBuf - - io.netty.buffer.ByteBufUtil$ThreadLocalDirectByteBuf$1 - - io.netty.buffer.ByteBufUtil$ThreadLocalUnsafeDirectByteBuf - - io.netty.buffer.ByteBufUtil$ThreadLocalUnsafeDirectByteBuf$1 - io.netty.buffer.CompositeByteBuf - io.netty.buffer.EmptyByteBuf - io.netty.buffer.PoolArena + - io.netty.buffer.PoolArena$DirectArena - io.netty.buffer.PoolThreadCache$FreeOnFinalize - io.netty.buffer.PooledByteBufAllocator + - io.netty.buffer.PooledByteBufAllocator$PoolThreadLocalCache - io.netty.buffer.ReadOnlyAbstractByteBuf - io.netty.buffer.SimpleLeakAwareByteBuf - io.netty.buffer.Unpooled + - io.netty.buffer.UnpooledByteBufAllocator$DecrementingCleanableDirectBuffer + - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledDirectByteBuf + - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeDirectByteBuf - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeNoCleanerDirectByteBuf + - io.netty.buffer.UnpooledByteBufAllocator$UnpooledByteBufAllocatorMetric - io.netty.buffer.UnpooledDirectByteBuf - io.netty.buffer.UnpooledHeapByteBuf - io.netty.buffer.UnpooledUnsafeDirectByteBuf @@ -99,11 +99,19 @@ - io.netty.buffer.UnsafeByteBufUtil [dev.vortex:vortex-jni (classifier=all), io.netty:netty-common] - io.netty.util.AbstractReferenceCounted + - io.netty.util.DefaultAttributeMap - io.netty.util.HashedWheelTimer - io.netty.util.HashedWheelTimer$HashedWheelBucket - io.netty.util.LeakPresenceDetector - io.netty.util.LeakPresenceDetector$LeakCreation - io.netty.util.LeakPresenceDetector$ResourceScope + - io.netty.util.Recycler + - io.netty.util.Recycler$BlockingMessageQueue + - io.netty.util.Recycler$DefaultHandle + - io.netty.util.Recycler$EnhancedHandle + - io.netty.util.Recycler$GuardedLocalPool + - 
io.netty.util.Recycler$LocalPool + - io.netty.util.Recycler$UnguardedLocalPool - io.netty.util.concurrent.AbstractScheduledEventExecutor - io.netty.util.concurrent.GlobalEventExecutor - io.netty.util.concurrent.GlobalEventExecutor$2 @@ -118,11 +126,23 @@ - io.netty.util.concurrent.SingleThreadEventExecutor$4 - io.netty.util.concurrent.SingleThreadEventExecutor$5 - io.netty.util.concurrent.SingleThreadEventExecutor$DefaultThreadProperties + - io.netty.util.internal.Cleaner - io.netty.util.internal.CleanerJava24Linker - io.netty.util.internal.CleanerJava24Linker$CleanableDirectBufferImpl - io.netty.util.internal.CleanerJava25 - io.netty.util.internal.CleanerJava25$CleanableDirectBufferImpl + - io.netty.util.internal.CleanerJava6 + - io.netty.util.internal.CleanerJava6$2 + - io.netty.util.internal.CleanerJava6$CleanableDirectBufferImpl + - io.netty.util.internal.CleanerJava9 + - io.netty.util.internal.CleanerJava9$2 + - io.netty.util.internal.CleanerJava9$CleanableDirectBufferImpl + - io.netty.util.internal.DirectCleaner + - io.netty.util.internal.DirectCleaner$CleanableDirectBufferImpl + - io.netty.util.internal.EmptyArrays - io.netty.util.internal.PlatformDependent + - io.netty.util.internal.PlatformDependent$1 + - io.netty.util.internal.PlatformDependent$1$1 - io.netty.util.internal.PlatformDependent$Mpsc - io.netty.util.internal.PlatformDependent$Mpsc$1 - io.netty.util.internal.PlatformDependent0 diff --git a/spark/v3.4/spark-runtime/runtime-deps.txt b/spark/v3.4/spark-runtime/runtime-deps.txt new file mode 100644 index 000000000000..f03bb5933758 --- /dev/null +++ b/spark/v3.4/spark-runtime/runtime-deps.txt @@ -0,0 +1,48 @@ +com.fasterxml.jackson.core:jackson-annotations:2.21 +com.fasterxml.jackson.core:jackson-core:2.14.2 +com.fasterxml.jackson.core:jackson-databind:2.14.2 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.21.3 +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.google.errorprone:error_prone_annotations:2.41.0 
+com.google.flatbuffers:flatbuffers-java:25.2.10 +com.google.guava:failureaccess:1.0.3 +com.google.guava:guava:33.5.0-jre +com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava +com.google.j2objc:j2objc-annotations:3.1 +com.google.protobuf:protobuf-java:4.33.5 +dev.failsafe:failsafe:3.3.2 +dev.vortex:vortex-jni:0.67.0 +dev.vortex:vortex-spark_2.12:0.67.0 +io.airlift:aircompressor:2.0.3 +io.netty:netty-buffer:4.2.12.Final +io.netty:netty-common:4.2.12.Final +org.apache.arrow:arrow-c-data:18.3.0 +org.apache.arrow:arrow-format:18.3.0 +org.apache.arrow:arrow-memory-core:18.3.0 +org.apache.arrow:arrow-memory-netty-buffer-patch:18.3.0 +org.apache.arrow:arrow-memory-netty:18.3.0 +org.apache.arrow:arrow-vector:18.3.0 +org.apache.avro:avro:1.12.1 +org.apache.datasketches:datasketches-java:6.2.0 +org.apache.datasketches:datasketches-memory:3.0.2 +org.apache.httpcomponents.client5:httpclient5:5.6.1 +org.apache.httpcomponents.core5:httpcore5-h2:5.4 +org.apache.httpcomponents.core5:httpcore5:5.4 +org.apache.orc:orc-core:1.9.8 +org.apache.orc:orc-shims:1.9.8 +org.apache.parquet:parquet-avro:1.17.0 +org.apache.parquet:parquet-column:1.17.0 +org.apache.parquet:parquet-common:1.17.0 +org.apache.parquet:parquet-encoding:1.17.0 +org.apache.parquet:parquet-format-structures:1.17.0 +org.apache.parquet:parquet-hadoop:1.17.0 +org.apache.parquet:parquet-jackson:1.17.0 +org.apache.parquet:parquet-variant:1.17.0 +org.checkerframework:checker-qual:3.19.0 +org.eclipse.microprofile.openapi:microprofile-openapi-api:4.1.1 +org.jspecify:jspecify:1.0.0 +org.locationtech.jts:jts-core:1.20.0 +org.projectnessie.nessie:nessie-client:0.107.5 +org.projectnessie.nessie:nessie-model:0.107.5 +org.roaringbitmap:RoaringBitmap:1.6.14 +org.threeten:threeten-extra:1.7.1 diff --git a/spark/v3.4/spark/baseline-class-uniqueness.lock b/spark/v3.4/spark/baseline-class-uniqueness.lock index 4a6e30c63973..72c0c24fb849 100644 --- a/spark/v3.4/spark/baseline-class-uniqueness.lock +++ 
b/spark/v3.4/spark/baseline-class-uniqueness.lock @@ -125,21 +125,21 @@ - io.netty.buffer.AdaptivePoolingAllocator$SizeClassChunkController - io.netty.buffer.AdaptivePoolingAllocator$SizeClassedChunk - io.netty.buffer.ByteBufUtil - - io.netty.buffer.ByteBufUtil$2 - - io.netty.buffer.ByteBufUtil$HexUtil - - io.netty.buffer.ByteBufUtil$ThreadLocalDirectByteBuf - - io.netty.buffer.ByteBufUtil$ThreadLocalDirectByteBuf$1 - - io.netty.buffer.ByteBufUtil$ThreadLocalUnsafeDirectByteBuf - - io.netty.buffer.ByteBufUtil$ThreadLocalUnsafeDirectByteBuf$1 - io.netty.buffer.CompositeByteBuf - io.netty.buffer.EmptyByteBuf - io.netty.buffer.PoolArena + - io.netty.buffer.PoolArena$DirectArena - io.netty.buffer.PoolThreadCache$FreeOnFinalize - io.netty.buffer.PooledByteBufAllocator + - io.netty.buffer.PooledByteBufAllocator$PoolThreadLocalCache - io.netty.buffer.ReadOnlyAbstractByteBuf - io.netty.buffer.SimpleLeakAwareByteBuf - io.netty.buffer.Unpooled + - io.netty.buffer.UnpooledByteBufAllocator$DecrementingCleanableDirectBuffer + - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledDirectByteBuf + - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeDirectByteBuf - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeNoCleanerDirectByteBuf + - io.netty.buffer.UnpooledByteBufAllocator$UnpooledByteBufAllocatorMetric - io.netty.buffer.UnpooledDirectByteBuf - io.netty.buffer.UnpooledHeapByteBuf - io.netty.buffer.UnpooledUnsafeDirectByteBuf @@ -147,11 +147,19 @@ - io.netty.buffer.UnsafeByteBufUtil [dev.vortex:vortex-jni (classifier=all), io.netty:netty-common] - io.netty.util.AbstractReferenceCounted + - io.netty.util.DefaultAttributeMap - io.netty.util.HashedWheelTimer - io.netty.util.HashedWheelTimer$HashedWheelBucket - io.netty.util.LeakPresenceDetector - io.netty.util.LeakPresenceDetector$LeakCreation - io.netty.util.LeakPresenceDetector$ResourceScope + - io.netty.util.Recycler + - io.netty.util.Recycler$BlockingMessageQueue + - 
io.netty.util.Recycler$DefaultHandle + - io.netty.util.Recycler$EnhancedHandle + - io.netty.util.Recycler$GuardedLocalPool + - io.netty.util.Recycler$LocalPool + - io.netty.util.Recycler$UnguardedLocalPool - io.netty.util.concurrent.AbstractScheduledEventExecutor - io.netty.util.concurrent.GlobalEventExecutor - io.netty.util.concurrent.GlobalEventExecutor$2 @@ -166,11 +174,23 @@ - io.netty.util.concurrent.SingleThreadEventExecutor$4 - io.netty.util.concurrent.SingleThreadEventExecutor$5 - io.netty.util.concurrent.SingleThreadEventExecutor$DefaultThreadProperties + - io.netty.util.internal.Cleaner - io.netty.util.internal.CleanerJava24Linker - io.netty.util.internal.CleanerJava24Linker$CleanableDirectBufferImpl - io.netty.util.internal.CleanerJava25 - io.netty.util.internal.CleanerJava25$CleanableDirectBufferImpl + - io.netty.util.internal.CleanerJava6 + - io.netty.util.internal.CleanerJava6$2 + - io.netty.util.internal.CleanerJava6$CleanableDirectBufferImpl + - io.netty.util.internal.CleanerJava9 + - io.netty.util.internal.CleanerJava9$2 + - io.netty.util.internal.CleanerJava9$CleanableDirectBufferImpl + - io.netty.util.internal.DirectCleaner + - io.netty.util.internal.DirectCleaner$CleanableDirectBufferImpl + - io.netty.util.internal.EmptyArrays - io.netty.util.internal.PlatformDependent + - io.netty.util.internal.PlatformDependent$1 + - io.netty.util.internal.PlatformDependent$1$1 - io.netty.util.internal.PlatformDependent$Mpsc - io.netty.util.internal.PlatformDependent$Mpsc$1 - io.netty.util.internal.PlatformDependent0 diff --git a/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java b/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java index 68406a20e725..317bd96e7df1 100644 --- a/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java +++ b/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java @@ -37,6 
+37,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.actions.SparkActions; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -179,6 +180,7 @@ private void setupSpark() { .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", catalogWarehouse()) + .config(TestBase.DISABLE_UI) .master("local"); spark = builder.getOrCreate(); } diff --git a/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java b/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java index 2ac7c26992e3..073e8c9327df 100644 --- a/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java +++ b/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java @@ -41,6 +41,7 @@ import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.actions.SparkActions; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; @@ -393,6 +394,7 @@ protected void setupSpark() { "spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", getCatalogWarehouse()) + .config(TestBase.DISABLE_UI) .master("local[*]"); spark = builder.getOrCreate(); Configuration sparkHadoopConf = spark.sessionState().newHadoopConf(); diff --git 
a/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java b/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java index 68c537e34a4a..debe37866ff7 100644 --- a/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java +++ b/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java @@ -30,6 +30,7 @@ import org.apache.iceberg.UpdateProperties; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; @@ -94,7 +95,7 @@ protected void cleanupFiles() throws IOException { } protected void setupSpark(boolean enableDictionaryEncoding) { - SparkSession.Builder builder = SparkSession.builder().config("spark.ui.enabled", false); + SparkSession.Builder builder = SparkSession.builder().config(TestBase.DISABLE_UI); if (!enableDictionaryEncoding) { builder .config("parquet.dictionary.page.size", "1") diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java index ea400a779235..cb9da3edc678 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java @@ -56,7 +56,7 @@ public static R withCommitProperties( ExceptionUtil.castAndThrow(e, exClass); return null; } finally { - COMMIT_PROPERTIES.set(ImmutableMap.of()); + COMMIT_PROPERTIES.remove(); } } diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java index fbd21f737450..fec413ca079a 100644 --- 
a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java @@ -41,6 +41,7 @@ import org.apache.spark.sql.types.IntegerType$; import org.apache.spark.sql.types.LongType$; import org.apache.spark.sql.types.MapType; +import org.apache.spark.sql.types.NullType$; import org.apache.spark.sql.types.StringType$; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; @@ -238,5 +239,6 @@ public Type primitive(Type.PrimitiveType primitive) { .put(TypeID.STRING, ImmutableSet.of(StringType$.class)) .put(TypeID.FIXED, ImmutableSet.of(BinaryType$.class)) .put(TypeID.BINARY, ImmutableSet.of(BinaryType$.class)) + .put(TypeID.UNKNOWN, ImmutableSet.of(NullType$.class)) .buildOrThrow(); } diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java index bc8a966488ee..f1709277525a 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java @@ -24,17 +24,17 @@ import java.util.function.Function; import org.apache.spark.sql.connector.catalog.Identifier; import org.apache.spark.sql.connector.catalog.StagedTable; -import org.apache.spark.sql.connector.catalog.SupportsDelete; +import org.apache.spark.sql.connector.catalog.SupportsDeleteV2; import org.apache.spark.sql.connector.catalog.SupportsRead; import org.apache.spark.sql.connector.catalog.SupportsWrite; import org.apache.spark.sql.connector.catalog.Table; import org.apache.spark.sql.connector.catalog.TableCapability; import org.apache.spark.sql.connector.catalog.TableCatalog; import org.apache.spark.sql.connector.expressions.Transform; +import org.apache.spark.sql.connector.expressions.filter.Predicate; import 
org.apache.spark.sql.connector.read.ScanBuilder; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.WriteBuilder; -import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.util.CaseInsensitiveStringMap; @@ -58,7 +58,7 @@ * #capabilities()}. */ public class RollbackStagedTable - implements StagedTable, SupportsRead, SupportsWrite, SupportsDelete { + implements StagedTable, SupportsRead, SupportsWrite, SupportsDeleteV2 { private final TableCatalog catalog; private final Identifier ident; private final Table table; @@ -106,8 +106,8 @@ public Set capabilities() { } @Override - public void deleteWhere(Filter[] filters) { - call(SupportsDelete.class, t -> t.deleteWhere(filters)); + public void deleteWhere(Predicate[] predicates) { + call(SupportsDeleteV2.class, t -> t.deleteWhere(predicates)); } @Override diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java index bad31d8d85f4..78d69eeaaf61 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java @@ -35,8 +35,6 @@ public abstract class SparkContentFile implements ContentFile { - private static final FileContent[] FILE_CONTENT_VALUES = FileContent.values(); - private final int fileContentPosition; private final int filePathPosition; private final int fileFormatPosition; @@ -139,7 +137,7 @@ public FileContent content() { if (wrapped.isNullAt(fileContentPosition)) { return null; } - return FILE_CONTENT_VALUES[wrapped.getInt(fileContentPosition)]; + return FileContent.fromId(wrapped.getInt(fileContentPosition)); } @Override diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java 
b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java index 8beaefc5cc8f..b7ed31c274d7 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java @@ -35,6 +35,7 @@ import org.apache.spark.sql.types.IntegerType; import org.apache.spark.sql.types.LongType; import org.apache.spark.sql.types.MapType; +import org.apache.spark.sql.types.NullType; import org.apache.spark.sql.types.ShortType; import org.apache.spark.sql.types.StringType; import org.apache.spark.sql.types.StructField; @@ -155,6 +156,8 @@ public Type atomic(DataType atomic) { ((DecimalType) atomic).precision(), ((DecimalType) atomic).scale()); } else if (atomic instanceof BinaryType) { return Types.BinaryType.get(); + } else if (atomic instanceof NullType) { + return Types.UnknownType.get(); } throw new UnsupportedOperationException("Not a supported type: " + atomic.catalogString()); diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java index dfb9b30be603..d33632bbbd54 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java @@ -38,6 +38,7 @@ import org.apache.spark.sql.types.MapType$; import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.MetadataBuilder; +import org.apache.spark.sql.types.NullType$; import org.apache.spark.sql.types.StringType$; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType$; @@ -124,9 +125,11 @@ public DataType primitive(Type.PrimitiveType primitive) { case DECIMAL: Types.DecimalType decimal = (Types.DecimalType) primitive; return DecimalType$.MODULE$.apply(decimal.precision(), decimal.scale()); + case UNKNOWN: + return NullType$.MODULE$; default: throw new 
UnsupportedOperationException( - "Cannot convert unknown type to Spark: " + primitive); + "Cannot convert unsupported type to Spark: " + primitive); } } diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java index db359fdd62fc..bf80dcb10b30 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java @@ -24,6 +24,8 @@ import java.nio.ByteBuffer; import java.nio.charset.CharsetEncoder; import java.nio.charset.StandardCharsets; +import java.time.LocalDateTime; +import org.apache.iceberg.util.DateTimeUtil; import org.apache.iceberg.util.ZOrderByteUtils; import org.apache.spark.sql.Column; import org.apache.spark.sql.expressions.UserDefinedFunction; @@ -40,6 +42,7 @@ import org.apache.spark.sql.types.LongType; import org.apache.spark.sql.types.ShortType; import org.apache.spark.sql.types.StringType; +import org.apache.spark.sql.types.TimestampNTZType; import org.apache.spark.sql.types.TimestampType; import scala.collection.JavaConverters; import scala.collection.Seq; @@ -180,6 +183,29 @@ value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) return udf; } + private UserDefinedFunction timestampNtzToOrderedBytesUDF() { + int position = inputCol; + UserDefinedFunction udf = + functions + .udf( + (LocalDateTime value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + long micros = DateTimeUtil.microsFromTimestamp(value); + return ZOrderByteUtils.longToOrderedBytes( + micros, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("TIMESTAMP_NTZ_ORDERED_BYTES"); + + this.inputCol++; + increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); + + return udf; + } + private UserDefinedFunction floatToOrderedBytesUDF() { int position = inputCol; 
UserDefinedFunction udf = @@ -309,6 +335,8 @@ Column sortedLexicographically(Column column, DataType type) { return booleanToOrderedBytesUDF().apply(column); } else if (type instanceof TimestampType) { return longToOrderedBytesUDF().apply(column.cast(DataTypes.LongType)); + } else if (type instanceof TimestampNTZType) { + return timestampNtzToOrderedBytesUDF().apply(column); } else if (type instanceof DateType) { return longToOrderedBytesUDF().apply(column.cast(DataTypes.LongType)); } else { diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java index d74a76f94e87..2a2eef198b76 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java @@ -31,6 +31,7 @@ import org.apache.parquet.schema.Type.Repetition; import org.apache.spark.sql.types.ArrayType; import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.MapType; import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.StructField; @@ -181,21 +182,27 @@ private static T visitField( private static List visitFields( StructType struct, GroupType group, ParquetWithSparkSchemaVisitor visitor) { - StructField[] sFields = struct.fields(); - Preconditions.checkArgument( - sFields.length == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); - for (int i = 0; i < sFields.length; i += 1) { - Type field = group.getFields().get(i); - StructField sField = sFields[i]; - Preconditions.checkArgument( - field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), - "Structs do not match: field %s != %s", - field.getName(), - sField.name()); - 
results.add(visitField(sField, field, visitor)); + + int fieldIndex = 0; + for (StructField sField : struct.fields()) { + if (sField.dataType() != DataTypes.NullType) { + Type field = group.getFields().get(fieldIndex); + Preconditions.checkArgument( + field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), + "Structs do not match: field %s != %s", + field.getName(), + sField.name()); + results.add(visitField(sField, field, visitor)); + + fieldIndex += 1; + } } + // All the group fields should have been visited + Preconditions.checkArgument( + fieldIndex == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); + return results; } diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java index 6b799e677bf4..6fc8849c82b2 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java @@ -20,6 +20,8 @@ import java.io.Serializable; import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; import java.util.stream.Stream; import javax.annotation.Nullable; import org.apache.iceberg.FieldMetrics; @@ -77,7 +79,7 @@ public OrcValueWriter record( TypeDescription record, List names, List> fields) { - return new InternalRowWriter(fields, record.getChildren()); + return new InternalRowWriter(fields, iStruct, record.getChildren()); } @Override @@ -133,12 +135,16 @@ public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, TypeDescriptio private static class InternalRowWriter extends GenericOrcWriters.StructWriter { private final List> fieldGetters; - InternalRowWriter(List> writers, List orcTypes) { - super(writers); + InternalRowWriter( + List> writers, Types.StructType iStruct, List orcTypes) { + super(iStruct, writers); this.fieldGetters = 
Lists.newArrayListWithExpectedSize(orcTypes.size()); - for (TypeDescription orcType : orcTypes) { - fieldGetters.add(createFieldGetter(orcType)); + Map idToType = + orcTypes.stream().collect(Collectors.toMap(ORCSchemaUtil::fieldId, s -> s)); + + for (Types.NestedField iField : iStruct.fields()) { + fieldGetters.add(createFieldGetter(idToType.get(iField.fieldId()))); } } @@ -149,6 +155,11 @@ protected Object get(InternalRow struct, int index) { } static FieldGetter createFieldGetter(TypeDescription fieldType) { + // In the case of an UnknownType + if (fieldType == null) { + return (row, ordinal) -> null; + } + final FieldGetter fieldGetter; switch (fieldType.getCategory()) { case BOOLEAN: diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java index f4ae6114c8ab..a1ed6c66f337 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java @@ -26,6 +26,7 @@ import java.util.NoSuchElementException; import java.util.Optional; import java.util.UUID; +import java.util.stream.IntStream; import org.apache.iceberg.Schema; import org.apache.iceberg.parquet.ParquetValueReaders.ReusableEntry; import org.apache.iceberg.parquet.ParquetValueWriter; @@ -55,6 +56,7 @@ import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.Decimal; import org.apache.spark.sql.types.MapType; +import org.apache.spark.sql.types.NullType; import org.apache.spark.sql.types.ShortType; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; @@ -94,15 +96,18 @@ public ParquetValueWriter message( public ParquetValueWriter struct( StructType sStruct, GroupType struct, List> fieldWriters) { List fields = struct.getFields(); - StructField[] sparkFields = sStruct.fields(); List> writers = 
Lists.newArrayListWithExpectedSize(fieldWriters.size()); - List sparkTypes = Lists.newArrayList(); for (int i = 0; i < fields.size(); i += 1) { writers.add(newOption(struct.getType(i), fieldWriters.get(i))); - sparkTypes.add(sparkFields[i].dataType()); } - return new InternalRowWriter(writers, sparkTypes); + StructField[] sFields = sStruct.fields(); + DataType[] types = new DataType[sFields.length]; + for (int i = 0; i < sFields.length; i += 1) { + types[i] = sFields[i].dataType(); + } + + return new InternalRowWriter(writers, types); } @Override @@ -566,14 +571,33 @@ public Map.Entry next() { private static class InternalRowWriter extends ParquetValueWriters.StructWriter { private final DataType[] types; - private InternalRowWriter(List> writers, List types) { - super(writers); - this.types = types.toArray(new DataType[0]); + private InternalRowWriter(List> writers, DataType[] types) { + super(writerToFieldIndex(types, writers.size()), writers); + this.types = types; } @Override protected Object get(InternalRow struct, int index) { return struct.get(index, types[index]); } + + /** Returns a mapping from writer index to field index, skipping Unknown columns. 
*/ + private static int[] writerToFieldIndex(DataType[] types, int numWriters) { + if (null == types) { + return IntStream.rangeClosed(0, numWriters).toArray(); + } + + // value writer index to record field index + int[] indexes = new int[numWriters]; + int writerIndex = 0; + for (int pos = 0; pos < types.length; pos += 1) { + if (!(types[pos] instanceof NullType)) { + indexes[writerIndex] = pos; + writerIndex += 1; + } + } + + return indexes; + } } } diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java index 8dceb075e604..4f324239881e 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java @@ -465,6 +465,8 @@ public ColumnVector convert( DeletedColumnVector deletedVector = new DeletedColumnVector(field.type()); deletedVector.setValue(new boolean[batchSize]); fieldVectors.add(deletedVector); + } else if (field.type().equals(Types.UnknownType.get())) { + fieldVectors.add(new ConstantColumnVector(field.type(), batchSize, null)); } else { fieldVectors.add( fieldConverters diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java index 353566eb7f34..1348afff6475 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.util.Map; +import java.util.Objects; import java.util.Set; import org.apache.iceberg.BaseMetadataTable; import org.apache.iceberg.BaseTable; @@ -56,6 +57,7 @@ import org.apache.iceberg.spark.CommitMetadata; import 
org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.iceberg.spark.SparkSQLProperties; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkTableUtil; import org.apache.iceberg.spark.SparkUtil; @@ -333,11 +335,31 @@ public boolean canDeleteWhere(Predicate[] predicates) { } } - return canDeleteUsingMetadata(deleteExpr); + return canDeleteUsingMetadata(deleteExpr, scanBranchForDelete()); + } + + // Resolves the branch to scan during canDeleteWhere so it matches the branch deleteWhere + // will commit to. Falls back to main when WAP is configured but the WAP branch does not + // exist yet, since this is a read scan. + private String scanBranchForDelete() { + if (branch != null) { + return branch; + } + + if (!SparkTableUtil.wapEnabled(table())) { + return null; + } + + String wapBranch = sparkSession().conf().get(SparkSQLProperties.WAP_BRANCH, null); + if (wapBranch != null && table().refs().containsKey(wapBranch)) { + return wapBranch; + } + + return null; } // a metadata delete is possible iff matching files can be deleted entirely - private boolean canDeleteUsingMetadata(Expression deleteExpr) { + private boolean canDeleteUsingMetadata(Expression deleteExpr, String scanBranch) { boolean caseSensitive = SparkUtil.caseSensitive(sparkSession()); if (ExpressionUtil.selectsPartitions(deleteExpr, table(), caseSensitive)) { @@ -352,14 +374,14 @@ private boolean canDeleteUsingMetadata(Expression deleteExpr) { .includeColumnStats() .ignoreResiduals(); - if (branch != null) { - scan = scan.useRef(branch); + if (scanBranch != null) { + scan = scan.useRef(scanBranch); } try (CloseableIterable tasks = scan.planFiles()) { Map evaluators = Maps.newHashMap(); StrictMetricsEvaluator metricsEvaluator = - new StrictMetricsEvaluator(SnapshotUtil.schemaFor(table(), branch), deleteExpr); + new StrictMetricsEvaluator(SnapshotUtil.schemaFor(table(), scanBranch), deleteExpr); return Iterables.all( tasks, 
@@ -396,12 +418,13 @@ public void deleteWhere(Predicate[] predicates) { .set("spark.app.id", sparkSession().sparkContext().applicationId()) .deleteFromRowFilter(deleteExpr); + String writeBranch = branch; if (SparkTableUtil.wapEnabled(table())) { - branch = SparkTableUtil.determineWriteBranch(sparkSession(), branch); + writeBranch = SparkTableUtil.determineWriteBranch(sparkSession(), branch); } - if (branch != null) { - deleteFiles.toBranch(branch); + if (writeBranch != null) { + deleteFiles.toBranch(writeBranch); } if (!CommitMetadata.commitProperties().isEmpty()) { @@ -424,15 +447,16 @@ public boolean equals(Object other) { return false; } - // use only name in order to correctly invalidate Spark cache SparkTable that = (SparkTable) other; - return icebergTable.name().equals(that.icebergTable.name()); + return icebergTable.name().equals(that.icebergTable.name()) + && Objects.equals(table().uuid(), that.table().uuid()) + && Objects.equals(snapshotId, that.snapshotId) + && Objects.equals(branch, that.branch); } @Override public int hashCode() { - // use only name in order to correctly invalidate Spark cache - return icebergTable.name().hashCode(); + return Objects.hash(icebergTable.name(), table().uuid(), snapshotId, branch); } private static CaseInsensitiveStringMap addSnapshotId( diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java index e608a40b72ad..df4566da0c90 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java @@ -27,27 +27,27 @@ import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.spark.SparkFilters; import 
org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkUtil; +import org.apache.iceberg.spark.SparkV2Filters; import org.apache.iceberg.spark.SparkWriteConf; import org.apache.iceberg.spark.SparkWriteRequirements; import org.apache.iceberg.types.TypeUtil; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.connector.expressions.filter.Predicate; import org.apache.spark.sql.connector.read.Scan; import org.apache.spark.sql.connector.write.BatchWrite; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.RowLevelOperation.Command; import org.apache.spark.sql.connector.write.SupportsDynamicOverwrite; -import org.apache.spark.sql.connector.write.SupportsOverwrite; +import org.apache.spark.sql.connector.write.SupportsOverwriteV2; import org.apache.spark.sql.connector.write.Write; import org.apache.spark.sql.connector.write.WriteBuilder; import org.apache.spark.sql.connector.write.streaming.StreamingWrite; -import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.types.LongType$; import org.apache.spark.sql.types.StructType; -class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, SupportsOverwrite { +class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, SupportsOverwriteV2 { private final SparkSession spark; private final Table table; private final SparkWriteConf writeConf; @@ -100,12 +100,12 @@ public WriteBuilder overwriteDynamicPartitions() { } @Override - public WriteBuilder overwrite(Filter[] filters) { + public WriteBuilder overwrite(Predicate[] predicates) { Preconditions.checkState( !overwriteFiles, "Cannot overwrite individual files and using filters"); Preconditions.checkState(rewrittenFileSetId == null, "Cannot overwrite and rewrite"); - this.overwriteExpr = SparkFilters.convert(filters); + this.overwriteExpr = SparkV2Filters.convert(predicates); if (overwriteExpr == Expressions.alwaysTrue() && 
"dynamic".equals(overwriteMode)) { // use the write option to override truncating the table. use dynamic overwrite instead. this.overwriteDynamic = true; diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java index 404ba7284606..9b08d6f7ab1e 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.BeforeEach; @@ -90,6 +91,7 @@ protected static SparkSession initSpark(String serializer) { .master("local[2]") .config("spark.serializer", serializer) .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java index 9361c63176e0..8eeb55171dbe 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -69,6 +70,7 @@ public static void startSpark() { .master("local[2]") 
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java index a218f965ea65..eae640528f9e 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java @@ -23,6 +23,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -62,6 +63,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java index acd4688440d1..6ad0907fffed 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -59,6 +60,7 @@ public static void startSpark() { 
.master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java new file mode 100644 index 000000000000..ee1f29e56fb3 --- /dev/null +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark; + +import com.codahale.metrics.MetricRegistry; +import java.util.Properties; +import org.apache.spark.SparkConf; +import org.apache.spark.metrics.sink.MetricsServlet; +import org.sparkproject.jetty.servlet.ServletContextHandler; + +/** + * A dummy implementation of {@link MetricsServlet} that does not start a server or report metrics. + * This is used in tests to avoid conflicts with Spark's jetty dependencies. + */ +public class DummyMetricsServlet extends MetricsServlet { + + /** + * Constructor required by Spark's reflection-based instantiation. 
+ * + * @param properties Metrics properties + * @param registry Metric registry + */ + public DummyMetricsServlet(Properties properties, MetricRegistry registry) { + super(properties, registry); + } + + @Override + public ServletContextHandler[] getHandlers(SparkConf conf) { + return new ServletContextHandler[] {}; + } + + @Override + public void start() { + // No-op for tests + } + + @Override + public void stop() { + // No-op for tests + } + + @Override + public void report() { + // No-op for tests + } +} diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/TestBase.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/TestBase.java index 3c32b4693684..b89109174d90 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/TestBase.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/TestBase.java @@ -65,6 +65,13 @@ public abstract class TestBase extends SparkTestHelperBase { protected static SparkSession spark = null; protected static JavaSparkContext sparkContext = null; protected static HiveCatalog catalog = null; + // disable Spark UI and use dummy servlet to avoid dependency conflicts with Spark's Jetty version + public static final Map DISABLE_UI = + ImmutableMap.of( + "spark.ui.enabled", + "false", + "spark.metrics.conf.*.sink.servlet.class", + "org.apache.iceberg.spark.DummyMetricsServlet"); @BeforeAll public static void startMetastoreAndSpark() { @@ -79,6 +86,7 @@ public static void startMetastoreAndSpark() { .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .config(DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java index 9b5b207a5b6b..0846cf6f1161 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java @@ -24,9 +24,11 @@ import java.util.List; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.spark.sql.catalyst.expressions.AttributeReference; import org.apache.spark.sql.catalyst.expressions.MetadataAttribute; +import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.Test; @@ -79,4 +81,18 @@ public void testSchemaConversionWithMetaDataColumnSchema() { } } } + + @Test + public void testUnknownTypeToSpark() { + Schema schema = new Schema(optional(1, "col", Types.UnknownType.get())); + StructType sparkType = SparkSchemaUtil.convert(schema); + assertThat(sparkType.fields()[0].dataType()).isEqualTo(DataTypes.NullType); + } + + @Test + public void testNullTypeToIceberg() { + StructType sparkType = new StructType().add("col", DataTypes.NullType, true); + Type icebergType = SparkSchemaUtil.convert(sparkType).findField("col").type(); + assertThat(icebergType).isEqualTo(Types.UnknownType.get()); + } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java index dd751499df30..9b0fecdaae41 100644 --- 
a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java @@ -667,7 +667,7 @@ public void schemaEvolutionTestWithSparkSQL() throws Exception { "CAST(id AS FLOAT) col1", "CAST(id AS STRING) col2", "CAST(id AS INT) col3") - .registerTempTable("tempdata"); + .createOrReplaceTempView("tempdata"); sql("INSERT INTO TABLE %s SELECT * FROM tempdata", tblName); List expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName); List expectedAfterAddColumn = diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index 411b7e78116f..6abce5b24da0 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -22,6 +22,7 @@ import static org.apache.iceberg.data.FileHelpers.encrypt; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import static org.apache.spark.sql.functions.col; import static org.apache.spark.sql.functions.current_date; import static org.apache.spark.sql.functions.date_add; import static org.apache.spark.sql.functions.expr; @@ -127,6 +128,7 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; import org.apache.spark.sql.internal.SQLConf; +import org.apache.spark.sql.types.DataTypes; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.TestTemplate; @@ -2573,6 +2575,23 @@ public void testExecutorCacheForDeleteFilesDisabled() { .isFalse(); } + @TestTemplate + public void testZOrderUDFWithTimestampNTZType() { + SparkZOrderUDF zorderUDF = new 
SparkZOrderUDF(1, 16, 1024); + Dataset result = + spark + .sql("SELECT timestamp_ntz '2025-01-01 12:00:00' as test_col") + .withColumn( + "zorder_result", + zorderUDF.sortedLexicographically(col("test_col"), DataTypes.TimestampNTZType)); + + assertThat(result.schema().apply("zorder_result").dataType()).isEqualTo(DataTypes.BinaryType); + List rows = result.collectAsList(); + Row row = rows.get(0); + byte[] zorderBytes = row.getAs("zorder_result"); + assertThat(zorderBytes).isNotNull().isNotEmpty(); + } + private double percentFilesRequired(Table table, String col, String value) { return percentFilesRequired(table, new String[] {col}, new String[] {value}); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTestBase.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTestBase.java index 0db6a65fd394..45053c1a4f1f 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTestBase.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTestBase.java @@ -32,6 +32,7 @@ import java.util.Map; import java.util.UUID; import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; @@ -108,8 +109,8 @@ protected boolean supportsRowLineage() { required(114, "dec_9_0", Types.DecimalType.of(9, 0)), // int encoded required(115, "dec_11_2", Types.DecimalType.of(11, 2)), // long encoded required(116, "dec_20_5", Types.DecimalType.of(20, 5)), // requires padding - required(117, "dec_38_10", Types.DecimalType.of(38, 10)) // Spark's maximum precision - ); + required(117, "dec_38_10", Types.DecimalType.of(38, 10)), // Spark's maximum precision + optional(118, "unk", Types.UnknownType.get())); @TempDir protected Path temp; @@ -120,10 +121,13 @@ public void testSimpleStruct() throws IOException { @Test public void testStructWithRequiredFields() throws 
IOException { + List supportedPrimitives = + SUPPORTED_PRIMITIVES.fields().stream() + .filter(f -> f.type().typeId() != Type.TypeID.UNKNOWN) + .collect(Collectors.toList()); writeAndValidate( TypeUtil.assignIncreasingFreshIds( - new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asRequired)))); + new Schema(Lists.transform(supportedPrimitives, Types.NestedField::asRequired)))); } @Test @@ -603,4 +607,48 @@ public void testRowLineage() throws Exception { record.copy(Map.of("id", 4L, "data", "d", "_row_id", 1_001L)), record.copy(Map.of("id", 5L, "data", "e")))); } + + @Test + public void testUnknownNestedLevel() throws IOException { + assumeThat(supportsNestedTypes()).isTrue(); + + Schema schema = + new Schema( + required(1, "id", LongType.get()), + optional( + 2, + "nested", + Types.StructType.of( + required(20, "int", Types.IntegerType.get()), + optional(21, "unk", Types.UnknownType.get())))); + + writeAndValidate(schema); + } + + @Test + public void testUnknownListType() throws IOException { + assumeThat(supportsNestedTypes()).isTrue(); + + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, Types.UnknownType.get()))); + + writeAndValidate(schema); + } + + @Test + public void testUnknownMapType() throws IOException { + assumeThat(supportsNestedTypes()).isTrue(); + + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional(2, 3, Types.StringType.get(), Types.UnknownType.get()))); + + writeAndValidate(schema); + } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java index c18e4c053f50..291bb2bca4f5 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java +++ 
b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java @@ -25,6 +25,8 @@ import org.apache.iceberg.data.BaseFormatModelTests; import org.apache.iceberg.data.Record; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.SparkUtil; +import org.apache.iceberg.types.Type; import org.apache.spark.sql.catalyst.InternalRow; public class TestSparkFormatModel extends BaseFormatModelTests { @@ -51,4 +53,9 @@ protected void assertEquals(Schema schema, List expected, List expect private Iterator batchesToRows(Iterator batches) { return Iterators.concat(Iterators.transform(batches, ColumnarBatch::rowIterator)); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create ListType with unknown element type"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create MapType with unknown value type"); + } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java index 9ae8b8cbe530..993dc868bba8 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java @@ -249,4 +249,20 @@ public void testMissingRequiredWithoutDefault() { .isInstanceOf(IllegalArgumentException.class) .hasMessage("Missing required field: missing_str"); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert element Parquet: unknown"); + } + + @Test + @Override + 
public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert value Parquet: unknown"); + } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java index 8e1f860085c6..3c88db139e47 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java @@ -20,6 +20,7 @@ import static org.apache.iceberg.types.Types.NestedField.required; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; import java.io.IOException; @@ -152,4 +153,20 @@ private static void assertEqualsUnsafe( .isFalse(); assertThat(actualIter.hasNext()).as("Actual iterator should not have any extra rows").isFalse(); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create ListType with unknown element type"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create MapType with unknown value type"); + } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java index 6647a1b483e0..91d07e3647c9 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java @@ -37,6 +37,7 @@ import 
org.apache.iceberg.TableProperties; import org.apache.iceberg.hadoop.HadoopTables; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.AvroDataTestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.TestHelpers; @@ -62,6 +63,7 @@ public static void startSpark() { SparkSession.builder() .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) .master("local[2]") + .config(TestBase.DISABLE_UI) .getOrCreate(); ScanTestBase.sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); } @@ -93,14 +95,11 @@ protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throw HadoopTables tables = new HadoopTables(CONF); // If V3 spec features are used, set the format version to 3 - Map tableProperties = - writeSchema.columns().stream() - .anyMatch(f -> f.initialDefaultLiteral() != null || f.writeDefaultLiteral() != null) - ? ImmutableMap.of(TableProperties.FORMAT_VERSION, "3") - : ImmutableMap.of(); + Map tableProperties = ImmutableMap.of(TableProperties.FORMAT_VERSION, "3"); Table table = tables.create( writeSchema, PartitionSpec.unpartitioned(), tableProperties, location.toString()); + configureTable(table); // Important: use the table's schema for the rest of the test // When tables are created, the column ids are reassigned. 
diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java index 308b1bd2c646..cfc38ed66fac 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java @@ -58,6 +58,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.GenericsHelpers; import org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Types; @@ -125,6 +126,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); // define UDFs used by partition tests diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java index c03f7b94eca9..dcd9c2897e08 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java @@ -46,6 +46,7 @@ import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.types.Types; @@ -99,6 +100,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", 
InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java index f4f57157e479..a637b975fe2b 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java @@ -28,6 +28,7 @@ import java.sql.Timestamp; import java.util.List; import org.apache.iceberg.spark.IcebergSpark; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Row; @@ -51,6 +52,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestORCDataFrameWrite.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestORCDataFrameWrite.java index 35be6423ee23..892e260f66f0 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestORCDataFrameWrite.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestORCDataFrameWrite.java @@ -18,9 +18,13 @@ */ package org.apache.iceberg.spark.source; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + import org.apache.iceberg.FileFormat; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; +import org.apache.spark.SparkException; +import org.junit.jupiter.api.Test; public class TestORCDataFrameWrite extends DataFrameWriteTestBase { @Override @@ -30,4 +34,24 @@ protected void configureTable(Table table) { .set(TableProperties.DEFAULT_FILE_FORMAT, FileFormat.ORC.toString()) .commit(); } + + @Test + 
@Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(SparkException.class) + .cause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create ListType with unknown element type"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(SparkException.class) + .cause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create MapType with unknown value type"); + } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetDataFrameWrite.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetDataFrameWrite.java index 90a9ac48a486..c24d92ef30af 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetDataFrameWrite.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetDataFrameWrite.java @@ -18,9 +18,13 @@ */ package org.apache.iceberg.spark.source; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + import org.apache.iceberg.FileFormat; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; +import org.apache.spark.SparkException; +import org.junit.jupiter.api.Test; public class TestParquetDataFrameWrite extends DataFrameWriteTestBase { @Override @@ -30,4 +34,24 @@ protected void configureTable(Table table) { .set(TableProperties.DEFAULT_FILE_FORMAT, FileFormat.PARQUET.toString()) .commit(); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(SparkException.class) + .cause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert element Parquet: unknown"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(SparkException.class) + .cause() + 
.isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert value Parquet: unknown"); + } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java index 6b9ec85b7f0b..6056f1a7929d 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java @@ -19,6 +19,7 @@ package org.apache.iceberg.spark.source; import static org.apache.iceberg.Files.localOutput; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.assertj.core.api.Assumptions.assumeThat; import java.io.File; @@ -37,6 +38,7 @@ import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; public class TestParquetScan extends ScanTestBase { protected boolean vectorized() { @@ -84,4 +86,20 @@ protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throw super.writeAndValidate(writeSchema, expectedSchema); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert element Parquet: unknown"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert value Parquet: unknown"); + } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java index becb9dcb4aca..cf3097ebdb30 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java +++ 
b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java @@ -59,6 +59,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Types; import org.apache.spark.api.java.JavaRDD; @@ -118,6 +119,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestPartitionPruning.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java index b0ad930487b1..82575a720236 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java @@ -46,6 +46,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.types.Types; @@ -112,6 +113,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java index 
11865db7fce5..fe754f4a02ba 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java @@ -42,6 +42,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -91,6 +92,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java index 3c00835da382..f56333649261 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java @@ -59,6 +59,7 @@ import org.apache.iceberg.spark.SparkDataFile; import org.apache.iceberg.spark.SparkDeleteFile; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types; @@ -125,6 +126,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestSparkDataFile.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java 
b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java index 1957f258e1ed..439c4443b990 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java @@ -51,6 +51,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.SnapshotUtil; import org.apache.spark.sql.Dataset; @@ -99,6 +100,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java index 8ccea303d0c1..de6a5e59029c 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java @@ -50,6 +50,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkValueConverter; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; @@ -88,6 +89,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); ImmutableMap config = ImmutableMap.of( diff --git 
a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java index eff032743e3b..33b5a1d6e600 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java @@ -71,6 +71,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkStructLike; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.SparkParquetWriters; import org.apache.iceberg.spark.source.metrics.NumDeletes; @@ -132,6 +133,7 @@ public static void startMetastoreAndSpark() { .config("spark.ui.liveUpdate.period", 0) .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java index d22ecb02d483..cb2f866fab10 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java @@ -64,6 +64,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkValueConverter; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.PropertyUtil; import org.apache.spark.sql.Dataset; @@ -182,6 +183,7 @@ public static void startMetastoreAndSpark() { SparkSession.builder() .master("local[2]") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java index d14b1a52cf82..e3934faa60ce 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java @@ -20,7 +20,9 @@ import static org.assertj.core.api.Assertions.assertThat; +import org.apache.iceberg.HistoryEntry; import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Table; import org.apache.iceberg.spark.CatalogTestBase; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; import org.apache.spark.sql.connector.catalog.CatalogManager; @@ -56,4 +58,58 @@ public void testTableEquality() throws NoSuchTableException { assertThat(table1).as("References must be different").isNotSameAs(table2); assertThat(table1).as("Tables must be equivalent").isEqualTo(table2); } + + @TestTemplate + public void testTableInequalityWithDifferentSnapshots() throws NoSuchTableException { + sql("INSERT INTO %s VALUES (1, 'a')", tableName); + sql("INSERT INTO %s VALUES (2, 'b')", tableName); + + CatalogManager catalogManager = spark.sessionState().catalogManager(); + TableCatalog catalog = (TableCatalog) catalogManager.catalog(catalogName); + Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name()); + SparkTable table = (SparkTable) catalog.loadTable(identifier); + + Table icebergTable = validationCatalog.loadTable(tableIdent); + long[] snapshotIds = + icebergTable.history().stream().mapToLong(HistoryEntry::snapshotId).toArray(); + + SparkTable tableAtSnapshot1 = table.copyWithSnapshotId(snapshotIds[0]); + SparkTable tableAtSnapshot2 = table.copyWithSnapshotId(snapshotIds[1]); + + 
assertThat(tableAtSnapshot1) + .as("Tables at different snapshots must not be equal") + .isNotEqualTo(tableAtSnapshot2); + assertThat(tableAtSnapshot1.hashCode()) + .as("Hash codes should differ for different snapshots") + .isNotEqualTo(tableAtSnapshot2.hashCode()); + } + + @TestTemplate + public void testTableInequalityWithDifferentBranches() throws NoSuchTableException { + sql("INSERT INTO %s VALUES (1, 'a')", tableName); + + CatalogManager catalogManager = spark.sessionState().catalogManager(); + TableCatalog catalog = (TableCatalog) catalogManager.catalog(catalogName); + Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name()); + + Table icebergTable = validationCatalog.loadTable(tableIdent); + icebergTable + .manageSnapshots() + .createBranch("testBranch", icebergTable.currentSnapshot().snapshotId()) + .commit(); + + // reload after branch creation so the table sees the new ref + SparkTable table = (SparkTable) catalog.loadTable(identifier); + table.table().refresh(); + + SparkTable tableOnMain = table.copyWithBranch("main"); + SparkTable tableOnBranch = table.copyWithBranch("testBranch"); + + assertThat(tableOnMain) + .as("Tables on different branches must not be equal") + .isNotEqualTo(tableOnBranch); + assertThat(tableOnMain.hashCode()) + .as("Hash codes should differ for different branches") + .isNotEqualTo(tableOnBranch.hashCode()); + } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java index dc4fc7e187fb..a974b58a9714 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java @@ -34,6 +34,7 @@ import org.apache.iceberg.Table; import org.apache.iceberg.hadoop.HadoopTables; import 
org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; @@ -69,6 +70,7 @@ public static void startSpark() { .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) .config("spark.sql.shuffle.partitions", 4) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java index c3fac70dd3fc..45ff9184566b 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java @@ -41,6 +41,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; @@ -84,6 +85,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestWriteMetricsConfig.sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java index 5ce56b4feca7..946456fe2be8 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java 
@@ -63,6 +63,7 @@ public static void startMetastoreAndSpark() { SparkSession.builder() .master("local[2]") .config("spark.sql.iceberg.aggregate_pushdown", "true") + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.5/build.gradle b/spark/v3.5/build.gradle index 143297a6079a..f14553c9388d 100644 --- a/spark/v3.5/build.gradle +++ b/spark/v3.5/build.gradle @@ -117,14 +117,10 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') - testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - testImplementation libs.sqlite.jdbc + testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts') + testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.awaitility testImplementation(testFixtures(project(':iceberg-parquet'))) - // runtime dependencies for running REST Catalog based integration test - testRuntimeOnly libs.jetty.servlet } test { @@ -185,13 +181,7 @@ project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVer testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') testImplementation project(path: ":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}", configuration: 'testArtifacts') - testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - // runtime dependencies for running REST Catalog based integration test - testRuntimeOnly libs.jetty.servlet - testRuntimeOnly libs.sqlite.jdbc 
- + testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.avro.avro testImplementation libs.parquet.hadoop testImplementation libs.awaitility @@ -290,11 +280,7 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio integrationRuntimeOnly project(':iceberg-hive-metastore') // runtime dependencies for running REST Catalog based integration test integrationRuntimeOnly project(path: ':iceberg-core', configuration: 'testArtifacts') - integrationRuntimeOnly (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - integrationRuntimeOnly libs.jetty.servlet - integrationRuntimeOnly libs.sqlite.jdbc + integrationRuntimeOnly (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) // Not allowed on our classpath, only the runtime jar is allowed @@ -360,5 +346,7 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java index 8b0b05911f66..242ef7439a39 100644 --- a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java +++ b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java @@ -31,6 +31,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions; import org.apache.iceberg.util.ThreadPools; import org.apache.spark.sql.SparkSession; @@ -205,7 +206,7 @@ private void initDataAndDVs() { private void setupSpark() { 
this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java index d7f285288004..03e0410c0adc 100644 --- a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java +++ b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java @@ -155,7 +155,7 @@ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java index 1d51350487c4..5cd8143f17bf 100644 --- a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java +++ b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java @@ -215,7 +215,7 @@ public void localPlanningViaDistributedScanWithoutFilterWithStats(Blackhole blac private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 
.config("spark.driver.maxResultSize", "8G") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) diff --git a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java index ad78205ce98c..a77c130ee17a 100644 --- a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java +++ b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java @@ -199,7 +199,7 @@ private void initDataAndDeletes() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java index d917eae5eb0f..caa23625fc44 100644 --- a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java +++ b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java @@ -138,7 +138,7 @@ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git 
a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java index bfcb5af235d3..ef4f0090292c 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java @@ -49,7 +49,12 @@ public class TestExtendedParser { @BeforeAll public static void before() { - spark = SparkSession.builder().master("local").appName("TestExtendedParser").getOrCreate(); + spark = + SparkSession.builder() + .master("local") + .appName("TestExtendedParser") + .config(TestBase.DISABLE_UI) + .getOrCreate(); } @AfterAll diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java index 4c1a5095916c..834640e24328 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java @@ -59,6 +59,7 @@ public static void startMetastoreAndSpark() { .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") .config( SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), String.valueOf(RANDOM.nextBoolean())) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java index f7ded0c4d7d2..d39dff060c9a 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java +++ 
b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java @@ -162,6 +162,25 @@ public synchronized void testDeleteWithConcurrentTableRefresh() throws Exception assertThat(executorService.awaitTermination(2, TimeUnit.MINUTES)).as("Timeout").isTrue(); } + @TestTemplate + public void testCopyOnWriteDeleteSetsSortOrderIdOnRewrittenDataFiles() { + createAndInitTable( + "id INT, dep STRING", + "PARTITIONED BY (dep)", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + sql("DELETE FROM %s WHERE id = 1", commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("Rewritten data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + @TestTemplate public void testRuntimeFilteringWithPreservedDataGrouping() throws NoSuchTableException { createAndInitPartitionedTable(); diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java index fef8b28c689a..394dbbda1a3d 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java @@ -151,6 +151,34 @@ public synchronized void testMergeWithConcurrentTableRefresh() throws Exception assertThat(executorService.awaitTermination(2, TimeUnit.MINUTES)).as("Timeout").isTrue(); } + @TestTemplate + public void testCopyOnWriteMergeSetsSortOrderIdOnRewrittenDataFiles() { + createAndInitTable("id INT, dep STRING"); + sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); + 
sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + createBranchIfNeeded(); + + createOrReplaceView("source", Collections.singletonList(1), Encoders.INT()); + + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.value " + + "WHEN MATCHED THEN " + + " UPDATE SET dep = 'changed' " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, dep) VALUES (s.value, 'new')", + commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("Rewritten data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + @TestTemplate public void testRuntimeFilteringWithReportedPartitioning() { createAndInitTable("id INT, dep STRING"); diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java index 21d1377b2b98..b547218acbd4 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java @@ -149,6 +149,25 @@ public synchronized void testUpdateWithConcurrentTableRefresh() throws Exception assertThat(executorService.awaitTermination(2, TimeUnit.MINUTES)).as("Timeout").isTrue(); } + @TestTemplate + public void testCopyOnWriteUpdateSetsSortOrderIdOnRewrittenDataFiles() { + createAndInitTable( + "id INT, dep STRING", + "PARTITIONED BY (dep)", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + sql("UPDATE %s SET dep = 'changed' WHERE id = 1", commitTarget()); + + Table table = 
validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("Rewritten data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + @TestTemplate public void testRuntimeFilteringWithReportedPartitioning() { createAndInitTable("id INT, dep STRING"); diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java index fbf6ce3559a7..79d6bea12f67 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java @@ -1422,6 +1422,62 @@ public void testDeleteToCustomWapBranchWithoutWhereClause() throws NoSuchTableEx }); } + @TestTemplate + public void testDeleteToWapBranchCanDeleteWhereScansWapBranch() throws NoSuchTableException { + assumeThat(branch).as("WAP branch only works for table identifier without branch").isNull(); + + createAndInitPartitionedTable(); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'true')", + tableName, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED); + + append(tableName, new Employee(1, "hr")); + + spark.conf().set(SparkSQLProperties.WAP_BRANCH, "wap"); + try { + append(tableName, new Employee(0, "hr"), new Employee(1, "hr"), new Employee(2, "hr")); + + sql("DELETE FROM %s WHERE id = 1", tableName); + + assertThat(sql("SELECT id, dep FROM %s.branch_wap ORDER BY id", tableName)) + .as("DELETE should remove the matching rows from the WAP branch") + .containsExactly(row(0, "hr"), row(2, "hr")); + assertThat(sql("SELECT id, dep FROM %s.branch_main", tableName)) + .as("Main branch must not be modified by a WAP-targeted DELETE") + .containsExactly(row(1, "hr")); + } finally { + 
spark.conf().unset(SparkSQLProperties.WAP_BRANCH); + } + } + + @TestTemplate + public void testMetadataDeleteToWapBranchCommitsToWapBranch() throws NoSuchTableException { + assumeThat(branch).as("WAP branch only works for table identifier without branch").isNull(); + + createAndInitPartitionedTable(); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'true')", + tableName, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED); + + append(tableName, new Employee(1, "hr"), new Employee(5, "eng")); + + spark.conf().set(SparkSQLProperties.WAP_BRANCH, "wap"); + try { + append(tableName, new Employee(0, "hr"), new Employee(2, "eng")); + + sql("DELETE FROM %s WHERE dep = 'hr'", tableName); + + assertThat(sql("SELECT id, dep FROM %s.branch_wap ORDER BY id", tableName)) + .as("Metadata delete should remove the hr partition on the WAP branch") + .containsExactly(row(2, "eng"), row(5, "eng")); + assertThat(sql("SELECT id, dep FROM %s.branch_main ORDER BY id", tableName)) + .as("Metadata delete must not commit to main when WAP is set") + .containsExactly(row(1, "hr"), row(5, "eng")); + } finally { + spark.conf().unset(SparkSQLProperties.WAP_BRANCH); + } + } + @TestTemplate public void testDeleteWithFilterOnNestedColumn() { createAndInitNestedColumnsTable(); diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java index 737f19e86a95..9a42b58e3434 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java @@ -26,6 +26,7 @@ import java.util.Set; import java.util.stream.Collectors; import java.util.stream.IntStream; +import org.apache.iceberg.DataFile; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileFormat; import 
org.apache.iceberg.ParameterizedTestExtension; @@ -136,6 +137,34 @@ public void testMergeWithDVAndHistoricalPositionDeletes() { assertThat(dvs).allMatch(dv -> FileFormat.fromFileName(dv.location()) == FileFormat.PUFFIN); } + @TestTemplate + public void testMergeOnReadMergeSetsSortOrderIdOnNewDataFiles() { + createAndInitTable( + "id INT, dep STRING", + "PARTITIONED BY (dep)", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + createOrReplaceView("source", ImmutableList.of(1, 3), Encoders.INT()); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.value " + + "WHEN MATCHED THEN " + + " UPDATE SET id = id + 10 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, dep) VALUES (s.value, 'hr')", + commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("All new data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + private void checkMergeDeleteGranularity(DeleteGranularity deleteGranularity) { createTableWithDeleteGranularity( "id INT, dep STRING", "PARTITIONED BY (dep)", deleteGranularity); diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java index 2398bc45b19b..d1c336d5ddeb 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import org.apache.iceberg.DataFile; import org.apache.iceberg.DeleteFile; import 
org.apache.iceberg.FileFormat; import org.apache.iceberg.ParameterizedTestExtension; @@ -224,6 +225,25 @@ public void testUpdateWithDVAndHistoricalPositionDeletes() { assertThat(dvs).allMatch(dv -> FileFormat.fromFileName(dv.location()) == FileFormat.PUFFIN); } + @TestTemplate + public void testMergeOnReadUpdateSetsSortOrderIdOnNewDataFiles() { + createAndInitTable( + "id INT, dep STRING", + "PARTITIONED BY (dep)", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + sql("UPDATE %s SET id = id + 10 WHERE id = 1", commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("All new data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + private void initTable(String partitionedBy, DeleteGranularity deleteGranularity) { createTableWithDeleteGranularity("id INT, dep STRING", partitionedBy, deleteGranularity); diff --git a/spark/v3.5/spark-runtime/LICENSE b/spark/v3.5/spark-runtime/LICENSE index a67296eb412c..50c91faf8edb 100644 --- a/spark/v3.5/spark-runtime/LICENSE +++ b/spark/v3.5/spark-runtime/LICENSE @@ -219,6 +219,94 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). 
+ +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). 
+ +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. 
Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles Apache Parquet. Copyright: 2014-2024 The Apache Software Foundation @@ -227,7 +315,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Thrift. +This product bundles Apache Thrift (bundled by Parquet). Copyright: 2006-2017 The Apache Software Foundation. Project URL: https://thrift.apache.org/ @@ -235,7 +323,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product includes code from Daniel Lemire's JavaFastPFOR project. +This product includes code from Daniel Lemire's JavaFastPFOR project (bundled by Parquet). 
Copyright: 2013 Daniel Lemire Project URL: https://github.com/lemire/JavaFastPFOR @@ -243,7 +331,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles fastutil. +This product bundles fastutil (bundled by Parquet). Copyright: 2002-2014 Sebastiano Vigna Project URL: http://fastutil.di.unimi.it/ @@ -251,6 +339,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles Zero-Allocation Hashing (bundled by Parquet). + +Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles Apache ORC. Copyright: 2013 and onwards The Apache Software Foundation. @@ -259,7 +354,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Hive. +This product bundles Apache Hive's Storage API (bundled by ORC). Copyright: 2008-2020 The Apache Software Foundation Project URL: https://hive.apache.org/ @@ -267,11 +362,12 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google protobuf. +This product bundles Google protobuf (bundled by ORC). Copyright: 2008 Google Inc. Project URL: https://developers.google.com/protocol-buffers License: BSD 3-Clause + | Copyright 2008 Google Inc. All rights reserved. 
| | Redistribution and use in source and binary forms, with or without @@ -339,6 +435,387 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles the Mozilla Public Suffix List, distributed by Apache HttpComponents. + +Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ + +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. "Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. "Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. 
"Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. "Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. "Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. 
Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. 
Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. 
Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. 
Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. 
However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. 
The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. 
Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. 
Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. + +-------------------------------------------------------------------------------- + This product bundles Google Error Prone Annotations. Copyright: Copyright 2011-2019 The Error Prone Authors @@ -352,6 +829,7 @@ This product bundles checkerframework checker-qual. Copyright: 2004-2019 the Checker Framework developers Project URL: https://github.com/typetools/checker-framework License: MIT license + | The annotations are licensed under the MIT License. (The text of this | license appears below.) More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -390,20 +868,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Perfmark. 
- -Project URL: https://github.com/perfmark/perfmark -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles org.json. - -Project URL: https://github.com/douglascrockford/JSON-java -License: Public Domain - https://github.com/stleary/JSON-java/blob/master/LICENSE - --------------------------------------------------------------------------------- - This product bundles Apache Arrow. Copyright: 2016-2019 The Apache Software Foundation. @@ -420,47 +884,18 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google FlatBuffers. +This product bundles JCTools (via Netty). -Copyright: 2013-2020 Google Inc. -Project URL: https://google.github.io/flatbuffers/ +Project URL: https://github.com/JCTools/JCTools License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles ThreeTen BP. +This product bundles Google FlatBuffers. -Project URL: https://www.threeten.org/threetenbp -License: BSD 3-Clause -| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| -| All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. 
-| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Copyright: 2013-2020 Google Inc. +Project URL: https://google.github.io/flatbuffers/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -469,6 +904,7 @@ This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. Project URL: https://www.threeten.org/threeten-extra/ License: BSD 3-Clause + | All rights reserved. | | * Redistribution and use in source and binary forms, with or without @@ -540,19 +976,11 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache HttpComponents (client and core). - -Copyright: 1999-2022 The Apache Software Foundation. 
-Project URL: https://hc.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Apache HttpComponents Client. +This product bundles and includes code from Apache HttpComponents (core/client). * retry and error handling logic in ExponentialHttpRequestRetryStrategy.java -Copyright: 1999-2022 The Apache Software Foundation. +Copyright: 1999-2022 The Apache Software Foundation Project URL: https://hc.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 @@ -573,16 +1001,46 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Datasketches. +This product bundles Eclipse Collections. -Project URL: https://datasketches.apache.org -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +Project URL: https://github.com/eclipse-collections/eclipse-collections +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php + +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| * Neither the name of the Eclipse Foundation, Inc. 
nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- -This product bundles Zero-Allocation Hashing. +This product bundles Apache Datasketches. -Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing +Project URL: https://datasketches.apache.org License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -590,70 +1048,37 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles JSpecify. 
- -Project URL: https://github.com/jspecify/jspecify -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Animal Sniffer Annotations. - -License: MIT -| The MIT License -| -| Copyright (c) 2009 codehaus.org. -| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This product bundles Android Annotations. - -Project URL: http://source.android.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Conscrypt (openjdk-uber). 
- -Project URL: https://conscrypt.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud BigQuery Client for Java. - -Project URL: https://github.com/googleapis/java-bigquery -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud Java Client Libraries. - -Project URL: https://github.com/googleapis/google-cloud-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| * Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- This product bundles RoaringBitmap. @@ -661,128 +1086,3 @@ This product bundles RoaringBitmap. Copyright: (c) 2013-... the RoaringBitmap authors Project URL: https://github.com/RoaringBitmap/RoaringBitmap License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Credentials for Java. - -Project URL: https://github.com/aliyun/credentials-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Tea for Java. - -Project URL: https://github.com/aliyun/tea-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google GAX. - -Project URL: https://github.com/googleapis/gax-java -License: BSD 3-Clause -| Copyright 2016, Google Inc. All rights reserved. 
-| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This product bundles Google Gson. - -Project URL: https://github.com/google/gson/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles gRPC. 
- -Project URL: https://github.com/grpc/grpc-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenCensus. - -Project URL: https://github.com/census-instrumentation/opencensus-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OkHttp. - -Project URL: https://github.com/square/okhttp -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Auto Value. - -Project URL: https://github.com/google/auto/tree/master/value -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JaCoCo. - -Project URL: https://github.com/jacoco/jacoco/ -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles OpenTelemetry. - -Project URL: https://opentelemetry.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Kotlin. - -Project URL: https://github.com/JetBrains/kotlin -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JAXB. 
- -Project URL: http://jaxb.java.net/ -License: CDDL 1.1 - https://glassfish.java.net/public/CDDL+GPL_1_1.html - --------------------------------------------------------------------------------- - -This product bundles EMMA runtime. - -Project URL: https://github.com/ehelms/Emma/ -License: Common Public License - v 1.0 - --------------------------------------------------------------------------------- - -This product bundles Google j2objc. - -Project URL: https://github.com/google/j2objc/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 diff --git a/spark/v3.5/spark-runtime/NOTICE b/spark/v3.5/spark-runtime/NOTICE index 68abd73906b1..c038e853af77 100644 --- a/spark/v3.5/spark-runtime/NOTICE +++ b/spark/v3.5/spark-runtime/NOTICE @@ -66,42 +66,6 @@ This product bundles Airlift Aircompressor with the following in its NOTICE file -------------------------------------------------------------------------------- -This product bundles Google Protobuf with the following in its NOTICE file: -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. 
-| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. - --------------------------------------------------------------------------------- - This product bundles Netty with the following in its NOTICE file: | The Netty Project | ================= @@ -371,6 +335,42 @@ This product bundles Project Nessie with the following in its NOTICE file: -------------------------------------------------------------------------------- +This product bundles Jackson JSON Processor with the following in its NOTICE file: +| # Jackson JSON processor +| +| Jackson is a high-performance, Free/Open Source JSON processing library. +| It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has +| been in development since 2007. +| It is currently developed by a community of developers. 
+| +| ## Copyright +| +| Copyright 2007-, Tatu Saloranta (tatu.saloranta@iki.fi) +| +| ## Licensing +| +| Jackson 2.x core and extension components are licensed under Apache License 2.0 +| To find the details that apply to this artifact see the accompanying LICENSE file. +| +| ## Credits +| +| A list of contributors may be found from CREDITS(-2.x) file, which is included +| in some artifacts (usually source distributions); but is always available +| from the source code management (SCM) system project uses. +| +| ## FastDoubleParser +| +| jackson-core bundles a shaded copy of FastDoubleParser . +| That code is available under an MIT license +| under the following copyright. +| +| Copyright © 2023 Werner Randelshofer, Switzerland. MIT License. +| +| See FastDoubleParser-NOTICE for details of other source code included in FastDoubleParser +| and the licenses and copyrights that apply to that code. + +-------------------------------------------------------------------------------- + This product bundles Eclipse MicroProfile OpenAPI with the following in its NOTICE file: | ========================================================================= | == NOTICE file corresponding to section 4(d) of the Apache License, == @@ -391,69 +391,3 @@ This product bundles Eclipse MicroProfile OpenAPI with the following in its NOTI | PackageCopyrightText: | Arthur De Magalhaes arthurdm@ca.ibm.com | - --------------------------------------------------------------------------------- - -This product bundles gRPC with the following in its NOTICE file: -| Copyright 2014 The gRPC Authors -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. 
-| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. -| -| ----------------------------------------------------------------------- -| -| This product contains a modified portion of 'OkHttp', an open source -| HTTP & SPDY client for Android and Java applications, which can be obtained -| at: -| -| * LICENSE: -| * okhttp/third_party/okhttp/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/square/okhttp -| * LOCATION_IN_GRPC: -| * okhttp/third_party/okhttp -| -| This product contains a modified portion of 'Envoy', an open source -| cloud-native high-performance edge/middle/service proxy, which can be -| obtained at: -| -| * LICENSE: -| * xds/third_party/envoy/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/envoy/NOTICE -| * HOMEPAGE: -| * https://www.envoyproxy.io -| * LOCATION_IN_GRPC: -| * xds/third_party/envoy -| -| This product contains a modified portion of 'protoc-gen-validate (PGV)', -| an open source protoc plugin to generate polyglot message validators, -| which can be obtained at: -| -| * LICENSE: -| * xds/third_party/protoc-gen-validate/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/protoc-gen-validate/NOTICE -| * HOMEPAGE: -| * https://github.com/envoyproxy/protoc-gen-validate -| * LOCATION_IN_GRPC: -| * xds/third_party/protoc-gen-validate -| -| This product contains a modified portion of 'udpa', -| an open source universal data plane API, which can be obtained at: -| -| * LICENSE: -| * xds/third_party/udpa/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/cncf/udpa -| * LOCATION_IN_GRPC: -| * xds/third_party/udpa diff --git 
a/spark/v3.5/spark-runtime/baseline-class-uniqueness.lock b/spark/v3.5/spark-runtime/baseline-class-uniqueness.lock index 35cad90d888f..6197975f3900 100644 --- a/spark/v3.5/spark-runtime/baseline-class-uniqueness.lock +++ b/spark/v3.5/spark-runtime/baseline-class-uniqueness.lock @@ -77,21 +77,21 @@ - io.netty.buffer.AdaptivePoolingAllocator$SizeClassChunkController - io.netty.buffer.AdaptivePoolingAllocator$SizeClassedChunk - io.netty.buffer.ByteBufUtil - - io.netty.buffer.ByteBufUtil$2 - - io.netty.buffer.ByteBufUtil$HexUtil - - io.netty.buffer.ByteBufUtil$ThreadLocalDirectByteBuf - - io.netty.buffer.ByteBufUtil$ThreadLocalDirectByteBuf$1 - - io.netty.buffer.ByteBufUtil$ThreadLocalUnsafeDirectByteBuf - - io.netty.buffer.ByteBufUtil$ThreadLocalUnsafeDirectByteBuf$1 - io.netty.buffer.CompositeByteBuf - io.netty.buffer.EmptyByteBuf - io.netty.buffer.PoolArena + - io.netty.buffer.PoolArena$DirectArena - io.netty.buffer.PoolThreadCache$FreeOnFinalize - io.netty.buffer.PooledByteBufAllocator + - io.netty.buffer.PooledByteBufAllocator$PoolThreadLocalCache - io.netty.buffer.ReadOnlyAbstractByteBuf - io.netty.buffer.SimpleLeakAwareByteBuf - io.netty.buffer.Unpooled + - io.netty.buffer.UnpooledByteBufAllocator$DecrementingCleanableDirectBuffer + - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledDirectByteBuf + - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeDirectByteBuf - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeNoCleanerDirectByteBuf + - io.netty.buffer.UnpooledByteBufAllocator$UnpooledByteBufAllocatorMetric - io.netty.buffer.UnpooledDirectByteBuf - io.netty.buffer.UnpooledHeapByteBuf - io.netty.buffer.UnpooledUnsafeDirectByteBuf @@ -99,11 +99,19 @@ - io.netty.buffer.UnsafeByteBufUtil [dev.vortex:vortex-jni (classifier=all), io.netty:netty-common] - io.netty.util.AbstractReferenceCounted + - io.netty.util.DefaultAttributeMap - io.netty.util.HashedWheelTimer - io.netty.util.HashedWheelTimer$HashedWheelBucket - 
io.netty.util.LeakPresenceDetector - io.netty.util.LeakPresenceDetector$LeakCreation - io.netty.util.LeakPresenceDetector$ResourceScope + - io.netty.util.Recycler + - io.netty.util.Recycler$BlockingMessageQueue + - io.netty.util.Recycler$DefaultHandle + - io.netty.util.Recycler$EnhancedHandle + - io.netty.util.Recycler$GuardedLocalPool + - io.netty.util.Recycler$LocalPool + - io.netty.util.Recycler$UnguardedLocalPool - io.netty.util.concurrent.AbstractScheduledEventExecutor - io.netty.util.concurrent.GlobalEventExecutor - io.netty.util.concurrent.GlobalEventExecutor$2 @@ -118,11 +126,23 @@ - io.netty.util.concurrent.SingleThreadEventExecutor$4 - io.netty.util.concurrent.SingleThreadEventExecutor$5 - io.netty.util.concurrent.SingleThreadEventExecutor$DefaultThreadProperties + - io.netty.util.internal.Cleaner - io.netty.util.internal.CleanerJava24Linker - io.netty.util.internal.CleanerJava24Linker$CleanableDirectBufferImpl - io.netty.util.internal.CleanerJava25 - io.netty.util.internal.CleanerJava25$CleanableDirectBufferImpl + - io.netty.util.internal.CleanerJava6 + - io.netty.util.internal.CleanerJava6$2 + - io.netty.util.internal.CleanerJava6$CleanableDirectBufferImpl + - io.netty.util.internal.CleanerJava9 + - io.netty.util.internal.CleanerJava9$2 + - io.netty.util.internal.CleanerJava9$CleanableDirectBufferImpl + - io.netty.util.internal.DirectCleaner + - io.netty.util.internal.DirectCleaner$CleanableDirectBufferImpl + - io.netty.util.internal.EmptyArrays - io.netty.util.internal.PlatformDependent + - io.netty.util.internal.PlatformDependent$1 + - io.netty.util.internal.PlatformDependent$1$1 - io.netty.util.internal.PlatformDependent$Mpsc - io.netty.util.internal.PlatformDependent$Mpsc$1 - io.netty.util.internal.PlatformDependent0 diff --git a/spark/v3.5/spark-runtime/runtime-deps.txt b/spark/v3.5/spark-runtime/runtime-deps.txt new file mode 100644 index 000000000000..8c7143234ceb --- /dev/null +++ b/spark/v3.5/spark-runtime/runtime-deps.txt @@ -0,0 +1,48 @@ 
+com.fasterxml.jackson.core:jackson-annotations:2.21 +com.fasterxml.jackson.core:jackson-core:2.15.2 +com.fasterxml.jackson.core:jackson-databind:2.15.2 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.21.3 +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.google.errorprone:error_prone_annotations:2.41.0 +com.google.flatbuffers:flatbuffers-java:25.2.10 +com.google.guava:failureaccess:1.0.3 +com.google.guava:guava:33.5.0-jre +com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava +com.google.j2objc:j2objc-annotations:3.1 +com.google.protobuf:protobuf-java:4.33.5 +dev.failsafe:failsafe:3.3.2 +dev.vortex:vortex-jni:0.67.0 +dev.vortex:vortex-spark_2.12:0.67.0 +io.airlift:aircompressor:2.0.3 +io.netty:netty-buffer:4.2.12.Final +io.netty:netty-common:4.2.12.Final +org.apache.arrow:arrow-c-data:18.3.0 +org.apache.arrow:arrow-format:18.3.0 +org.apache.arrow:arrow-memory-core:18.3.0 +org.apache.arrow:arrow-memory-netty-buffer-patch:18.3.0 +org.apache.arrow:arrow-memory-netty:18.3.0 +org.apache.arrow:arrow-vector:18.3.0 +org.apache.avro:avro:1.12.1 +org.apache.datasketches:datasketches-java:6.2.0 +org.apache.datasketches:datasketches-memory:3.0.2 +org.apache.httpcomponents.client5:httpclient5:5.6.1 +org.apache.httpcomponents.core5:httpcore5-h2:5.4 +org.apache.httpcomponents.core5:httpcore5:5.4 +org.apache.orc:orc-core:1.9.8 +org.apache.orc:orc-shims:1.9.8 +org.apache.parquet:parquet-avro:1.17.0 +org.apache.parquet:parquet-column:1.17.0 +org.apache.parquet:parquet-common:1.17.0 +org.apache.parquet:parquet-encoding:1.17.0 +org.apache.parquet:parquet-format-structures:1.17.0 +org.apache.parquet:parquet-hadoop:1.17.0 +org.apache.parquet:parquet-jackson:1.17.0 +org.apache.parquet:parquet-variant:1.17.0 +org.checkerframework:checker-qual:3.19.0 +org.eclipse.microprofile.openapi:microprofile-openapi-api:4.1.1 +org.jspecify:jspecify:1.0.0 +org.locationtech.jts:jts-core:1.20.0 +org.projectnessie.nessie:nessie-client:0.107.5 
+org.projectnessie.nessie:nessie-model:0.107.5 +org.roaringbitmap:RoaringBitmap:1.6.14 +org.threeten:threeten-extra:1.7.1 diff --git a/spark/v3.5/spark/baseline-class-uniqueness.lock b/spark/v3.5/spark/baseline-class-uniqueness.lock index 4a6e30c63973..72c0c24fb849 100644 --- a/spark/v3.5/spark/baseline-class-uniqueness.lock +++ b/spark/v3.5/spark/baseline-class-uniqueness.lock @@ -125,21 +125,21 @@ - io.netty.buffer.AdaptivePoolingAllocator$SizeClassChunkController - io.netty.buffer.AdaptivePoolingAllocator$SizeClassedChunk - io.netty.buffer.ByteBufUtil - - io.netty.buffer.ByteBufUtil$2 - - io.netty.buffer.ByteBufUtil$HexUtil - - io.netty.buffer.ByteBufUtil$ThreadLocalDirectByteBuf - - io.netty.buffer.ByteBufUtil$ThreadLocalDirectByteBuf$1 - - io.netty.buffer.ByteBufUtil$ThreadLocalUnsafeDirectByteBuf - - io.netty.buffer.ByteBufUtil$ThreadLocalUnsafeDirectByteBuf$1 - io.netty.buffer.CompositeByteBuf - io.netty.buffer.EmptyByteBuf - io.netty.buffer.PoolArena + - io.netty.buffer.PoolArena$DirectArena - io.netty.buffer.PoolThreadCache$FreeOnFinalize - io.netty.buffer.PooledByteBufAllocator + - io.netty.buffer.PooledByteBufAllocator$PoolThreadLocalCache - io.netty.buffer.ReadOnlyAbstractByteBuf - io.netty.buffer.SimpleLeakAwareByteBuf - io.netty.buffer.Unpooled + - io.netty.buffer.UnpooledByteBufAllocator$DecrementingCleanableDirectBuffer + - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledDirectByteBuf + - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeDirectByteBuf - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeNoCleanerDirectByteBuf + - io.netty.buffer.UnpooledByteBufAllocator$UnpooledByteBufAllocatorMetric - io.netty.buffer.UnpooledDirectByteBuf - io.netty.buffer.UnpooledHeapByteBuf - io.netty.buffer.UnpooledUnsafeDirectByteBuf @@ -147,11 +147,19 @@ - io.netty.buffer.UnsafeByteBufUtil [dev.vortex:vortex-jni (classifier=all), io.netty:netty-common] - io.netty.util.AbstractReferenceCounted + - 
io.netty.util.DefaultAttributeMap - io.netty.util.HashedWheelTimer - io.netty.util.HashedWheelTimer$HashedWheelBucket - io.netty.util.LeakPresenceDetector - io.netty.util.LeakPresenceDetector$LeakCreation - io.netty.util.LeakPresenceDetector$ResourceScope + - io.netty.util.Recycler + - io.netty.util.Recycler$BlockingMessageQueue + - io.netty.util.Recycler$DefaultHandle + - io.netty.util.Recycler$EnhancedHandle + - io.netty.util.Recycler$GuardedLocalPool + - io.netty.util.Recycler$LocalPool + - io.netty.util.Recycler$UnguardedLocalPool - io.netty.util.concurrent.AbstractScheduledEventExecutor - io.netty.util.concurrent.GlobalEventExecutor - io.netty.util.concurrent.GlobalEventExecutor$2 @@ -166,11 +174,23 @@ - io.netty.util.concurrent.SingleThreadEventExecutor$4 - io.netty.util.concurrent.SingleThreadEventExecutor$5 - io.netty.util.concurrent.SingleThreadEventExecutor$DefaultThreadProperties + - io.netty.util.internal.Cleaner - io.netty.util.internal.CleanerJava24Linker - io.netty.util.internal.CleanerJava24Linker$CleanableDirectBufferImpl - io.netty.util.internal.CleanerJava25 - io.netty.util.internal.CleanerJava25$CleanableDirectBufferImpl + - io.netty.util.internal.CleanerJava6 + - io.netty.util.internal.CleanerJava6$2 + - io.netty.util.internal.CleanerJava6$CleanableDirectBufferImpl + - io.netty.util.internal.CleanerJava9 + - io.netty.util.internal.CleanerJava9$2 + - io.netty.util.internal.CleanerJava9$CleanableDirectBufferImpl + - io.netty.util.internal.DirectCleaner + - io.netty.util.internal.DirectCleaner$CleanableDirectBufferImpl + - io.netty.util.internal.EmptyArrays - io.netty.util.internal.PlatformDependent + - io.netty.util.internal.PlatformDependent$1 + - io.netty.util.internal.PlatformDependent$1$1 - io.netty.util.internal.PlatformDependent$Mpsc - io.netty.util.internal.PlatformDependent$Mpsc$1 - io.netty.util.internal.PlatformDependent0 diff --git a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java 
b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java index 64edb1002e99..47fe46558d7e 100644 --- a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java +++ b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java @@ -37,6 +37,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.actions.SparkActions; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -179,6 +180,7 @@ private void setupSpark() { .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", catalogWarehouse()) + .config(TestBase.DISABLE_UI) .master("local"); spark = builder.getOrCreate(); } diff --git a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java index 4978961be641..683f6bb46d05 100644 --- a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java +++ b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java @@ -41,6 +41,7 @@ import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.actions.SparkActions; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; @@ -394,6 +395,7 @@ protected void setupSpark() { "spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") 
.config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", getCatalogWarehouse()) + .config(TestBase.DISABLE_UI) .master("local[*]"); spark = builder.getOrCreate(); Configuration sparkHadoopConf = spark.sessionState().newHadoopConf(); diff --git a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java index c6794e43c636..3f242ce228ca 100644 --- a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java +++ b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java @@ -49,6 +49,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.util.ContentFileUtil; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.InternalRow; @@ -234,7 +235,7 @@ private String generateDataFilePath() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java index ac74fb5a109c..db5789724056 100644 --- a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java +++ b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java @@ -43,6 +43,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import 
org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; @@ -218,7 +219,7 @@ private String generateDataFilePath() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java index 68c537e34a4a..debe37866ff7 100644 --- a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java +++ b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java @@ -30,6 +30,7 @@ import org.apache.iceberg.UpdateProperties; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; @@ -94,7 +95,7 @@ protected void cleanupFiles() throws IOException { } protected void setupSpark(boolean enableDictionaryEncoding) { - SparkSession.Builder builder = SparkSession.builder().config("spark.ui.enabled", false); + SparkSession.Builder builder = SparkSession.builder().config(TestBase.DISABLE_UI); if (!enableDictionaryEncoding) { builder .config("parquet.dictionary.page.size", "1") diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java 
b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java index ea400a779235..cb9da3edc678 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java @@ -56,7 +56,7 @@ public static R withCommitProperties( ExceptionUtil.castAndThrow(e, exClass); return null; } finally { - COMMIT_PROPERTIES.set(ImmutableMap.of()); + COMMIT_PROPERTIES.remove(); } } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java index fbd21f737450..fec413ca079a 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java @@ -41,6 +41,7 @@ import org.apache.spark.sql.types.IntegerType$; import org.apache.spark.sql.types.LongType$; import org.apache.spark.sql.types.MapType; +import org.apache.spark.sql.types.NullType$; import org.apache.spark.sql.types.StringType$; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; @@ -238,5 +239,6 @@ public Type primitive(Type.PrimitiveType primitive) { .put(TypeID.STRING, ImmutableSet.of(StringType$.class)) .put(TypeID.FIXED, ImmutableSet.of(BinaryType$.class)) .put(TypeID.BINARY, ImmutableSet.of(BinaryType$.class)) + .put(TypeID.UNKNOWN, ImmutableSet.of(NullType$.class)) .buildOrThrow(); } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java index bc8a966488ee..f1709277525a 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java @@ -24,17 +24,17 @@ import 
java.util.function.Function; import org.apache.spark.sql.connector.catalog.Identifier; import org.apache.spark.sql.connector.catalog.StagedTable; -import org.apache.spark.sql.connector.catalog.SupportsDelete; +import org.apache.spark.sql.connector.catalog.SupportsDeleteV2; import org.apache.spark.sql.connector.catalog.SupportsRead; import org.apache.spark.sql.connector.catalog.SupportsWrite; import org.apache.spark.sql.connector.catalog.Table; import org.apache.spark.sql.connector.catalog.TableCapability; import org.apache.spark.sql.connector.catalog.TableCatalog; import org.apache.spark.sql.connector.expressions.Transform; +import org.apache.spark.sql.connector.expressions.filter.Predicate; import org.apache.spark.sql.connector.read.ScanBuilder; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.WriteBuilder; -import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.util.CaseInsensitiveStringMap; @@ -58,7 +58,7 @@ * #capabilities()}. 
*/ public class RollbackStagedTable - implements StagedTable, SupportsRead, SupportsWrite, SupportsDelete { + implements StagedTable, SupportsRead, SupportsWrite, SupportsDeleteV2 { private final TableCatalog catalog; private final Identifier ident; private final Table table; @@ -106,8 +106,8 @@ public Set capabilities() { } @Override - public void deleteWhere(Filter[] filters) { - call(SupportsDelete.class, t -> t.deleteWhere(filters)); + public void deleteWhere(Predicate[] predicates) { + call(SupportsDeleteV2.class, t -> t.deleteWhere(predicates)); } @Override diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java index bad31d8d85f4..78d69eeaaf61 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java @@ -35,8 +35,6 @@ public abstract class SparkContentFile implements ContentFile { - private static final FileContent[] FILE_CONTENT_VALUES = FileContent.values(); - private final int fileContentPosition; private final int filePathPosition; private final int fileFormatPosition; @@ -139,7 +137,7 @@ public FileContent content() { if (wrapped.isNullAt(fileContentPosition)) { return null; } - return FILE_CONTENT_VALUES[wrapped.getInt(fileContentPosition)]; + return FileContent.fromId(wrapped.getInt(fileContentPosition)); } @Override diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java index b38c041507bb..61b1db160457 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java @@ -261,6 +261,39 @@ public int maxRecordsPerMicroBatch() { .parse(); } + public boolean asyncMicroBatchPlanningEnabled() { + return confParser + 
.booleanConf() + .option(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED) + .sessionConf(SparkSQLProperties.ASYNC_MICRO_BATCH_PLANNING_ENABLED) + .defaultValue(SparkSQLProperties.ASYNC_MICRO_BATCH_PLANNING_ENABLED_DEFAULT) + .parse(); + } + + public long streamingSnapshotPollingIntervalMs() { + return confParser + .longConf() + .option(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS) + .defaultValue(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS_DEFAULT) + .parse(); + } + + public long asyncQueuePreloadFileLimit() { + return confParser + .longConf() + .option(SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT) + .defaultValue(SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT_DEFAULT) + .parse(); + } + + public long asyncQueuePreloadRowLimit() { + return confParser + .longConf() + .option(SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT) + .defaultValue(SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT_DEFAULT) + .parse(); + } + public boolean preserveDataGrouping() { return confParser .booleanConf() diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java index 17f2bfee69b8..5262310e2c5e 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java @@ -87,6 +87,21 @@ private SparkReadOptions() {} public static final String STREAMING_MAX_ROWS_PER_MICRO_BATCH = "streaming-max-rows-per-micro-batch"; + // Enable async micro batch planning + public static final String ASYNC_MICRO_BATCH_PLANNING_ENABLED = + "async-micro-batch-planning-enabled"; + + // Polling interval for async planner to refresh table metadata (ms) + public static final String STREAMING_SNAPSHOT_POLLING_INTERVAL_MS = + "streaming-snapshot-polling-interval-ms"; + public static final long STREAMING_SNAPSHOT_POLLING_INTERVAL_MS_DEFAULT = 30000L; + + // Initial queue 
preload limits for async micro batch planner + public static final String ASYNC_QUEUE_PRELOAD_FILE_LIMIT = "async-queue-preload-file-limit"; + public static final long ASYNC_QUEUE_PRELOAD_FILE_LIMIT_DEFAULT = 100L; + public static final String ASYNC_QUEUE_PRELOAD_ROW_LIMIT = "async-queue-preload-row-limit"; + public static final long ASYNC_QUEUE_PRELOAD_ROW_LIMIT_DEFAULT = 100000L; + // Table path public static final String PATH = "path"; diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java index e3ee288affbe..74adb0bc95da 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java @@ -103,4 +103,9 @@ private SparkSQLProperties() {} // Controls whether to report available column statistics to Spark for query optimization. public static final String REPORT_COLUMN_STATS = "spark.sql.iceberg.report-column-stats"; public static final boolean REPORT_COLUMN_STATS_DEFAULT = true; + + // Controls whether to enable async micro batch planning for session + public static final String ASYNC_MICRO_BATCH_PLANNING_ENABLED = + "spark.sql.iceberg.async-micro-batch-planning-enabled"; + public static final boolean ASYNC_MICRO_BATCH_PLANNING_ENABLED_DEFAULT = false; } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java index 8beaefc5cc8f..b7ed31c274d7 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java @@ -35,6 +35,7 @@ import org.apache.spark.sql.types.IntegerType; import org.apache.spark.sql.types.LongType; import org.apache.spark.sql.types.MapType; +import org.apache.spark.sql.types.NullType; import 
org.apache.spark.sql.types.ShortType; import org.apache.spark.sql.types.StringType; import org.apache.spark.sql.types.StructField; @@ -155,6 +156,8 @@ public Type atomic(DataType atomic) { ((DecimalType) atomic).precision(), ((DecimalType) atomic).scale()); } else if (atomic instanceof BinaryType) { return Types.BinaryType.get(); + } else if (atomic instanceof NullType) { + return Types.UnknownType.get(); } throw new UnsupportedOperationException("Not a supported type: " + atomic.catalogString()); diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java index b3e8af5fe056..9da48ae51e5c 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java @@ -42,6 +42,7 @@ import org.apache.iceberg.FileFormat; import org.apache.iceberg.IsolationLevel; import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; import org.apache.iceberg.TableUtil; @@ -162,6 +163,25 @@ public int outputSpecId() { return outputSpecId; } + public int outputSortOrderId(SparkWriteRequirements writeRequirements) { + Integer explicitId = + confParser.intConf().option(SparkWriteOptions.OUTPUT_SORT_ORDER_ID).parseOptional(); + + if (explicitId != null) { + Preconditions.checkArgument( + table.sortOrders().containsKey(explicitId), + "Cannot use output sort order id %s because the table does not contain a sort order with that id", + explicitId); + return explicitId; + } + + if (writeRequirements.hasOrdering()) { + return table.sortOrder().orderId(); + } + + return SortOrder.unsorted().orderId(); + } + public FileFormat dataFileFormat() { String valueAsString = confParser diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java 
b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java index 33db70bae587..1be02feaf0c0 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java @@ -54,6 +54,7 @@ private SparkWriteOptions() {} public static final String REWRITTEN_FILE_SCAN_TASK_SET_ID = "rewritten-file-scan-task-set-id"; public static final String OUTPUT_SPEC_ID = "output-spec-id"; + public static final String OUTPUT_SORT_ORDER_ID = "output-sort-order-id"; public static final String OVERWRITE_MODE = "overwrite-mode"; diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java index dfb9b30be603..d33632bbbd54 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java @@ -38,6 +38,7 @@ import org.apache.spark.sql.types.MapType$; import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.MetadataBuilder; +import org.apache.spark.sql.types.NullType$; import org.apache.spark.sql.types.StringType$; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType$; @@ -124,9 +125,11 @@ public DataType primitive(Type.PrimitiveType primitive) { case DECIMAL: Types.DecimalType decimal = (Types.DecimalType) primitive; return DecimalType$.MODULE$.apply(decimal.precision(), decimal.scale()); + case UNKNOWN: + return NullType$.MODULE$; default: throw new UnsupportedOperationException( - "Cannot convert unknown type to Spark: " + primitive); + "Cannot convert unsupported type to Spark: " + primitive); } } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java 
b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java index b1c5a5c0901a..346abaee5e63 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java @@ -47,10 +47,14 @@ import org.apache.spark.sql.connector.expressions.SortOrder; import org.apache.spark.sql.connector.write.RequiresDistributionAndOrdering; import org.apache.spark.sql.execution.datasources.v2.DistributionAndOrderingUtils$; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import scala.Option; abstract class SparkShufflingFileRewriteRunner extends SparkDataFileRewriteRunner { + private static final Logger LOG = LoggerFactory.getLogger(SparkShufflingFileRewriteRunner.class); + /** * The number of shuffle partitions to use for each output file. By default, this file rewriter * assumes each shuffle partition would become a separate output file. 
Attempting to generate @@ -119,6 +123,17 @@ public void doRewrite(String groupId, RewriteFileGroup fileGroup) { spec(fileGroup.outputSpecId()), fileGroup.expectedOutputFiles())); + org.apache.iceberg.SortOrder sortOrderInJobSpec = sortOrder(); + + org.apache.iceberg.SortOrder maybeMatchingTableSortOrder = + SortOrderUtil.findTableSortOrder(table(), sortOrder()); + + if (sortOrderInJobSpec.isSorted() && maybeMatchingTableSortOrder.isUnsorted()) { + LOG.warn( + "Sort order specified for job {} doesn't match any table sort orders, rewritten files will not be marked as sorted in the manifest files", + Spark3Util.describe(sortOrderInJobSpec)); + } + sortedDF .write() .format("iceberg") @@ -126,6 +141,7 @@ public void doRewrite(String groupId, RewriteFileGroup fileGroup) { .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, fileGroup.maxOutputFileSize()) .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false") .option(SparkWriteOptions.OUTPUT_SPEC_ID, fileGroup.outputSpecId()) + .option(SparkWriteOptions.OUTPUT_SORT_ORDER_ID, maybeMatchingTableSortOrder.orderId()) .mode("append") .save(groupId); } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java index db359fdd62fc..bf80dcb10b30 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java @@ -24,6 +24,8 @@ import java.nio.ByteBuffer; import java.nio.charset.CharsetEncoder; import java.nio.charset.StandardCharsets; +import java.time.LocalDateTime; +import org.apache.iceberg.util.DateTimeUtil; import org.apache.iceberg.util.ZOrderByteUtils; import org.apache.spark.sql.Column; import org.apache.spark.sql.expressions.UserDefinedFunction; @@ -40,6 +42,7 @@ import org.apache.spark.sql.types.LongType; import org.apache.spark.sql.types.ShortType; import 
org.apache.spark.sql.types.StringType; +import org.apache.spark.sql.types.TimestampNTZType; import org.apache.spark.sql.types.TimestampType; import scala.collection.JavaConverters; import scala.collection.Seq; @@ -180,6 +183,29 @@ value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) return udf; } + private UserDefinedFunction timestampNtzToOrderedBytesUDF() { + int position = inputCol; + UserDefinedFunction udf = + functions + .udf( + (LocalDateTime value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + long micros = DateTimeUtil.microsFromTimestamp(value); + return ZOrderByteUtils.longToOrderedBytes( + micros, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("TIMESTAMP_NTZ_ORDERED_BYTES"); + + this.inputCol++; + increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); + + return udf; + } + private UserDefinedFunction floatToOrderedBytesUDF() { int position = inputCol; UserDefinedFunction udf = @@ -309,6 +335,8 @@ Column sortedLexicographically(Column column, DataType type) { return booleanToOrderedBytesUDF().apply(column); } else if (type instanceof TimestampType) { return longToOrderedBytesUDF().apply(column.cast(DataTypes.LongType)); + } else if (type instanceof TimestampNTZType) { + return timestampNtzToOrderedBytesUDF().apply(column); } else if (type instanceof DateType) { return longToOrderedBytesUDF().apply(column.cast(DataTypes.LongType)); } else { diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java index 9480385d5452..e11a85d538a6 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java @@ -31,6 +31,7 @@ import org.apache.parquet.schema.Type.Repetition; 
import org.apache.spark.sql.types.ArrayType; import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.MapType; import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.StructField; @@ -173,21 +174,27 @@ private static T visitField( private static List visitFields( StructType struct, GroupType group, ParquetWithSparkSchemaVisitor visitor) { - StructField[] sFields = struct.fields(); - Preconditions.checkArgument( - sFields.length == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); - for (int i = 0; i < sFields.length; i += 1) { - Type field = group.getFields().get(i); - StructField sField = sFields[i]; - Preconditions.checkArgument( - field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), - "Structs do not match: field %s != %s", - field.getName(), - sField.name()); - results.add(visitField(sField, field, visitor)); + + int fieldIndex = 0; + for (StructField sField : struct.fields()) { + if (sField.dataType() != DataTypes.NullType) { + Type field = group.getFields().get(fieldIndex); + Preconditions.checkArgument( + field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), + "Structs do not match: field %s != %s", + field.getName(), + sField.name()); + results.add(visitField(sField, field, visitor)); + + fieldIndex += 1; + } } + // All the group fields should have been visited + Preconditions.checkArgument( + fieldIndex == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); + return results; } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java index 6b799e677bf4..6fc8849c82b2 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java +++ 
b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java @@ -20,6 +20,8 @@ import java.io.Serializable; import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; import java.util.stream.Stream; import javax.annotation.Nullable; import org.apache.iceberg.FieldMetrics; @@ -77,7 +79,7 @@ public OrcValueWriter record( TypeDescription record, List names, List> fields) { - return new InternalRowWriter(fields, record.getChildren()); + return new InternalRowWriter(fields, iStruct, record.getChildren()); } @Override @@ -133,12 +135,16 @@ public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, TypeDescriptio private static class InternalRowWriter extends GenericOrcWriters.StructWriter { private final List> fieldGetters; - InternalRowWriter(List> writers, List orcTypes) { - super(writers); + InternalRowWriter( + List> writers, Types.StructType iStruct, List orcTypes) { + super(iStruct, writers); this.fieldGetters = Lists.newArrayListWithExpectedSize(orcTypes.size()); - for (TypeDescription orcType : orcTypes) { - fieldGetters.add(createFieldGetter(orcType)); + Map idToType = + orcTypes.stream().collect(Collectors.toMap(ORCSchemaUtil::fieldId, s -> s)); + + for (Types.NestedField iField : iStruct.fields()) { + fieldGetters.add(createFieldGetter(idToType.get(iField.fieldId()))); } } @@ -149,6 +155,11 @@ protected Object get(InternalRow struct, int index) { } static FieldGetter createFieldGetter(TypeDescription fieldType) { + // In the case of an UnknownType + if (fieldType == null) { + return (row, ordinal) -> null; + } + final FieldGetter fieldGetter; switch (fieldType.getCategory()) { case BOOLEAN: diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java index 58be7f610c81..a1ed6c66f337 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java +++ 
b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java @@ -26,6 +26,7 @@ import java.util.NoSuchElementException; import java.util.Optional; import java.util.UUID; +import java.util.stream.IntStream; import org.apache.iceberg.Schema; import org.apache.iceberg.parquet.ParquetValueReaders.ReusableEntry; import org.apache.iceberg.parquet.ParquetValueWriter; @@ -55,6 +56,7 @@ import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.Decimal; import org.apache.spark.sql.types.MapType; +import org.apache.spark.sql.types.NullType; import org.apache.spark.sql.types.ShortType; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; @@ -94,14 +96,18 @@ public ParquetValueWriter message( public ParquetValueWriter struct( StructType sStruct, GroupType struct, List> fieldWriters) { List fields = struct.getFields(); - StructField[] sparkFields = sStruct.fields(); List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); - List sparkTypes = Lists.newArrayList(); for (int i = 0; i < fields.size(); i += 1) { writers.add(newOption(struct.getType(i), fieldWriters.get(i))); - sparkTypes.add(sparkFields[i].dataType()); } - return new InternalRowWriter(writers, sparkTypes); + + StructField[] sFields = sStruct.fields(); + DataType[] types = new DataType[sFields.length]; + for (int i = 0; i < sFields.length; i += 1) { + types[i] = sFields[i].dataType(); + } + + return new InternalRowWriter(writers, types); } @Override @@ -565,14 +571,33 @@ public Map.Entry next() { private static class InternalRowWriter extends ParquetValueWriters.StructWriter { private final DataType[] types; - private InternalRowWriter(List> writers, List types) { - super(writers); - this.types = types.toArray(new DataType[0]); + private InternalRowWriter(List> writers, DataType[] types) { + super(writerToFieldIndex(types, writers.size()), writers); + this.types = types; } @Override protected Object get(InternalRow 
struct, int index) { return struct.get(index, types[index]); } + + /** Returns a mapping from writer index to field index, skipping Unknown columns. */ + private static int[] writerToFieldIndex(DataType[] types, int numWriters) { + if (null == types) { + return IntStream.rangeClosed(0, numWriters).toArray(); + } + + // value writer index to record field index + int[] indexes = new int[numWriters]; + int writerIndex = 0; + for (int pos = 0; pos < types.length; pos += 1) { + if (!(types[pos] instanceof NullType)) { + indexes[writerIndex] = pos; + writerIndex += 1; + } + } + + return indexes; + } } } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java new file mode 100644 index 000000000000..3e442f9917d4 --- /dev/null +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java @@ -0,0 +1,543 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
package org.apache.iceberg.spark.source;

import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Caffeine;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.MicroBatches;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.spark.SparkReadConf;
import org.apache.iceberg.util.Pair;
import org.apache.spark.sql.connector.read.streaming.ReadAllAvailable;
import org.apache.spark.sql.connector.read.streaming.ReadLimit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Micro-batch planner that pre-fetches {@link FileScanTask}s on a background thread.
 *
 * <p>Tasks are buffered in a deque as (offset, task) pairs. A single-threaded scheduled executor
 * periodically refreshes the table and appends the next valid snapshot to the deque whenever the
 * buffered row and file counts are at or below the configured per-batch maximums. {@link
 * #latestOffset} inspects the deque without consuming it; {@link #planFiles} drains it.
 *
 * <p>The public entry points ({@link #planFiles}, {@link #latestOffset}, {@link #stop}) are
 * {@code synchronized}. Failures raised by the background tasks are stored in volatile fields and
 * rethrown, wrapped in a {@link RuntimeException}, from {@link #planFiles} and {@link
 * #latestOffset}.
 */
class AsyncSparkMicroBatchPlanner extends BaseSparkMicroBatchPlanner implements AutoCloseable {
  private static final Logger LOG = LoggerFactory.getLogger(AsyncSparkMicroBatchPlanner.class);
  private static final int PLAN_FILES_CACHE_MAX_SIZE = 10;
  private static final long QUEUE_POLL_TIMEOUT_MS = 100L; // 100 ms

  // Background filling pauses while either buffered count exceeds these thresholds
  private final long minQueuedFiles;
  private final long minQueuedRows;

  // Cache for planFiles results to handle duplicate calls
  private final Cache<Pair<StreamingOffset, StreamingOffset>, List<FileScanTask>> planFilesCache;

  // Queue to buffer pre-fetched file scan tasks
  private final LinkedBlockingDeque<Pair<StreamingOffset, FileScanTask>> queue;

  // Background executor for async operations
  private final ScheduledExecutorService executor;

  // Error tracking: set by the background tasks, read by planFiles/latestOffset
  private volatile Throwable refreshFailedThrowable;
  private volatile Throwable fillQueueFailedThrowable;

  // Tracking queue state
  private final AtomicLong queuedFileCount = new AtomicLong(0);
  private final AtomicLong queuedRowCount = new AtomicLong(0);
  // Last snapshot whose files were appended to the queue; the background fill resumes after it
  private Snapshot lastQueuedSnapshot;
  private boolean stopped;

  // Cap for Trigger.AvailableNow - don't process beyond this offset
  private final StreamingOffset lastOffsetForTriggerAvailableNow;

  /**
   * This class manages a queue of FileScanTask + StreamingOffset. On creation, it starts up an
   * asynchronous polling process which populates the queue when a new snapshot arrives or the
   * minimum amount of queued data is too low.
   *
   * <p>Note: this will capture the state of the table when snapshots are added to the queue. If a
   * snapshot is expired after being added to the queue, the job will still process it.
   *
   * @param table the table to stream from; it is refreshed once before the initial fill
   * @param readConf read configuration supplying batch limits, preload limits and polling interval
   * @param initialOffset offset from which the queue is initially filled
   * @param maybeEndOffset if non-null, the initial fill loads snapshots up to this offset's
   *     snapshot; if null, the initial fill is bounded by the configured preload limits
   * @param lastOffsetForTriggerAvailableNow cap offset for Trigger.AvailableNow, or null if unset
   */
  AsyncSparkMicroBatchPlanner(
      Table table,
      SparkReadConf readConf,
      StreamingOffset initialOffset,
      StreamingOffset maybeEndOffset,
      StreamingOffset lastOffsetForTriggerAvailableNow) {
    super(table, readConf);
    this.minQueuedFiles = readConf().maxFilesPerMicroBatch();
    this.minQueuedRows = readConf().maxRecordsPerMicroBatch();
    this.lastOffsetForTriggerAvailableNow = lastOffsetForTriggerAvailableNow;
    this.planFilesCache = Caffeine.newBuilder().maximumSize(PLAN_FILES_CACHE_MAX_SIZE).build();
    this.queue = new LinkedBlockingDeque<>();

    table().refresh();

    // Synchronously add data to the queue to meet our initial constraints.
    // For Trigger.AvailableNow, constructor-time preload is normally initialized from
    // latestOffset(...) with no explicit end offset, so bounded preload must stop at
    // Trigger.AvailableNow snapshot.
    fillQueue(initialOffset, maybeEndOffset);

    this.executor =
        Executors.newSingleThreadScheduledExecutor(
            r -> {
              Thread thread = new Thread(r, "iceberg-async-planner-" + table().name());
              thread.setDaemon(true);
              return thread;
            });
    // Schedule table refresh at configured interval
    long pollingIntervalMs = readConf().streamingSnapshotPollingIntervalMs();
    this.executor.scheduleWithFixedDelay(
        this::refreshAndTrapException, pollingIntervalMs, pollingIntervalMs, TimeUnit.MILLISECONDS);
    // Schedule queue fill to run frequently (use polling interval for tests, cap at 100ms for
    // production)
    long queueFillIntervalMs = Math.min(QUEUE_POLL_TIMEOUT_MS, pollingIntervalMs);
    executor.scheduleWithFixedDelay(
        () -> fillQueueAndTrapException(lastQueuedSnapshot),
        0,
        queueFillIntervalMs,
        TimeUnit.MILLISECONDS);

    LOG.info(
        "Started AsyncSparkMicroBatchPlanner for {} from initialOffset: {}",
        table().name(),
        initialOffset);
  }

  /**
   * Shuts down the background executor and waits up to twice the snapshot polling interval for it
   * to terminate.
   *
   * @throws IllegalArgumentException if this planner was already stopped
   */
  @Override
  public synchronized void stop() {
    // Guava Preconditions only substitutes %s; an SLF4J-style {} would be left in the message
    Preconditions.checkArgument(
        !stopped, "AsyncSparkMicroBatchPlanner for %s was already stopped", table().name());
    stopped = true;
    LOG.info("Stopping AsyncSparkMicroBatchPlanner for table: {}", table().name());
    executor.shutdownNow();
    boolean terminated = false;
    try {
      terminated =
          executor.awaitTermination(
              readConf().streamingSnapshotPollingIntervalMs() * 2, TimeUnit.MILLISECONDS);
    } catch (InterruptedException ignored) {
      // Restore interrupt status
      Thread.currentThread().interrupt();
    }
    LOG.info("AsyncSparkMicroBatchPlanner for table: {}, stopped: {}", table().name(), terminated);
  }

  /** Delegates to {@link #stop()}, so it throws if the planner was already stopped. */
  @Override
  public void close() {
    stop();
  }

  /**
   * Spark can call this multiple times; it should produce the same answer every time.
   *
   * <p>Results are cached per (startOffset, endOffset) pair. On a cache miss, queue entries are
   * consumed until the next queued offset equals {@code endOffset}, or the entry just consumed is
   * the one immediately before {@code endOffset} within the same snapshot. While the queue is
   * empty the call keeps polling until data arrives or a background failure is recorded.
   *
   * @param startOffset the starting offset of this microbatch, position is inclusive
   * @param endOffset the end offset of this microbatch, position is exclusive
   * @return the list of files to scan between these offsets
   * @throws RuntimeException if the thread is interrupted while polling, or if a background
   *     refresh or queue fill has failed
   */
  @Override
  public synchronized List<FileScanTask> planFiles(
      StreamingOffset startOffset, StreamingOffset endOffset) {
    return planFilesCache.get(
        Pair.of(startOffset, endOffset),
        key -> {
          LOG.info(
              "running planFiles for {}, startOffset: {}, endOffset: {}",
              table().name(),
              startOffset,
              endOffset);
          List<FileScanTask> result = new LinkedList<>();
          Pair<StreamingOffset, FileScanTask> elem;
          StreamingOffset currentOffset;
          boolean shouldTerminate = false;
          long filesInPlan = 0;
          long rowsInPlan = 0;

          do {
            try {
              elem = queue.pollFirst(QUEUE_POLL_TIMEOUT_MS, TimeUnit.MILLISECONDS);
            } catch (InterruptedException e) {
              Thread.currentThread().interrupt();
              throw new RuntimeException("Interrupted while polling queue", e);
            }

            if (elem != null) {
              currentOffset = elem.first();
              LOG.debug("planFiles consumed: {}", currentOffset);
              FileScanTask currentTask = elem.second();
              filesInPlan += 1;
              long elemRows = currentTask.file().recordCount();
              rowsInPlan += elemRows;
              queuedFileCount.decrementAndGet();
              queuedRowCount.addAndGet(-elemRows);
              result.add(currentTask);

              // try to peek at the next entry of the queue and see if we should stop
              Pair<StreamingOffset, FileScanTask> nextElem = queue.peekFirst();
              boolean endOffsetPeek = false;
              if (nextElem != null) {
                endOffsetPeek = endOffset.equals(nextElem.first());
              }
              // end offset may be synthetic and not exist in the queue
              boolean endOffsetSynthetic =
                  currentOffset.snapshotId() == endOffset.snapshotId()
                      && (currentOffset.position() + 1) == endOffset.position();
              shouldTerminate = endOffsetPeek || endOffsetSynthetic;
            } else {
              LOG.trace("planFiles hasn't reached {}, waiting", endOffset);
            }
          } while (!shouldTerminate
              && refreshFailedThrowable == null
              && fillQueueFailedThrowable == null);

          if (refreshFailedThrowable != null) {
            throw new RuntimeException("Table refresh failed", refreshFailedThrowable);
          }

          if (fillQueueFailedThrowable != null) {
            throw new RuntimeException("Queue filling failed", fillQueueFailedThrowable);
          }

          LOG.info(
              "completed planFiles for {}, startOffset: {}, endOffset: {}, files: {}, rows: {}",
              table().name(),
              startOffset,
              endOffset,
              filesInPlan,
              rowsInPlan);
          return result;
        });
  }

  /**
   * This needs to be non destructive on the queue as spark could call this multiple times. Each
   * time, depending on the table state it could return something different
   *
   * @param startOffset the starting offset of the next microbatch
   * @param limit a limit for how many files/bytes/rows the next microbatch should include
   * @return The end offset to use for the next microbatch, null signals that no data is available
   * @throws RuntimeException if a background refresh or queue fill has failed
   */
  @Override
  public synchronized StreamingOffset latestOffset(StreamingOffset startOffset, ReadLimit limit) {
    LOG.info(
        "running latestOffset for {}, startOffset: {}, limit: {}",
        table().name(),
        startOffset,
        limit);

    if (table().currentSnapshot() == null) {
      LOG.info("latestOffset returning START_OFFSET, currentSnapshot() is null");
      return StreamingOffset.START_OFFSET;
    }

    if (table().currentSnapshot().timestampMillis() < readConf().streamFromTimestamp()) {
      LOG.info("latestOffset returning START_OFFSET, currentSnapshot() < fromTimestamp");
      return StreamingOffset.START_OFFSET;
    }

    // if any exceptions were encountered in the background process, raise them here
    if (refreshFailedThrowable != null) {
      throw new RuntimeException(refreshFailedThrowable);
    }
    if (fillQueueFailedThrowable != null) {
      throw new RuntimeException(fillQueueFailedThrowable);
    }

    // if we want to read all available we don't need to scan files, just snapshots
    if (limit instanceof ReadAllAvailable) {
      // If Trigger.AvailableNow cap is set, return it directly
      if (this.lastOffsetForTriggerAvailableNow != null) {
        return this.lastOffsetForTriggerAvailableNow;
      }
      Snapshot lastValidSnapshot = table().snapshot(startOffset.snapshotId());
      Snapshot nextValidSnapshot;
      do {
        nextValidSnapshot = nextValidSnapshot(lastValidSnapshot);
        if (nextValidSnapshot != null) {
          lastValidSnapshot = nextValidSnapshot;
        }
      } while (nextValidSnapshot != null);
      // NOTE(review): lastValidSnapshot looks like it can still be null here if the start snapshot
      // lookup returned null and nextValidSnapshot(null) found nothing - confirm whether a guard
      // is needed before dereferencing.
      return new StreamingOffset(
          lastValidSnapshot.snapshotId(),
          MicroBatchUtils.addedFilesCount(table(), lastValidSnapshot),
          false);
    }

    return computeLimitedOffset(limit);
  }

  /**
   * Walks a copy of the queue and picks the end offset that honors the given limit.
   *
   * <p>The file limit is hard: the walk stops before the file that would exceed it. The row limit
   * is soft: the file that reaches or crosses it is included, so a single oversized file is still
   * processed.
   *
   * @param limit the read limit to unpack into max rows and max files
   * @return the exclusive end offset, or null if the queue is empty or the file limit admits no
   *     file at all
   */
  private StreamingOffset computeLimitedOffset(ReadLimit limit) {
    UnpackedLimits unpackedLimits = new UnpackedLimits(limit);
    long rowsSeen = 0;
    long filesSeen = 0;
    LOG.debug(
        "latestOffset queue status, queuedFiles: {}, queuedRows: {}",
        queuedFileCount.get(),
        queuedRowCount.get());

    // Work on a copy so the background thread can keep appending while we scan
    List<Pair<StreamingOffset, FileScanTask>> queueSnapshot = Lists.newArrayList(queue);
    Pair<StreamingOffset, FileScanTask> queueTail =
        queueSnapshot.isEmpty() ? null : queueSnapshot.get(queueSnapshot.size() - 1);

    for (int i = 0; i < queueSnapshot.size(); i++) {
      Pair<StreamingOffset, FileScanTask> elem = queueSnapshot.get(i);
      long fileRows = elem.second().file().recordCount();

      // Hard limit on files - stop BEFORE exceeding
      if (filesSeen + 1 > unpackedLimits.getMaxFiles()) {
        if (filesSeen == 0) {
          return null;
        }
        LOG.debug(
            "latestOffset hit file limit at {}, rows: {}, files: {}",
            elem.first(),
            rowsSeen,
            filesSeen);
        return elem.first();
      }

      // Soft limit on rows - include file FIRST, then check
      rowsSeen += fileRows;
      filesSeen += 1;

      // Check if we've hit the row limit after including this file
      if (rowsSeen >= unpackedLimits.getMaxRows()) {
        if (filesSeen == 1 && rowsSeen > unpackedLimits.getMaxRows()) {
          LOG.warn(
              "File {} at offset {} contains {} records, exceeding maxRecordsPerMicroBatch limit of {}. "
                  + "This file will be processed entirely to guarantee forward progress. "
                  + "Consider increasing the limit or writing smaller files to avoid unexpected memory usage.",
              elem.second().file().location(),
              elem.first(),
              fileRows,
              unpackedLimits.getMaxRows());
        }
        // Return the offset of the NEXT element (or synthesize tail+1)
        if (i + 1 < queueSnapshot.size()) {
          LOG.debug(
              "latestOffset hit row limit at {}, rows: {}, files: {}",
              queueSnapshot.get(i + 1).first(),
              rowsSeen,
              filesSeen);
          return queueSnapshot.get(i + 1).first();
        } else {
          // This is the last element - return tail+1
          StreamingOffset current = elem.first();
          StreamingOffset result =
              new StreamingOffset(
                  current.snapshotId(), current.position() + 1, current.shouldScanAllFiles());
          LOG.debug(
              "latestOffset hit row limit at tail {}, rows: {}, files: {}",
              result,
              rowsSeen,
              filesSeen);
          return result;
        }
      }
    }

    // if we got here there aren't enough files to exceed our limits
    if (queueTail != null) {
      StreamingOffset tailOffset = queueTail.first();
      // we have to increment the position by 1 since we want to include the tail in the read and
      // position is non-inclusive
      StreamingOffset latestOffset =
          new StreamingOffset(
              tailOffset.snapshotId(), tailOffset.position() + 1, tailOffset.shouldScanAllFiles());
      LOG.debug("latestOffset returning all queued data {}", latestOffset);
      return latestOffset;
    }

    // if we got here the queue is empty
    LOG.debug("latestOffset no data, returning null");
    return null;
  }

  /** Background task wrapper: refreshes the table and records any failure instead of throwing. */
  private void refreshAndTrapException() {
    try {
      table().refresh();
    } catch (Throwable t) {
      LOG.error("Failed to refresh table {}", table().name(), t);
      refreshFailedThrowable = t;
    }
  }

  /** Background task wrapper: fills the queue and records any failure instead of throwing. */
  private void fillQueueAndTrapException(Snapshot snapshot) {
    try {
      fillQueue(snapshot);
    } catch (Throwable t) {
      LOG.error("Failed to fill queue for table {}", table().name(), t);
      fillQueueFailedThrowable = t;
    }
  }

  /**
   * Generate a MicroBatch based on input parameters and add to the queue.
   *
   * <p>Each task is queued with an offset whose position starts at {@code startFileIndex} and
   * increases by one per task. The snapshot is then remembered as the last queued snapshot.
   */
  private void addMicroBatchToQueue(
      Snapshot snapshot, long startFileIndex, long endFileIndex, boolean shouldScanAllFile) {
    LOG.info("Adding MicroBatch for snapshot: {} to the queue", snapshot.snapshotId());
    MicroBatches.MicroBatch microBatch =
        MicroBatches.from(snapshot, table().io())
            .caseSensitive(readConf().caseSensitive())
            .specsById(table().specs())
            .generate(startFileIndex, endFileIndex, Long.MAX_VALUE, shouldScanAllFile);

    long position = startFileIndex;
    for (FileScanTask task : microBatch.tasks()) {
      Pair<StreamingOffset, FileScanTask> elem =
          Pair.of(new StreamingOffset(microBatch.snapshotId(), position, shouldScanAllFile), task);
      queuedFileCount.incrementAndGet();
      queuedRowCount.addAndGet(task.file().recordCount());
      queue.addLast(elem);
      position += 1;
    }
    if (LOG.isDebugEnabled()) {
      StringBuilder sb = new StringBuilder("\n");
      for (Pair<StreamingOffset, FileScanTask> elem : queue) {
        sb.append(elem.first()).append("\n");
      }
      LOG.debug(sb.toString());
    }
    lastQueuedSnapshot = snapshot;
  }

  /**
   * Initial, synchronous fill used by the constructor.
   *
   * <p>The snapshot of {@code fromOffset}, if it resolves, is queued from {@code fromOffset}'s
   * position onward. With a non-null {@code toOffset}, following valid snapshots are queued until
   * the snapshot of {@code toOffset} has been added or no further valid snapshot exists. With a
   * null {@code toOffset}, the bounded preload in {@link #fillQueueInitialBuffer} is used.
   */
  private void fillQueue(StreamingOffset fromOffset, StreamingOffset toOffset) {
    LOG.debug("filling queue from {}, to: {}", fromOffset, toOffset);
    Snapshot currentSnapshot = table().snapshot(fromOffset.snapshotId());
    // this could be a partial snapshot so add it outside the loop
    if (currentSnapshot != null) {
      addMicroBatchToQueue(
          currentSnapshot,
          fromOffset.position(),
          MicroBatchUtils.addedFilesCount(table(), currentSnapshot),
          fromOffset.shouldScanAllFiles());
    }
    if (toOffset != null) {
      if (currentSnapshot != null) {
        while (currentSnapshot.snapshotId() != toOffset.snapshotId()) {
          currentSnapshot = nextValidSnapshot(currentSnapshot);
          if (currentSnapshot != null) {
            addMicroBatchToQueue(
                currentSnapshot,
                0,
                MicroBatchUtils.addedFilesCount(table(), currentSnapshot),
                false);
          } else {
            break;
          }
        }
      }
      // toOffset snapshot already added in loop when currentSnapshot == toOffset
    } else {
      fillQueueInitialBuffer(currentSnapshot);
    }
  }

  /**
   * Preloads snapshots after {@code startSnapshot} until both the row and file preload limits are
   * met or the preload end snapshot is reached.
   *
   * @param startSnapshot the snapshot already queued by the caller, or null when starting from
   *     {@code START_OFFSET}
   */
  private void fillQueueInitialBuffer(Snapshot startSnapshot) {
    // toOffset is null - fill initial buffer to prevent queue starvation before background
    // thread starts. Use configured limits to avoid loading all snapshots
    // (which could cause OOM on tables with thousands of snapshots).
    long targetRows = readConf().asyncQueuePreloadRowLimit();
    long targetFiles = readConf().asyncQueuePreloadFileLimit();

    Snapshot preloadEndSnapshot = initialPreloadEndSnapshot();
    if (preloadEndSnapshot == null) {
      return; // Empty table
    }

    // START_OFFSET case: initialize using nextValidSnapshot which respects timestamp filtering
    Snapshot current = startSnapshot;
    if (current == null) {
      current = nextValidSnapshot(null);
      if (current != null) {
        addMicroBatchToQueue(current, 0, MicroBatchUtils.addedFilesCount(table(), current), false);
      }
    }

    // Continue loading more snapshots within safety limits
    if (current != null) {
      while ((queuedRowCount.get() < targetRows || queuedFileCount.get() < targetFiles)
          && current.snapshotId() != preloadEndSnapshot.snapshotId()) {
        current = nextValidSnapshot(current);
        if (current != null) {
          addMicroBatchToQueue(
              current, 0, MicroBatchUtils.addedFilesCount(table(), current), false);
        } else {
          break;
        }
      }
    }
  }

  /**
   * Returns the snapshot at which the initial preload must stop: the Trigger.AvailableNow cap
   * snapshot when a cap is set, otherwise the table's current snapshot.
   */
  private Snapshot initialPreloadEndSnapshot() {
    if (lastOffsetForTriggerAvailableNow != null) {
      return table().snapshot(lastOffsetForTriggerAvailableNow.snapshotId());
    }

    return table().currentSnapshot();
  }

  /**
   * Returns true when a Trigger.AvailableNow cap is set and {@code readFrom} is the cap's
   * snapshot; false if either argument is null.
   */
  @VisibleForTesting
  static boolean reachedAvailableNowCap(
      Snapshot readFrom, StreamingOffset lastOffsetForTriggerAvailableNow) {
    return lastOffsetForTriggerAvailableNow != null
        && readFrom != null
        && readFrom.snapshotId() == lastOffsetForTriggerAvailableNow.snapshotId();
  }

  /** Try to populate the queue with data from unread snapshots */
  private void fillQueue(Snapshot readFrom) {
    // Don't add beyond cap for Trigger.AvailableNow
    if (reachedAvailableNowCap(readFrom, lastOffsetForTriggerAvailableNow)) {
      LOG.debug(
          "Reached cap snapshot {}, not adding more",
          this.lastOffsetForTriggerAvailableNow.snapshotId());
      return;
    }

    if ((queuedRowCount.get() > minQueuedRows) || (queuedFileCount.get() > minQueuedFiles)) {
      // we have enough data buffered, check back shortly
      LOG.debug(
          "Buffer is full, {} > {} or {} > {}",
          queuedRowCount.get(),
          minQueuedRows,
          queuedFileCount.get(),
          minQueuedFiles);
    } else {
      // add an entire snapshot to the queue
      Snapshot nextValidSnapshot = nextValidSnapshot(readFrom);
      if (nextValidSnapshot != null) {
        addMicroBatchToQueue(
            nextValidSnapshot,
            0,
            MicroBatchUtils.addedFilesCount(table(), nextValidSnapshot),
            false);
      } else {
        LOG.debug("No snapshots ready to be read");
      }
    }
  }
}
package org.apache.iceberg.spark.source;

import java.util.Locale;
import org.apache.iceberg.DataOperations;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.spark.SparkReadConf;
import org.apache.iceberg.spark.SparkReadOptions;
import org.apache.iceberg.util.SnapshotUtil;
import org.apache.spark.sql.connector.read.streaming.CompositeReadLimit;
import org.apache.spark.sql.connector.read.streaming.ReadLimit;
import org.apache.spark.sql.connector.read.streaming.ReadMaxFiles;
import org.apache.spark.sql.connector.read.streaming.ReadMaxRows;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Common base for micro-batch planners: holds the table and read configuration and provides the
 * snapshot filtering and snapshot walking shared by subclasses.
 */
abstract class BaseSparkMicroBatchPlanner implements SparkMicroBatchPlanner {
  private static final Logger LOG = LoggerFactory.getLogger(BaseSparkMicroBatchPlanner.class);
  private final Table table;
  private final SparkReadConf readConf;

  BaseSparkMicroBatchPlanner(Table table, SparkReadConf readConf) {
    this.table = table;
    this.readConf = readConf;
  }

  /** Returns the table this planner reads from. */
  protected Table table() {
    return table;
  }

  /** Returns the read configuration this planner was created with. */
  protected SparkReadConf readConf() {
    return readConf;
  }

  /**
   * Decides whether a snapshot contributes data to the stream.
   *
   * @param snapshot the snapshot to classify by its operation
   * @return true for append snapshots; false for replace snapshots and for delete/overwrite
   *     snapshots when skipping them is enabled
   * @throws IllegalStateException for a delete or overwrite snapshot whose skip option is not
   *     enabled, or for an unrecognized operation
   */
  protected boolean shouldProcess(Snapshot snapshot) {
    String operation = snapshot.operation();

    if (DataOperations.APPEND.equals(operation)) {
      return true;
    }

    if (DataOperations.REPLACE.equals(operation)) {
      return false;
    }

    if (DataOperations.DELETE.equals(operation)) {
      Preconditions.checkState(
          readConf.streamingSkipDeleteSnapshots(),
          "Cannot process delete snapshot: %s, to ignore deletes, set %s=true",
          snapshot.snapshotId(),
          SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS);
      return false;
    }

    if (DataOperations.OVERWRITE.equals(operation)) {
      Preconditions.checkState(
          readConf.streamingSkipOverwriteSnapshots(),
          "Cannot process overwrite snapshot: %s, to ignore overwrites, set %s=true",
          snapshot.snapshotId(),
          SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS);
      return false;
    }

    throw new IllegalStateException(
        String.format(
            "Cannot process unknown snapshot operation: %s (snapshot id %s)",
            operation.toLowerCase(Locale.ROOT), snapshot.snapshotId()));
  }

  /**
   * Finds the next snapshot that should be processed, stepping over snapshots that {@link
   * #shouldProcess} rejects. Callers must handle a null result.
   *
   * @param curSnapshot the snapshot to continue after, or null to begin from the starting offset
   *     derived from the configured stream-from timestamp
   * @return the next snapshot to process, or null when there is none left up to and including the
   *     table's current snapshot
   */
  protected Snapshot nextValidSnapshot(Snapshot curSnapshot) {
    Snapshot candidate;
    if (curSnapshot != null) {
      if (isCurrentSnapshot(curSnapshot)) {
        return null;
      }

      candidate = SnapshotUtil.snapshotAfter(table, curSnapshot.snapshotId());
    } else {
      // no previous snapshot: look up the starting offset again
      StreamingOffset startingOffset =
          MicroBatchUtils.determineStartingOffset(table, readConf.streamFromTimestamp());
      LOG.debug("determineStartingOffset picked startingOffset: {}", startingOffset);
      if (StreamingOffset.START_OFFSET.equals(startingOffset)) {
        return null;
      }

      candidate = table.snapshot(startingOffset.snapshotId());
    }

    while (true) {
      if (shouldProcess(candidate)) {
        return candidate;
      }

      LOG.debug("Skipping snapshot: {}", candidate);
      // stop at the table's current snapshot so snapshotAfter is never asked for a successor
      // that does not exist
      if (isCurrentSnapshot(candidate)) {
        return null;
      }

      candidate = SnapshotUtil.snapshotAfter(table, candidate.snapshotId());
    }
  }

  /** Returns true if the given snapshot has the same id as the table's current snapshot. */
  private boolean isCurrentSnapshot(Snapshot snapshot) {
    return snapshot.snapshotId() == table.currentSnapshot().snapshotId();
  }

  /**
   * Extracts the row and file maximums from a {@link ReadLimit}. Both default to {@code
   * Integer.MAX_VALUE} when the limit does not specify them.
   */
  static class UnpackedLimits {
    private long maxRows = Integer.MAX_VALUE;
    private long maxFiles = Integer.MAX_VALUE;

    UnpackedLimits(ReadLimit limit) {
      if (limit instanceof ReadMaxRows) {
        this.maxRows = ((ReadMaxRows) limit).maxRows();
      } else if (limit instanceof ReadMaxFiles) {
        this.maxFiles = ((ReadMaxFiles) limit).maxFiles();
      } else if (limit instanceof CompositeReadLimit) {
        // for a composite limit the smallest value of each kind wins
        for (ReadLimit part : ((CompositeReadLimit) limit).getReadLimits()) {
          if (part instanceof ReadMaxRows) {
            this.maxRows = Math.min(this.maxRows, ((ReadMaxRows) part).maxRows());
          } else if (part instanceof ReadMaxFiles) {
            this.maxFiles = Math.min(this.maxFiles, ((ReadMaxFiles) part).maxFiles());
          }
        }
      }
    }

    public long getMaxRows() {
      return maxRows;
    }

    public long getMaxFiles() {
      return maxFiles;
    }
  }
}
package org.apache.iceberg.spark.source;

import java.util.List;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.SnapshotChanges;
import org.apache.iceberg.SnapshotSummary;
import org.apache.iceberg.Table;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.iceberg.util.SnapshotUtil;
import org.apache.spark.sql.connector.read.streaming.ReadLimit;

/** Static helpers shared by the micro-batch planners. */
class MicroBatchUtils {

  private MicroBatchUtils() {}

  /**
   * Picks the offset a stream should start from.
   *
   * @param table the table being streamed
   * @param fromTimestamp the stream-from timestamp; {@code Long.MIN_VALUE} means start from the
   *     oldest ancestor
   * @return an offset at position 0 of the chosen snapshot, or {@code START_OFFSET} when the table
   *     has no current snapshot, the current snapshot is older than {@code fromTimestamp}, or no
   *     ancestor after the timestamp is found
   */
  static StreamingOffset determineStartingOffset(Table table, long fromTimestamp) {
    if (table.currentSnapshot() == null) {
      return StreamingOffset.START_OFFSET;
    }

    // MIN_VALUE is the default, so go straight to the oldest snapshot instead of searching for it
    boolean startFromOldest = fromTimestamp == Long.MIN_VALUE;
    if (startFromOldest) {
      return new StreamingOffset(SnapshotUtil.oldestAncestor(table).snapshotId(), 0, false);
    }

    if (table.currentSnapshot().timestampMillis() < fromTimestamp) {
      return StreamingOffset.START_OFFSET;
    }

    try {
      Snapshot firstAfter = SnapshotUtil.oldestAncestorAfter(table, fromTimestamp);
      if (firstAfter == null) {
        return StreamingOffset.START_OFFSET;
      }

      return new StreamingOffset(firstAfter.snapshotId(), 0, false);
    } catch (IllegalStateException e) {
      // the first snapshot after the timestamp could not be determined; fall back to the oldest
      // ancestor
      return new StreamingOffset(SnapshotUtil.oldestAncestor(table).snapshotId(), 0, false);
    }
  }

  /**
   * Returns the number of files added by a snapshot, read from the snapshot summary when the
   * property is present and counted from the snapshot's added data files otherwise.
   */
  static long addedFilesCount(Table table, Snapshot snapshot) {
    long fromSummary =
        PropertyUtil.propertyAsLong(snapshot.summary(), SnapshotSummary.ADDED_FILES_PROP, -1);
    if (fromSummary != -1) {
      return fromSummary;
    }

    return Iterables.size(
        SnapshotChanges.builderFor(table).snapshot(snapshot).build().addedDataFiles());
  }
}

/** Plans the files and end offsets of streaming micro-batches. */
interface SparkMicroBatchPlanner {
  /**
   * Plans the {@link FileScanTask}s covering the data added between two offsets.
   *
   * @param startOffset the offset to start planning from
   * @param endOffset the offset to plan up to
   * @return file scan tasks for data in the offset range
   */
  List<FileScanTask> planFiles(StreamingOffset startOffset, StreamingOffset endOffset);

  /**
   * Computes how far the stream can advance from {@code startOffset} without exceeding the given
   * {@link ReadLimit}.
   *
   * @param startOffset the current offset of the stream
   * @param limit the read limit bounding how far ahead to advance
   * @return the latest available offset, or {@code null} if no new data is available
   */
  StreamingOffset latestOffset(StreamingOffset startOffset, ReadLimit limit);

  /** Stops the planner and releases any resources it holds. */
  void stop();
}
org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.relocated.com.google.common.base.Joiner; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadConf; -import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.Pair; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.SnapshotUtil; import org.apache.iceberg.util.TableScanUtil; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.broadcast.Broadcast; import org.apache.spark.sql.connector.read.InputPartition; import org.apache.spark.sql.connector.read.PartitionReaderFactory; -import org.apache.spark.sql.connector.read.streaming.CompositeReadLimit; import org.apache.spark.sql.connector.read.streaming.MicroBatchStream; import org.apache.spark.sql.connector.read.streaming.Offset; import org.apache.spark.sql.connector.read.streaming.ReadLimit; -import org.apache.spark.sql.connector.read.streaming.ReadMaxFiles; -import org.apache.spark.sql.connector.read.streaming.ReadMaxRows; import org.apache.spark.sql.connector.read.streaming.SupportsTriggerAvailableNow; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -79,6 +63,7 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsTriggerA private final Table table; private final Supplier fileIO; + private final SparkReadConf readConf; private final String branch; private final boolean caseSensitive; private final String expectedSchema; @@ -89,12 +74,11 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsTriggerA private final long splitOpenFileCost; private final boolean 
localityPreferred; private final StreamingOffset initialOffset; - private final boolean skipDelete; - private final boolean skipOverwrite; private final long fromTimestamp; private final int maxFilesPerMicroBatch; private final int maxRecordsPerMicroBatch; private final boolean cacheDeleteFilesOnExecutors; + private SparkMicroBatchPlanner planner; private StreamingOffset lastOffsetForTriggerAvailableNow; SparkMicroBatchStream( @@ -106,6 +90,7 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsTriggerA String checkpointLocation) { this.table = table; this.fileIO = fileIO; + this.readConf = readConf; this.branch = readConf.branch(); this.caseSensitive = readConf.caseSensitive(); this.expectedSchema = SchemaParser.toJson(expectedSchema); @@ -124,9 +109,6 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsTriggerA new InitialOffsetStore( table, checkpointLocation, fromTimestamp, sparkContext.hadoopConfiguration()); this.initialOffset = initialOffsetStore.initialOffset(); - - this.skipDelete = readConf.streamingSkipDeleteSnapshots(); - this.skipOverwrite = readConf.streamingSkipOverwriteSnapshots(); } @Override @@ -141,8 +123,8 @@ public Offset latestOffset() { } Snapshot latestSnapshot = table.currentSnapshot(); - - return new StreamingOffset(latestSnapshot.snapshotId(), addedFilesCount(latestSnapshot), false); + return new StreamingOffset( + latestSnapshot.snapshotId(), MicroBatchUtils.addedFilesCount(table, latestSnapshot), false); } @Override @@ -161,7 +143,11 @@ public InputPartition[] planInputPartitions(Offset start, Offset end) { StreamingOffset endOffset = (StreamingOffset) end; StreamingOffset startOffset = (StreamingOffset) start; - List fileScanTasks = planFiles(startOffset, endOffset); + if (planner == null) { + initializePlanner(startOffset, endOffset); + } + + List fileScanTasks = planner.planFiles(startOffset, endOffset); CloseableIterable splitTasks = 
TableScanUtil.splitFiles(CloseableIterable.withNoopClose(fileScanTasks), splitSize); @@ -171,7 +157,6 @@ public InputPartition[] planInputPartitions(Offset start, Offset end) { String[][] locations = computePreferredLocations(combinedScanTasks); InputPartition[] partitions = new InputPartition[combinedScanTasks.size()]; - for (int index = 0; index < combinedScanTasks.size(); index++) { partitions[index] = new SparkInputPartition( @@ -214,318 +199,35 @@ public Offset deserializeOffset(String json) { public void commit(Offset end) {} @Override - public void stop() {} - - private List planFiles(StreamingOffset startOffset, StreamingOffset endOffset) { - List fileScanTasks = Lists.newArrayList(); - StreamingOffset batchStartOffset = - StreamingOffset.START_OFFSET.equals(startOffset) - ? determineStartingOffset(table, fromTimestamp) - : startOffset; - - StreamingOffset currentOffset = null; - - // [(startOffset : startFileIndex), (endOffset : endFileIndex) ) - do { - long endFileIndex; - if (currentOffset == null) { - currentOffset = batchStartOffset; - } else { - Snapshot snapshotAfter = SnapshotUtil.snapshotAfter(table, currentOffset.snapshotId()); - // it may happen that we need to read this snapshot partially in case it's equal to - // endOffset. 
- if (currentOffset.snapshotId() != endOffset.snapshotId()) { - currentOffset = new StreamingOffset(snapshotAfter.snapshotId(), 0L, false); - } else { - currentOffset = endOffset; - } - } - - Snapshot snapshot = table.snapshot(currentOffset.snapshotId()); - - validateCurrentSnapshotExists(snapshot, currentOffset); - - if (!shouldProcess(snapshot)) { - LOG.debug("Skipping snapshot: {} of table {}", currentOffset.snapshotId(), table.name()); - continue; - } - - Snapshot currentSnapshot = table.snapshot(currentOffset.snapshotId()); - if (currentOffset.snapshotId() == endOffset.snapshotId()) { - endFileIndex = endOffset.position(); - } else { - endFileIndex = addedFilesCount(currentSnapshot); - } - - MicroBatch latestMicroBatch = - MicroBatches.from(currentSnapshot, table.io()) - .caseSensitive(caseSensitive) - .specsById(table.specs()) - .generate( - currentOffset.position(), - endFileIndex, - Long.MAX_VALUE, - currentOffset.shouldScanAllFiles()); - - fileScanTasks.addAll(latestMicroBatch.tasks()); - } while (currentOffset.snapshotId() != endOffset.snapshotId()); - - return fileScanTasks; - } - - private boolean shouldProcess(Snapshot snapshot) { - String op = snapshot.operation(); - switch (op) { - case DataOperations.APPEND: - return true; - case DataOperations.REPLACE: - return false; - case DataOperations.DELETE: - Preconditions.checkState( - skipDelete, - "Cannot process delete snapshot: %s, to ignore deletes, set %s=true", - snapshot.snapshotId(), - SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS); - return false; - case DataOperations.OVERWRITE: - Preconditions.checkState( - skipOverwrite, - "Cannot process overwrite snapshot: %s, to ignore overwrites, set %s=true", - snapshot.snapshotId(), - SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS); - return false; - default: - throw new IllegalStateException( - String.format( - "Cannot process unknown snapshot operation: %s (snapshot id %s)", - op.toLowerCase(Locale.ROOT), snapshot.snapshotId())); - } - } - - 
private static StreamingOffset determineStartingOffset(Table table, Long fromTimestamp) { - if (table.currentSnapshot() == null) { - return StreamingOffset.START_OFFSET; - } - - if (fromTimestamp == null) { - // match existing behavior and start from the oldest snapshot - return new StreamingOffset(SnapshotUtil.oldestAncestor(table).snapshotId(), 0, false); - } - - if (table.currentSnapshot().timestampMillis() < fromTimestamp) { - return StreamingOffset.START_OFFSET; - } - - try { - Snapshot snapshot = SnapshotUtil.oldestAncestorAfter(table, fromTimestamp); - if (snapshot != null) { - return new StreamingOffset(snapshot.snapshotId(), 0, false); - } else { - return StreamingOffset.START_OFFSET; - } - } catch (IllegalStateException e) { - // could not determine the first snapshot after the timestamp. use the oldest ancestor instead - return new StreamingOffset(SnapshotUtil.oldestAncestor(table).snapshotId(), 0, false); + public void stop() { + if (planner != null) { + planner.stop(); } } - private static int getMaxFiles(ReadLimit readLimit) { - if (readLimit instanceof ReadMaxFiles) { - return ((ReadMaxFiles) readLimit).maxFiles(); - } - - if (readLimit instanceof CompositeReadLimit) { - // We do not expect a CompositeReadLimit to contain a nested CompositeReadLimit. - // In fact, it should only be a composite of two or more of ReadMinRows, ReadMaxRows and - // ReadMaxFiles, with no more than one of each. 
- ReadLimit[] limits = ((CompositeReadLimit) readLimit).getReadLimits(); - for (ReadLimit limit : limits) { - if (limit instanceof ReadMaxFiles) { - return ((ReadMaxFiles) limit).maxFiles(); - } - } - } - - // there is no ReadMaxFiles, so return the default - return Integer.MAX_VALUE; - } - - private static int getMaxRows(ReadLimit readLimit) { - if (readLimit instanceof ReadMaxRows) { - long maxRows = ((ReadMaxRows) readLimit).maxRows(); - return Math.toIntExact(maxRows); - } - - if (readLimit instanceof CompositeReadLimit) { - ReadLimit[] limits = ((CompositeReadLimit) readLimit).getReadLimits(); - for (ReadLimit limit : limits) { - if (limit instanceof ReadMaxRows) { - long maxRows = ((ReadMaxRows) limit).maxRows(); - return Math.toIntExact(maxRows); - } - } + private void initializePlanner(StreamingOffset startOffset, StreamingOffset endOffset) { + if (readConf.asyncMicroBatchPlanningEnabled()) { + this.planner = + new AsyncSparkMicroBatchPlanner( + table, readConf, startOffset, endOffset, lastOffsetForTriggerAvailableNow); + } else { + this.planner = + new SyncSparkMicroBatchPlanner(table, readConf, lastOffsetForTriggerAvailableNow); } - - // There is no ReadMaxRows, so return the default - return Integer.MAX_VALUE; } @Override - @SuppressWarnings("checkstyle:CyclomaticComplexity") public Offset latestOffset(Offset startOffset, ReadLimit limit) { - // calculate end offset get snapshotId from the startOffset Preconditions.checkArgument( startOffset instanceof StreamingOffset, "Invalid start offset: %s is not a StreamingOffset", startOffset); - table.refresh(); - if (table.currentSnapshot() == null) { - return StreamingOffset.START_OFFSET; - } - - if (table.currentSnapshot().timestampMillis() < fromTimestamp) { - return StreamingOffset.START_OFFSET; + if (planner == null) { + initializePlanner((StreamingOffset) startOffset, null); } - // end offset can expand to multiple snapshots - StreamingOffset startingOffset = (StreamingOffset) startOffset; - - if 
(startOffset.equals(StreamingOffset.START_OFFSET)) { - startingOffset = determineStartingOffset(table, fromTimestamp); - } - - Snapshot curSnapshot = table.snapshot(startingOffset.snapshotId()); - validateCurrentSnapshotExists(curSnapshot, startingOffset); - - // Use the pre-computed snapshotId when Trigger.AvailableNow is enabled. - long latestSnapshotId = - lastOffsetForTriggerAvailableNow != null - ? lastOffsetForTriggerAvailableNow.snapshotId() - : table.currentSnapshot().snapshotId(); - - int startPosOfSnapOffset = (int) startingOffset.position(); - - boolean scanAllFiles = startingOffset.shouldScanAllFiles(); - - boolean shouldContinueReading = true; - int curFilesAdded = 0; - long curRecordCount = 0; - int curPos = 0; - - // Note : we produce nextOffset with pos as non-inclusive - while (shouldContinueReading) { - // generate manifest index for the curSnapshot - List> indexedManifests = - MicroBatches.skippedManifestIndexesFromSnapshot( - table.io(), curSnapshot, startPosOfSnapOffset, scanAllFiles); - // this is under assumption we will be able to add at-least 1 file in the new offset - for (int idx = 0; idx < indexedManifests.size() && shouldContinueReading; idx++) { - // be rest assured curPos >= startFileIndex - curPos = indexedManifests.get(idx).second(); - try (CloseableIterable taskIterable = - MicroBatches.openManifestFile( - table.io(), - table.specs(), - caseSensitive, - curSnapshot, - indexedManifests.get(idx).first(), - scanAllFiles); - CloseableIterator taskIter = taskIterable.iterator()) { - while (taskIter.hasNext()) { - FileScanTask task = taskIter.next(); - if (curPos >= startPosOfSnapOffset) { - if ((curFilesAdded + 1) > getMaxFiles(limit)) { - // On including the file it might happen that we might exceed, the configured - // soft limit on the number of records, since this is a soft limit its acceptable. 
- shouldContinueReading = false; - break; - } - - curFilesAdded += 1; - curRecordCount += task.file().recordCount(); - - if (curRecordCount >= getMaxRows(limit)) { - // we included the file, so increment the number of files - // read in the current snapshot. - ++curPos; - shouldContinueReading = false; - break; - } - } - ++curPos; - } - } catch (IOException ioe) { - LOG.warn("Failed to close task iterable", ioe); - } - } - // if the currentSnapShot was also the latestSnapshot then break - if (curSnapshot.snapshotId() == latestSnapshotId) { - break; - } - - // if everything was OK and we consumed complete snapshot then move to next snapshot - if (shouldContinueReading) { - Snapshot nextValid = nextValidSnapshot(curSnapshot); - if (nextValid == null) { - // nextValid implies all the remaining snapshots should be skipped. - break; - } - // we found the next available snapshot, continue from there. - curSnapshot = nextValid; - startPosOfSnapOffset = -1; - // if anyhow we are moving to next snapshot we should only scan addedFiles - scanAllFiles = false; - } - } - - StreamingOffset latestStreamingOffset = - new StreamingOffset(curSnapshot.snapshotId(), curPos, scanAllFiles); - - // if no new data arrived, then return null. - return latestStreamingOffset.equals(startingOffset) ? null : latestStreamingOffset; - } - - /** - * Get the next snapshot skiping over rewrite and delete snapshots. - * - * @param curSnapshot the current snapshot - * @return the next valid snapshot (not a rewrite or delete snapshot), returns null if all - * remaining snapshots should be skipped. 
- */ - private Snapshot nextValidSnapshot(Snapshot curSnapshot) { - Snapshot nextSnapshot = SnapshotUtil.snapshotAfter(table, curSnapshot.snapshotId()); - // skip over rewrite and delete snapshots - while (!shouldProcess(nextSnapshot)) { - LOG.debug("Skipping snapshot: {} of table {}", nextSnapshot.snapshotId(), table.name()); - // if the currentSnapShot was also the mostRecentSnapshot then break - if (nextSnapshot.snapshotId() == table.currentSnapshot().snapshotId()) { - return null; - } - nextSnapshot = SnapshotUtil.snapshotAfter(table, nextSnapshot.snapshotId()); - } - return nextSnapshot; - } - - private long addedFilesCount(Snapshot snapshot) { - long addedFilesCount = - PropertyUtil.propertyAsLong(snapshot.summary(), SnapshotSummary.ADDED_FILES_PROP, -1); - // If snapshotSummary doesn't have SnapshotSummary.ADDED_FILES_PROP, - // iterate through addedFiles iterator to find addedFilesCount. - return addedFilesCount == -1 - ? Iterables.size( - SnapshotChanges.builderFor(table).snapshot(snapshot).build().addedDataFiles()) - : addedFilesCount; - } - - private void validateCurrentSnapshotExists(Snapshot snapshot, StreamingOffset currentOffset) { - if (snapshot == null) { - throw new IllegalStateException( - String.format( - Locale.ROOT, - "Cannot load current offset at snapshot %d, the snapshot was expired or removed", - currentOffset.snapshotId())); - } + return planner.latestOffset((StreamingOffset) startOffset, limit); } @Override @@ -553,6 +255,11 @@ public void prepareForTriggerAvailableNow() { (StreamingOffset) latestOffset(initialOffset, ReadLimit.allAvailable()); LOG.info("lastOffset for Trigger.AvailableNow is {}", lastOffsetForTriggerAvailableNow.json()); + + if (planner != null) { + planner.stop(); + planner = null; + } } private static class InitialOffsetStore { @@ -576,7 +283,7 @@ public StreamingOffset initialOffset() { } table.refresh(); - StreamingOffset offset = determineStartingOffset(table, fromTimestamp); + StreamingOffset offset = 
MicroBatchUtils.determineStartingOffset(table, fromTimestamp); OutputFile outputFile = io.newOutputFile(initialOffsetLocation); writeOffset(offset, outputFile); diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java index ddad1a749aa9..f926bd96389a 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java @@ -110,6 +110,7 @@ class SparkPositionDeltaWrite implements DeltaWrite, RequiresDistributionAndOrde private final String branch; private final Map extraSnapshotMetadata; private final SparkWriteRequirements writeRequirements; + private final int sortOrderId; private final Context context; private final Map writeProperties; @@ -135,6 +136,7 @@ class SparkPositionDeltaWrite implements DeltaWrite, RequiresDistributionAndOrde this.branch = writeConf.branch(); this.extraSnapshotMetadata = writeConf.extraSnapshotMetadata(); this.writeRequirements = writeConf.positionDeltaRequirements(command); + this.sortOrderId = writeConf.outputSortOrderId(writeRequirements); this.context = new Context(dataSchema, writeConf, info, writeRequirements); this.writeProperties = writeConf.writeProperties(); } @@ -180,7 +182,8 @@ public DeltaWriterFactory createBatchWriterFactory(PhysicalWriteInfo info) { broadcastRewritableDeletes(), command, context, - writeProperties); + writeProperties, + sortOrderId); } private Broadcast> broadcastRewritableDeletes() { @@ -390,18 +393,21 @@ private static class PositionDeltaWriteFactory implements DeltaWriterFactory { private final Command command; private final Context context; private final Map writeProperties; + private final int sortOrderId; PositionDeltaWriteFactory( Broadcast tableBroadcast, Broadcast> rewritableDeletesBroadcast, Command command, Context context, 
- Map writeProperties) { + Map writeProperties, + int sortOrderId) { this.tableBroadcast = tableBroadcast; this.rewritableDeletesBroadcast = rewritableDeletesBroadcast; this.command = command; this.context = context; this.writeProperties = writeProperties; + this.sortOrderId = sortOrderId; } @Override @@ -428,6 +434,7 @@ public DeltaWriter createWriter(int partitionId, long taskId) { .deleteFileFormat(context.deleteFileFormat()) .positionDeleteSparkType(context.deleteSparkType()) .writeProperties(writeProperties) + .dataSortOrder(table.sortOrders().get(sortOrderId)) .build(); if (command == DELETE) { diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java index 353566eb7f34..1348afff6475 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.util.Map; +import java.util.Objects; import java.util.Set; import org.apache.iceberg.BaseMetadataTable; import org.apache.iceberg.BaseTable; @@ -56,6 +57,7 @@ import org.apache.iceberg.spark.CommitMetadata; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.iceberg.spark.SparkSQLProperties; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkTableUtil; import org.apache.iceberg.spark.SparkUtil; @@ -333,11 +335,31 @@ public boolean canDeleteWhere(Predicate[] predicates) { } } - return canDeleteUsingMetadata(deleteExpr); + return canDeleteUsingMetadata(deleteExpr, scanBranchForDelete()); + } + + // Resolves the branch to scan during canDeleteWhere so it matches the branch deleteWhere + // will commit to. Falls back to main when WAP is configured but the WAP branch does not + // exist yet, since this is a read scan. 
+ private String scanBranchForDelete() { + if (branch != null) { + return branch; + } + + if (!SparkTableUtil.wapEnabled(table())) { + return null; + } + + String wapBranch = sparkSession().conf().get(SparkSQLProperties.WAP_BRANCH, null); + if (wapBranch != null && table().refs().containsKey(wapBranch)) { + return wapBranch; + } + + return null; } // a metadata delete is possible iff matching files can be deleted entirely - private boolean canDeleteUsingMetadata(Expression deleteExpr) { + private boolean canDeleteUsingMetadata(Expression deleteExpr, String scanBranch) { boolean caseSensitive = SparkUtil.caseSensitive(sparkSession()); if (ExpressionUtil.selectsPartitions(deleteExpr, table(), caseSensitive)) { @@ -352,14 +374,14 @@ private boolean canDeleteUsingMetadata(Expression deleteExpr) { .includeColumnStats() .ignoreResiduals(); - if (branch != null) { - scan = scan.useRef(branch); + if (scanBranch != null) { + scan = scan.useRef(scanBranch); } try (CloseableIterable tasks = scan.planFiles()) { Map evaluators = Maps.newHashMap(); StrictMetricsEvaluator metricsEvaluator = - new StrictMetricsEvaluator(SnapshotUtil.schemaFor(table(), branch), deleteExpr); + new StrictMetricsEvaluator(SnapshotUtil.schemaFor(table(), scanBranch), deleteExpr); return Iterables.all( tasks, @@ -396,12 +418,13 @@ public void deleteWhere(Predicate[] predicates) { .set("spark.app.id", sparkSession().sparkContext().applicationId()) .deleteFromRowFilter(deleteExpr); + String writeBranch = branch; if (SparkTableUtil.wapEnabled(table())) { - branch = SparkTableUtil.determineWriteBranch(sparkSession(), branch); + writeBranch = SparkTableUtil.determineWriteBranch(sparkSession(), branch); } - if (branch != null) { - deleteFiles.toBranch(branch); + if (writeBranch != null) { + deleteFiles.toBranch(writeBranch); } if (!CommitMetadata.commitProperties().isEmpty()) { @@ -424,15 +447,16 @@ public boolean equals(Object other) { return false; } - // use only name in order to correctly invalidate Spark 
cache SparkTable that = (SparkTable) other; - return icebergTable.name().equals(that.icebergTable.name()); + return icebergTable.name().equals(that.icebergTable.name()) + && Objects.equals(table().uuid(), that.table().uuid()) + && Objects.equals(snapshotId, that.snapshotId) + && Objects.equals(branch, that.branch); } @Override public int hashCode() { - // use only name in order to correctly invalidate Spark cache - return icebergTable.name().hashCode(); + return Objects.hash(icebergTable.name(), table().uuid(), snapshotId, branch); } private static CaseInsensitiveStringMap addSnapshotId( diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java index 15c70e4a6621..aff8864b6d2a 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java @@ -193,6 +193,7 @@ private WriterFactory createWriterFactory() { // broadcast the table metadata as the writer factory will be sent to executors Broadcast
      tableBroadcast = sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); + int sortOrderId = writeConf.outputSortOrderId(writeRequirements); return new WriterFactory( tableBroadcast, queryId, @@ -202,7 +203,8 @@ private WriterFactory createWriterFactory() { writeSchema, dsSchema, useFanoutWriter, - writeProperties); + writeProperties, + sortOrderId); } private void commitOperation(SnapshotUpdate operation, String description) { @@ -672,6 +674,7 @@ private static class WriterFactory implements DataWriterFactory, StreamingDataWr private final boolean useFanoutWriter; private final String queryId; private final Map writeProperties; + private final int sortOrderId; protected WriterFactory( Broadcast
      tableBroadcast, @@ -682,7 +685,8 @@ protected WriterFactory( Schema writeSchema, StructType dsSchema, boolean useFanoutWriter, - Map writeProperties) { + Map writeProperties, + int sortOrderId) { this.tableBroadcast = tableBroadcast; this.format = format; this.outputSpecId = outputSpecId; @@ -692,6 +696,7 @@ protected WriterFactory( this.useFanoutWriter = useFanoutWriter; this.queryId = queryId; this.writeProperties = writeProperties; + this.sortOrderId = sortOrderId; } @Override @@ -716,6 +721,7 @@ public DataWriter createWriter(int partitionId, long taskId, long e .dataSchema(writeSchema) .dataSparkType(dsSchema) .writeProperties(writeProperties) + .dataSortOrder(table.sortOrders().get(sortOrderId)) .build(); if (spec.isUnpartitioned()) { diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java index e608a40b72ad..df4566da0c90 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java @@ -27,27 +27,27 @@ import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.spark.SparkFilters; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkUtil; +import org.apache.iceberg.spark.SparkV2Filters; import org.apache.iceberg.spark.SparkWriteConf; import org.apache.iceberg.spark.SparkWriteRequirements; import org.apache.iceberg.types.TypeUtil; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.connector.expressions.filter.Predicate; import org.apache.spark.sql.connector.read.Scan; import org.apache.spark.sql.connector.write.BatchWrite; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import 
org.apache.spark.sql.connector.write.RowLevelOperation.Command; import org.apache.spark.sql.connector.write.SupportsDynamicOverwrite; -import org.apache.spark.sql.connector.write.SupportsOverwrite; +import org.apache.spark.sql.connector.write.SupportsOverwriteV2; import org.apache.spark.sql.connector.write.Write; import org.apache.spark.sql.connector.write.WriteBuilder; import org.apache.spark.sql.connector.write.streaming.StreamingWrite; -import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.types.LongType$; import org.apache.spark.sql.types.StructType; -class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, SupportsOverwrite { +class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, SupportsOverwriteV2 { private final SparkSession spark; private final Table table; private final SparkWriteConf writeConf; @@ -100,12 +100,12 @@ public WriteBuilder overwriteDynamicPartitions() { } @Override - public WriteBuilder overwrite(Filter[] filters) { + public WriteBuilder overwrite(Predicate[] predicates) { Preconditions.checkState( !overwriteFiles, "Cannot overwrite individual files and using filters"); Preconditions.checkState(rewrittenFileSetId == null, "Cannot overwrite and rewrite"); - this.overwriteExpr = SparkFilters.convert(filters); + this.overwriteExpr = SparkV2Filters.convert(predicates); if (overwriteExpr == Expressions.alwaysTrue() && "dynamic".equals(overwriteMode)) { // use the write option to override truncating the table. use dynamic overwrite instead. 
this.overwriteDynamic = true; diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SyncSparkMicroBatchPlanner.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SyncSparkMicroBatchPlanner.java new file mode 100644 index 000000000000..f1b0029c5432 --- /dev/null +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SyncSparkMicroBatchPlanner.java @@ -0,0 +1,249 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.source; + +import java.io.IOException; +import java.util.List; +import java.util.Locale; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.MicroBatches; +import org.apache.iceberg.MicroBatches.MicroBatch; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.util.Pair; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.spark.sql.connector.read.streaming.ReadLimit; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class SyncSparkMicroBatchPlanner extends BaseSparkMicroBatchPlanner { + private static final Logger LOG = LoggerFactory.getLogger(SyncSparkMicroBatchPlanner.class); + + private final boolean caseSensitive; + private final long fromTimestamp; + private final StreamingOffset lastOffsetForTriggerAvailableNow; + + SyncSparkMicroBatchPlanner( + Table table, SparkReadConf readConf, StreamingOffset lastOffsetForTriggerAvailableNow) { + super(table, readConf); + this.caseSensitive = readConf().caseSensitive(); + this.fromTimestamp = readConf().streamFromTimestamp(); + this.lastOffsetForTriggerAvailableNow = lastOffsetForTriggerAvailableNow; + } + + @Override + public List planFiles(StreamingOffset startOffset, StreamingOffset endOffset) { + List fileScanTasks = Lists.newArrayList(); + StreamingOffset batchStartOffset = + StreamingOffset.START_OFFSET.equals(startOffset) + ? 
MicroBatchUtils.determineStartingOffset(table(), fromTimestamp) + : startOffset; + + StreamingOffset currentOffset = null; + + // [(startOffset : startFileIndex), (endOffset : endFileIndex) ) + do { + long endFileIndex; + if (currentOffset == null) { + currentOffset = batchStartOffset; + } else { + Snapshot snapshotAfter = SnapshotUtil.snapshotAfter(table(), currentOffset.snapshotId()); + // it may happen that we need to read this snapshot partially in case it's equal to + // endOffset. + if (currentOffset.snapshotId() != endOffset.snapshotId()) { + currentOffset = new StreamingOffset(snapshotAfter.snapshotId(), 0L, false); + } else { + currentOffset = endOffset; + } + } + + Snapshot snapshot = table().snapshot(currentOffset.snapshotId()); + + validateCurrentSnapshotExists(snapshot, currentOffset); + + if (!shouldProcess(snapshot)) { + LOG.debug("Skipping snapshot: {} of table {}", currentOffset.snapshotId(), table().name()); + continue; + } + + Snapshot currentSnapshot = table().snapshot(currentOffset.snapshotId()); + if (currentOffset.snapshotId() == endOffset.snapshotId()) { + endFileIndex = endOffset.position(); + } else { + endFileIndex = MicroBatchUtils.addedFilesCount(table(), currentSnapshot); + } + + MicroBatch latestMicroBatch = + MicroBatches.from(currentSnapshot, table().io()) + .caseSensitive(caseSensitive) + .specsById(table().specs()) + .generate( + currentOffset.position(), + endFileIndex, + Long.MAX_VALUE, + currentOffset.shouldScanAllFiles()); + + fileScanTasks.addAll(latestMicroBatch.tasks()); + } while (currentOffset.snapshotId() != endOffset.snapshotId()); + + return fileScanTasks; + } + + @Override + @SuppressWarnings("checkstyle:CyclomaticComplexity") + public StreamingOffset latestOffset(StreamingOffset startOffset, ReadLimit limit) { + table().refresh(); + if (table().currentSnapshot() == null) { + return StreamingOffset.START_OFFSET; + } + + if (table().currentSnapshot().timestampMillis() < fromTimestamp) { + return 
StreamingOffset.START_OFFSET; + } + + // end offset can expand to multiple snapshots + StreamingOffset startingOffset = startOffset; + + if (startOffset.equals(StreamingOffset.START_OFFSET)) { + startingOffset = MicroBatchUtils.determineStartingOffset(table(), fromTimestamp); + } + + Snapshot curSnapshot = table().snapshot(startingOffset.snapshotId()); + validateCurrentSnapshotExists(curSnapshot, startingOffset); + + // Use the pre-computed snapshotId when Trigger.AvailableNow is enabled. + long latestSnapshotId = + lastOffsetForTriggerAvailableNow != null + ? lastOffsetForTriggerAvailableNow.snapshotId() + : table().currentSnapshot().snapshotId(); + + int startPosOfSnapOffset = (int) startingOffset.position(); + + boolean scanAllFiles = startingOffset.shouldScanAllFiles(); + + boolean shouldContinueReading = true; + int curFilesAdded = 0; + long curRecordCount = 0; + int curPos = 0; + + // Extract limits once to avoid repeated calls in tight loop + UnpackedLimits unpackedLimits = new UnpackedLimits(limit); + long maxFiles = unpackedLimits.getMaxFiles(); + long maxRows = unpackedLimits.getMaxRows(); + + // Note : we produce nextOffset with pos as non-inclusive + while (shouldContinueReading) { + // generate manifest index for the curSnapshot + List> indexedManifests = + MicroBatches.skippedManifestIndexesFromSnapshot( + table().io(), curSnapshot, startPosOfSnapOffset, scanAllFiles); + // this is under assumption we will be able to add at-least 1 file in the new offset + for (int idx = 0; idx < indexedManifests.size() && shouldContinueReading; idx++) { + // be rest assured curPos >= startFileIndex + curPos = indexedManifests.get(idx).second(); + try (CloseableIterable taskIterable = + MicroBatches.openManifestFile( + table().io(), + table().specs(), + caseSensitive, + curSnapshot, + indexedManifests.get(idx).first(), + scanAllFiles); + CloseableIterator taskIter = taskIterable.iterator()) { + while (taskIter.hasNext()) { + FileScanTask task = taskIter.next(); + if 
(curPos >= startPosOfSnapOffset) { + if ((curFilesAdded + 1) > maxFiles) { + // On including the file it might happen that we might exceed, the configured + // soft limit on the number of records, since this is a soft limit its acceptable. + shouldContinueReading = false; + break; + } + + curFilesAdded += 1; + curRecordCount += task.file().recordCount(); + + if (curRecordCount >= maxRows) { + // we included the file, so increment the number of files + // read in the current snapshot. + if (curFilesAdded == 1 && curRecordCount > maxRows) { + LOG.warn( + "File {} contains {} records, exceeding maxRecordsPerMicroBatch limit of {}. " + + "This file will be processed entirely to guarantee forward progress. " + + "Consider increasing the limit or writing smaller files to avoid unexpected memory usage.", + task.file().location(), + task.file().recordCount(), + maxRows); + } + ++curPos; + shouldContinueReading = false; + break; + } + } + ++curPos; + } + } catch (IOException ioe) { + LOG.warn("Failed to close task iterable", ioe); + } + } + // if the currentSnapShot was also the latestSnapshot then break + if (curSnapshot.snapshotId() == latestSnapshotId) { + break; + } + + // if everything was OK and we consumed complete snapshot then move to next snapshot + if (shouldContinueReading) { + Snapshot nextValid = nextValidSnapshot(curSnapshot); + if (nextValid == null) { + // nextValid implies all the remaining snapshots should be skipped. + break; + } + // we found the next available snapshot, continue from there. + curSnapshot = nextValid; + startPosOfSnapOffset = -1; + // if anyhow we are moving to next snapshot we should only scan addedFiles + scanAllFiles = false; + } + } + + StreamingOffset latestStreamingOffset = + new StreamingOffset(curSnapshot.snapshotId(), curPos, scanAllFiles); + + // if no new data arrived, then return null. + return latestStreamingOffset.equals(startingOffset) ? 
null : latestStreamingOffset; + } + + @Override + public void stop() {} + + private void validateCurrentSnapshotExists(Snapshot snapshot, StreamingOffset currentOffset) { + if (snapshot == null) { + throw new IllegalStateException( + String.format( + Locale.ROOT, + "Cannot load current offset at snapshot %d, the snapshot was expired or removed", + currentOffset.snapshotId())); + } + } +} diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java index 404ba7284606..9b08d6f7ab1e 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.BeforeEach; @@ -90,6 +91,7 @@ protected static SparkSession initSpark(String serializer) { .master("local[2]") .config("spark.serializer", serializer) .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java index 659507e4c5e3..e28603c0b43a 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import 
org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -73,6 +74,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java index a218f965ea65..2967f0e22cec 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java @@ -23,6 +23,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -61,6 +62,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .config(TestBase.DISABLE_UI) .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java index 2665d7ba8d3b..4f789d2c5ae9 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import 
org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -63,6 +64,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java new file mode 100644 index 000000000000..ee1f29e56fb3 --- /dev/null +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark; + +import com.codahale.metrics.MetricRegistry; +import java.util.Properties; +import org.apache.spark.SparkConf; +import org.apache.spark.metrics.sink.MetricsServlet; +import org.sparkproject.jetty.servlet.ServletContextHandler; + +/** + * A dummy implementation of {@link MetricsServlet} that does not start a server or report metrics. 
+ * This is used in tests to avoid conflicts with Spark's jetty dependencies. + */ +public class DummyMetricsServlet extends MetricsServlet { + + /** + * Constructor required by Spark's reflection-based instantiation. + * + * @param properties Metrics properties + * @param registry Metric registry + */ + public DummyMetricsServlet(Properties properties, MetricRegistry registry) { + super(properties, registry); + } + + @Override + public ServletContextHandler[] getHandlers(SparkConf conf) { + return new ServletContextHandler[] {}; + } + + @Override + public void start() { + // No-op for tests + } + + @Override + public void stop() { + // No-op for tests + } + + @Override + public void report() { + // No-op for tests + } +} diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestBase.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestBase.java index daf4e29ac075..5e7e1a1f6193 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestBase.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestBase.java @@ -65,6 +65,13 @@ public abstract class TestBase extends SparkTestHelperBase { protected static SparkSession spark = null; protected static JavaSparkContext sparkContext = null; protected static HiveCatalog catalog = null; + // disable Spark UI and use dummy servlet to avoid dependency conflicts with Spark's Jetty version + public static final Map DISABLE_UI = + ImmutableMap.of( + "spark.ui.enabled", + "false", + "spark.metrics.conf.*.sink.servlet.class", + "org.apache.iceberg.spark.DummyMetricsServlet"); @BeforeAll public static void startMetastoreAndSpark() { @@ -79,6 +86,7 @@ public static void startMetastoreAndSpark() { .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .config(DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java index 4045847d5a4a..b8f436cf2d86 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java @@ -24,10 +24,12 @@ import java.util.List; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.spark.sql.catalyst.expressions.AttributeReference; import org.apache.spark.sql.catalyst.expressions.MetadataAttribute; import org.apache.spark.sql.catalyst.types.DataTypeUtils; +import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.Test; @@ -80,4 +82,18 @@ public void testSchemaConversionWithMetaDataColumnSchema() { } } } + + @Test + public void testUnknownTypeToSpark() { + Schema schema = new Schema(optional(1, "col", Types.UnknownType.get())); + StructType sparkType = SparkSchemaUtil.convert(schema); + assertThat(sparkType.fields()[0].dataType()).isEqualTo(DataTypes.NullType); + } + + @Test + public void testNullTypeToIceberg() { + StructType sparkType = new StructType().add("col", DataTypes.NullType, true); + Type icebergType = SparkSchemaUtil.convert(sparkType).findField("col").type(); + assertThat(icebergType).isEqualTo(Types.UnknownType.get()); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java index a9b5d1a237b4..89daf195ca73 100644 --- 
a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java @@ -45,6 +45,7 @@ import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.MERGE; import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.UPDATE; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException; import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.time.Duration; @@ -552,6 +553,51 @@ public void testDVWriteConf() { assertThat(writeConf.deleteFileFormat()).isEqualTo(FileFormat.PUFFIN); } + @TestTemplate + public void testSortOrderWriteConf() { + Table table = validationCatalog.loadTable(tableIdent); + + table.replaceSortOrder().asc("id").commit(); + + SparkWriteConf writeConf = + new SparkWriteConf( + spark, table, ImmutableMap.of(SparkWriteOptions.OUTPUT_SORT_ORDER_ID, "1")); + + assertThat(writeConf.outputSortOrderId(SparkWriteRequirements.EMPTY)) + .isEqualTo(table.sortOrder().orderId()); + } + + @TestTemplate + public void testSortOrderWriteConfWithInvalidId() { + Table table = validationCatalog.loadTable(tableIdent); + + table.replaceSortOrder().asc("id").commit(); + + SparkWriteConf writeConfForUnknownSortOrder = + new SparkWriteConf( + spark, table, ImmutableMap.of(SparkWriteOptions.OUTPUT_SORT_ORDER_ID, "999")); + + assertThatIllegalArgumentException() + .isThrownBy( + () -> writeConfForUnknownSortOrder.outputSortOrderId(SparkWriteRequirements.EMPTY)) + .withMessage( + "Cannot use output sort order id 999 because the table does not contain a sort order with that id"); + } + + @TestTemplate + public void testSortOrderWriteConfWithNoOption() { + Table table = validationCatalog.loadTable(tableIdent); + + table.replaceSortOrder().asc("id").commit(); + + SparkWriteConf writeConfNoOption = new SparkWriteConf(spark, table, ImmutableMap.of()); + + 
assertThat(writeConfNoOption.outputSortOrderId(writeConfNoOption.writeRequirements())) + .isEqualTo(table.sortOrder().orderId()); + + assertThat(writeConfNoOption.outputSortOrderId(SparkWriteRequirements.EMPTY)).isEqualTo(0); + } + private void testWriteProperties(List> propertiesSuite) { withSQLConf( propertiesSuite.get(0), diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java index eb89b0a23274..50afb53e0539 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java @@ -662,7 +662,7 @@ public void schemaEvolutionTestWithSparkSQL() throws Exception { "CAST(id AS FLOAT) col1", "CAST(id AS STRING) col2", "CAST(id AS INT) col3") - .registerTempTable("tempdata"); + .createOrReplaceTempView("tempdata"); sql("INSERT INTO TABLE %s SELECT * FROM tempdata", tblName); List expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName); List expectedAfterAddColumn = diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index 411b7e78116f..d74d8a29f994 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -22,6 +22,7 @@ import static org.apache.iceberg.data.FileHelpers.encrypt; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import static org.apache.spark.sql.functions.col; import static org.apache.spark.sql.functions.current_date; import static org.apache.spark.sql.functions.date_add; import static 
org.apache.spark.sql.functions.expr; @@ -127,6 +128,7 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; import org.apache.spark.sql.internal.SQLConf; +import org.apache.spark.sql.types.DataTypes; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.TestTemplate; @@ -1514,7 +1516,7 @@ public void testSortMultipleGroups() { } @TestTemplate - public void testSimpleSort() { + public void testSimpleSort() throws IOException { Table table = createTable(20); shouldHaveFiles(table, 20); table.replaceSortOrder().asc("c2").commit(); @@ -1542,10 +1544,11 @@ public void testSimpleSort() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesSortOrderShouldMatchTableSortOrder(table); } @TestTemplate - public void testSortAfterPartitionChange() { + public void testSortAfterPartitionChange() throws IOException { Table table = createTable(20); shouldHaveFiles(table, 20); table.updateSpec().addField(Expressions.bucket("c1", 4)).commit(); @@ -1576,10 +1579,11 @@ public void testSortAfterPartitionChange() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesSortOrderShouldMatchTableSortOrder(table); } @TestTemplate - public void testSortCustomSortOrder() { + public void testSortCustomSortOrder() throws IOException { Table table = createTable(20); shouldHaveLastCommitUnsorted(table, "c2"); shouldHaveFiles(table, 20); @@ -1605,10 +1609,11 @@ public void testSortCustomSortOrder() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesShouldHaveSortOrderIdMatching(table, SortOrder.unsorted()); } @TestTemplate - public void testSortCustomSortOrderRequiresRepartition() { + public void testSortCustomSortOrderRequiresRepartition() throws IOException { int partitions = 4; Table table = createTable(); 
writeRecords(20, SCALE, partitions); @@ -1644,10 +1649,40 @@ public void testSortCustomSortOrderRequiresRepartition() { shouldHaveMultipleFiles(table); shouldHaveLastCommitUnsorted(table, "c2"); shouldHaveLastCommitSorted(table, "c3"); + dataFilesShouldHaveSortOrderIdMatching(table, SortOrder.unsorted()); } @TestTemplate - public void testAutoSortShuffleOutput() { + public void testSortPastTableSortOrderGetsAppliedToFiles() throws IOException { + Table table = createTable(1); + + table.replaceSortOrder().asc("c3").commit(); + SortOrder c3SortOrder = table.sortOrder(); + + table.replaceSortOrder().asc("c2").commit(); + + List originalData = currentData(); + + RewriteDataFiles.Result result = + basicRewrite(table) + .sort(SortOrder.builderFor(table.schema()).asc("c3").build()) + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") + .execute(); + + assertThat(result.rewriteResults()).as("Should have 1 fileGroups").hasSize(1); + + table.refresh(); + + List postRewriteData = currentData(); + assertEquals("We shouldn't have changed the data", originalData, postRewriteData); + + shouldHaveSnapshots(table, 2); + shouldHaveACleanCache(table); + dataFilesShouldHaveSortOrderIdMatching(table, c3SortOrder); + } + + @TestTemplate + public void testAutoSortShuffleOutput() throws IOException { Table table = createTable(20); shouldHaveLastCommitUnsorted(table, "c2"); shouldHaveFiles(table, 20); @@ -1682,6 +1717,7 @@ public void testAutoSortShuffleOutput() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesShouldHaveSortOrderIdMatching(table, SortOrder.unsorted()); } @TestTemplate @@ -2573,6 +2609,23 @@ public void testExecutorCacheForDeleteFilesDisabled() { .isFalse(); } + @TestTemplate + public void testZOrderUDFWithTimestampNTZType() { + SparkZOrderUDF zorderUDF = new SparkZOrderUDF(1, 16, 1024); + Dataset result = + spark + .sql("SELECT timestamp_ntz '2025-01-01 12:00:00' as test_col") + .withColumn( + 
"zorder_result", + zorderUDF.sortedLexicographically(col("test_col"), DataTypes.TimestampNTZType)); + + assertThat(result.schema().apply("zorder_result").dataType()).isEqualTo(DataTypes.BinaryType); + List rows = result.collectAsList(); + Row row = rows.get(0); + byte[] zorderBytes = row.getAs("zorder_result"); + assertThat(zorderBytes).isNotNull().isNotEmpty(); + } + private double percentFilesRequired(Table table, String col, String value) { return percentFilesRequired(table, new String[] {col}, new String[] {value}); } @@ -2600,4 +2653,17 @@ public boolean matches(RewriteFileGroup argument) { return groupIDs.contains(argument.info().globalIndex()); } } + + private void dataFilesSortOrderShouldMatchTableSortOrder(Table table) throws IOException { + dataFilesShouldHaveSortOrderIdMatching(table, table.sortOrder()); + } + + private void dataFilesShouldHaveSortOrderIdMatching(Table table, SortOrder sortOrder) + throws IOException { + try (CloseableIterable files = table.newScan().planFiles()) { + assertThat(files) + .extracting(fileScanTask -> fileScanTask.file().sortOrderId()) + .containsOnly(sortOrder.orderId()); + } + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTestBase.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTestBase.java index 0db6a65fd394..45053c1a4f1f 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTestBase.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTestBase.java @@ -32,6 +32,7 @@ import java.util.Map; import java.util.UUID; import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; @@ -108,8 +109,8 @@ protected boolean supportsRowLineage() { required(114, "dec_9_0", Types.DecimalType.of(9, 0)), // int encoded required(115, "dec_11_2", Types.DecimalType.of(11, 2)), // long 
encoded required(116, "dec_20_5", Types.DecimalType.of(20, 5)), // requires padding - required(117, "dec_38_10", Types.DecimalType.of(38, 10)) // Spark's maximum precision - ); + required(117, "dec_38_10", Types.DecimalType.of(38, 10)), // Spark's maximum precision + optional(118, "unk", Types.UnknownType.get())); @TempDir protected Path temp; @@ -120,10 +121,13 @@ public void testSimpleStruct() throws IOException { @Test public void testStructWithRequiredFields() throws IOException { + List supportedPrimitives = + SUPPORTED_PRIMITIVES.fields().stream() + .filter(f -> f.type().typeId() != Type.TypeID.UNKNOWN) + .collect(Collectors.toList()); writeAndValidate( TypeUtil.assignIncreasingFreshIds( - new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asRequired)))); + new Schema(Lists.transform(supportedPrimitives, Types.NestedField::asRequired)))); } @Test @@ -603,4 +607,48 @@ public void testRowLineage() throws Exception { record.copy(Map.of("id", 4L, "data", "d", "_row_id", 1_001L)), record.copy(Map.of("id", 5L, "data", "e")))); } + + @Test + public void testUnknownNestedLevel() throws IOException { + assumeThat(supportsNestedTypes()).isTrue(); + + Schema schema = + new Schema( + required(1, "id", LongType.get()), + optional( + 2, + "nested", + Types.StructType.of( + required(20, "int", Types.IntegerType.get()), + optional(21, "unk", Types.UnknownType.get())))); + + writeAndValidate(schema); + } + + @Test + public void testUnknownListType() throws IOException { + assumeThat(supportsNestedTypes()).isTrue(); + + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, Types.UnknownType.get()))); + + writeAndValidate(schema); + } + + @Test + public void testUnknownMapType() throws IOException { + assumeThat(supportsNestedTypes()).isTrue(); + + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional(2, 3, Types.StringType.get(), 
Types.UnknownType.get()))); + + writeAndValidate(schema); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java index c18e4c053f50..291bb2bca4f5 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java @@ -25,6 +25,8 @@ import org.apache.iceberg.data.BaseFormatModelTests; import org.apache.iceberg.data.Record; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.SparkUtil; +import org.apache.iceberg.types.Type; import org.apache.spark.sql.catalyst.InternalRow; public class TestSparkFormatModel extends BaseFormatModelTests { @@ -51,4 +53,9 @@ protected void assertEquals(Schema schema, List expected, List expect private Iterator batchesToRows(Iterator batches) { return Iterators.concat(Iterators.transform(batches, ColumnarBatch::rowIterator)); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create ListType with unknown element type"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create MapType with unknown value type"); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java index 328dcaa0014c..bc4b77059d43 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java @@ -248,4 +248,20 @@ public void 
testMissingRequiredWithoutDefault() { .isInstanceOf(IllegalArgumentException.class) .hasMessage("Missing required field: missing_str"); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert element Parquet: unknown"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert value Parquet: unknown"); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java index bf738be59cb8..634327a81d86 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java @@ -20,6 +20,7 @@ import static org.apache.iceberg.types.Types.NestedField.required; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; import java.io.IOException; @@ -150,4 +151,20 @@ private static void assertEqualsUnsafe( assertThat(expectedIter).as("Expected iterator should not have any extra rows.").isExhausted(); assertThat(actualIter).as("Actual iterator should not have any extra rows.").isExhausted(); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create ListType with unknown element type"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create 
MapType with unknown value type"); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java index 284fa0b0552f..b61ecfa2f442 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java @@ -43,6 +43,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; import org.apache.iceberg.types.Types; @@ -65,6 +66,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java index 1c5905744a75..91d07e3647c9 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java @@ -37,6 +37,7 @@ import org.apache.iceberg.TableProperties; import org.apache.iceberg.hadoop.HadoopTables; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.AvroDataTestBase; import 
org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.TestHelpers; @@ -62,6 +63,7 @@ public static void startSpark() { SparkSession.builder() .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) .master("local[2]") + .config(TestBase.DISABLE_UI) .getOrCreate(); ScanTestBase.sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); } @@ -93,11 +95,7 @@ protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throw HadoopTables tables = new HadoopTables(CONF); // If V3 spec features are used, set the format version to 3 - Map tableProperties = - writeSchema.columns().stream() - .anyMatch(f -> f.initialDefaultLiteral() != null || f.writeDefaultLiteral() != null) - ? ImmutableMap.of(TableProperties.FORMAT_VERSION, "3") - : ImmutableMap.of(); + Map tableProperties = ImmutableMap.of(TableProperties.FORMAT_VERSION, "3"); Table table = tables.create( writeSchema, PartitionSpec.unpartitioned(), tableProperties, location.toString()); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestAsyncSparkMicroBatchPlanner.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestAsyncSparkMicroBatchPlanner.java new file mode 100644 index 000000000000..b6017e2001e7 --- /dev/null +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestAsyncSparkMicroBatchPlanner.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import org.apache.iceberg.Snapshot; +import org.junit.jupiter.api.Test; + +class TestAsyncSparkMicroBatchPlanner { + + @Test + void reachedAvailableNowCapReturnsTrueOnlyForExactCapSnapshot() { + Snapshot capSnapshot = mockSnapshot(10L); + Snapshot laterSnapshotWithHigherId = mockSnapshot(20L); + Snapshot laterSnapshotWithLowerId = mockSnapshot(5L); + StreamingOffset capOffset = new StreamingOffset(10L, 3L, false); + + assertThat(AsyncSparkMicroBatchPlanner.reachedAvailableNowCap(capSnapshot, capOffset)).isTrue(); + assertThat( + AsyncSparkMicroBatchPlanner.reachedAvailableNowCap( + laterSnapshotWithHigherId, capOffset)) + .isFalse(); + assertThat( + AsyncSparkMicroBatchPlanner.reachedAvailableNowCap(laterSnapshotWithLowerId, capOffset)) + .isFalse(); + } + + @Test + void reachedAvailableNowCapReturnsFalseWhenCapOrSnapshotIsMissing() { + Snapshot readFrom = mockSnapshot(10L); + StreamingOffset capOffset = new StreamingOffset(10L, 1L, false); + + assertThat(AsyncSparkMicroBatchPlanner.reachedAvailableNowCap(readFrom, null)).isFalse(); + assertThat(AsyncSparkMicroBatchPlanner.reachedAvailableNowCap(null, capOffset)).isFalse(); + } + + private Snapshot mockSnapshot(long snapshotId) { + Snapshot snapshot = mock(Snapshot.class); + when(snapshot.snapshotId()).thenReturn(snapshotId); + return snapshot; + } +} diff --git 
a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java index 61d6501a6847..26c2b6ab70cb 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java @@ -54,6 +54,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.GenericsHelpers; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; @@ -116,6 +117,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java index 153564f7d129..0ba3f0d35fd7 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java @@ -46,6 +46,7 @@ import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.types.Types; @@ -98,6 +99,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } 
diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java index f4f57157e479..a637b975fe2b 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java @@ -28,6 +28,7 @@ import java.sql.Timestamp; import java.util.List; import org.apache.iceberg.spark.IcebergSpark; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Row; @@ -51,6 +52,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestMicroBatchPlanningUtils.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestMicroBatchPlanningUtils.java new file mode 100644 index 000000000000..a9ce340fd4ec --- /dev/null +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestMicroBatchPlanningUtils.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.Table; +import org.apache.iceberg.spark.CatalogTestBase; +import org.apache.spark.sql.connector.read.streaming.ReadLimit; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestMicroBatchPlanningUtils extends CatalogTestBase { + + private Table table; + + @BeforeEach + public void setupTable() { + sql("DROP TABLE IF EXISTS %s", tableName); + sql( + "CREATE TABLE %s " + + "(id INT, data STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(3, id))", + tableName); + this.table = validationCatalog.loadTable(tableIdent); + } + + @AfterEach + public void dropTable() { + sql("DROP TABLE IF EXISTS %s", tableName); + } + + @TestTemplate + public void testUnpackedLimitsCompositeChoosesMinimum() { + ReadLimit[] limits = + new ReadLimit[] { + ReadLimit.maxRows(10), ReadLimit.maxRows(4), ReadLimit.maxFiles(8), ReadLimit.maxFiles(2) + }; + + ReadLimit composite = ReadLimit.compositeLimit(limits); + + BaseSparkMicroBatchPlanner.UnpackedLimits unpacked = + new BaseSparkMicroBatchPlanner.UnpackedLimits(composite); + + assertThat(unpacked.getMaxRows()).isEqualTo(4); + assertThat(unpacked.getMaxFiles()).isEqualTo(2); + } + + @TestTemplate + 
public void testDetermineStartingOffsetWithTimestampBetweenSnapshots() { + sql("INSERT INTO %s VALUES (1, 'one')", tableName); + table.refresh(); + long snapshot1Time = table.currentSnapshot().timestampMillis(); + + sql("INSERT INTO %s VALUES (2, 'two')", tableName); + table.refresh(); + long snapshot2Id = table.currentSnapshot().snapshotId(); + + StreamingOffset offset = MicroBatchUtils.determineStartingOffset(table, snapshot1Time + 1); + + assertThat(offset.snapshotId()).isEqualTo(snapshot2Id); + assertThat(offset.position()).isEqualTo(0L); + assertThat(offset.shouldScanAllFiles()).isFalse(); + } + + @TestTemplate + public void testAddedFilesCountUsesSummaryWhenPresent() { + sql("INSERT INTO %s VALUES (1, 'one')", tableName); + table.refresh(); + + long expectedAddedFiles = + Long.parseLong(table.currentSnapshot().summary().get(SnapshotSummary.ADDED_FILES_PROP)); + + long actual = MicroBatchUtils.addedFilesCount(table, table.currentSnapshot()); + + assertThat(actual).isEqualTo(expectedAddedFiles); + } +} diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestORCDataFrameWrite.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestORCDataFrameWrite.java index 35be6423ee23..892e260f66f0 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestORCDataFrameWrite.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestORCDataFrameWrite.java @@ -18,9 +18,13 @@ */ package org.apache.iceberg.spark.source; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + import org.apache.iceberg.FileFormat; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; +import org.apache.spark.SparkException; +import org.junit.jupiter.api.Test; public class TestORCDataFrameWrite extends DataFrameWriteTestBase { @Override @@ -30,4 +34,24 @@ protected void configureTable(Table table) { .set(TableProperties.DEFAULT_FILE_FORMAT, FileFormat.ORC.toString()) .commit(); } + + 
@Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(SparkException.class) + .cause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create ListType with unknown element type"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(SparkException.class) + .cause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create MapType with unknown value type"); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetDataFrameWrite.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetDataFrameWrite.java index 90a9ac48a486..c24d92ef30af 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetDataFrameWrite.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetDataFrameWrite.java @@ -18,9 +18,13 @@ */ package org.apache.iceberg.spark.source; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + import org.apache.iceberg.FileFormat; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; +import org.apache.spark.SparkException; +import org.junit.jupiter.api.Test; public class TestParquetDataFrameWrite extends DataFrameWriteTestBase { @Override @@ -30,4 +34,24 @@ protected void configureTable(Table table) { .set(TableProperties.DEFAULT_FILE_FORMAT, FileFormat.PARQUET.toString()) .commit(); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(SparkException.class) + .cause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert element Parquet: unknown"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(SparkException.class) + 
.cause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert value Parquet: unknown"); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java index c0dee43d6de1..8b567bcaf11e 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java @@ -19,6 +19,7 @@ package org.apache.iceberg.spark.source; import static org.apache.iceberg.Files.localOutput; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.assertj.core.api.Assumptions.assumeThat; import java.io.File; @@ -37,6 +38,7 @@ import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; public class TestParquetScan extends ScanTestBase { protected boolean vectorized() { @@ -83,4 +85,20 @@ protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throw super.writeAndValidate(writeSchema, expectedSchema); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert element Parquet: unknown"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert value Parquet: unknown"); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java index becb9dcb4aca..cf3097ebdb30 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java 
+++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java @@ -59,6 +59,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Types; import org.apache.spark.api.java.JavaRDD; @@ -118,6 +119,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestPartitionPruning.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java index 0b6ab2052b66..9b5b22a73f36 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java @@ -46,6 +46,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.types.Types; @@ -112,6 +113,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java index 
11865db7fce5..fe754f4a02ba 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java @@ -42,6 +42,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -91,6 +92,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java index 600b3eab1d68..2122f2579e4a 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java @@ -59,6 +59,7 @@ import org.apache.iceberg.spark.SparkDataFile; import org.apache.iceberg.spark.SparkDeleteFile; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types; @@ -125,6 +126,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestSparkDataFile.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java 
b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java index 2ad84f41fd87..70f3b986d23b 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java @@ -27,6 +27,7 @@ import static org.mockito.Mockito.when; import java.io.File; +import java.io.IOException; import java.net.InetAddress; import java.nio.file.Path; import java.util.List; @@ -35,6 +36,7 @@ import org.apache.iceberg.AppendFiles; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; import org.apache.iceberg.ManifestFile; import org.apache.iceberg.ManifestFiles; import org.apache.iceberg.Parameter; @@ -43,14 +45,17 @@ import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; import org.apache.iceberg.exceptions.CommitStateUnknownException; import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.SnapshotUtil; import org.apache.spark.sql.Dataset; @@ -99,6 +104,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } @@ -153,6 +159,7 @@ public void testBasicWrite() { assertThat(file.splitOffsets()).as("Split offsets not present").isNotNull(); } 
assertThat(file.recordCount()).as("Should have reported record count as 1").isEqualTo(1); + assertThat(file.sortOrderId()).isEqualTo(SortOrder.unsorted().orderId()); // TODO: append more metric info if (format.equals(FileFormat.PARQUET)) { assertThat(file.columnSizes()).as("Column sizes metric not present").isNotNull(); @@ -473,6 +480,116 @@ public void testViewsReturnRecentResults() { assertThat(actual2).hasSameSizeAs(expected2).isEqualTo(expected2); } + @TestTemplate + public void testWriteDataFilesInTableSortOrder() throws IOException { + File parent = temp.resolve(format.toString()).toFile(); + File location = new File(parent, "test"); + HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.unpartitioned(); + SortOrder sortOrder = SortOrder.builderFor(SCHEMA).asc("id").build(); + Table table = tables.create(SCHEMA, spec, sortOrder, ImmutableMap.of(), location.toString()); + + List expected = Lists.newArrayListWithCapacity(10); + for (int i = 0; i < 10; i++) { + expected.add(new SimpleRecord(i, "a")); + } + + Dataset df = spark.createDataFrame(expected, SimpleRecord.class); + + df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + Dataset result = spark.read().format("iceberg").load(location.toString()); + + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + assertThat(actual).hasSameSizeAs(expected).isEqualTo(expected); + + try (CloseableIterable fileScanTasks = table.newScan().planFiles()) { + assertThat(fileScanTasks) + .extracting(task -> task.file().sortOrderId()) + .as("All DataFiles are written with the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + } + + @TestTemplate + public void testWriteDataFilesUnsortedTable() throws IOException { + File parent = temp.resolve(format.toString()).toFile(); + File location = new File(parent, "test"); + 
+ HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.unpartitioned(); + Table table = tables.create(SCHEMA, spec, location.toString()); + + List expected = Lists.newArrayList(new SimpleRecord(1, "a")); + Dataset df = spark.createDataFrame(expected, SimpleRecord.class); + + df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + try (CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks) + .extracting(task -> task.file().sortOrderId()) + .as("All DataFiles should have unsorted sort order id") + .containsOnly(SortOrder.unsorted().orderId()); + } + } + + @TestTemplate + public void testWriteDataFilesAfterSortOrderChange() throws IOException { + File parent = temp.resolve(format.toString()).toFile(); + File location = new File(parent, "test"); + + HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.unpartitioned(); + Table table = tables.create(SCHEMA, spec, location.toString()); + + List records = Lists.newArrayList(new SimpleRecord(1, "a")); + Dataset df = spark.createDataFrame(records, SimpleRecord.class); + + df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + table.refresh(); + int unsortedId = SortOrder.unsorted().orderId(); + + try (CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks).extracting(task -> task.file().sortOrderId()).containsOnly(unsortedId); + } + + table.replaceSortOrder().asc("id").commit(); + int sortedId = table.sortOrder().orderId(); + + df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + table.refresh(); + + try (CloseableIterable tasks = table.newScan().planFiles()) { + 
assertThat(tasks) + .extracting(task -> task.file().sortOrderId()) + .as("Should contain both unsorted and sorted files") + .containsOnly(unsortedId, sortedId); + } + } + public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType option) { File parent = temp.resolve(format.toString()).toFile(); File location = new File(parent, "test"); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java index 8ccea303d0c1..de6a5e59029c 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java @@ -50,6 +50,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkValueConverter; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; @@ -88,6 +89,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); ImmutableMap config = ImmutableMap.of( diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java index d129fdecc50b..5daf4e1441f9 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java @@ -76,6 +76,7 @@ import org.apache.iceberg.spark.ParquetBatchReadConf; import org.apache.iceberg.spark.SparkSchemaUtil; import 
org.apache.iceberg.spark.SparkStructLike; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.VortexBatchReadConf; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.SparkParquetWriters; @@ -139,6 +140,7 @@ public static void startMetastoreAndSpark() { .config("spark.ui.liveUpdate.period", 0) .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java index d22ecb02d483..cb2f866fab10 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java @@ -64,6 +64,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkValueConverter; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.PropertyUtil; import org.apache.spark.sql.Dataset; @@ -182,6 +183,7 @@ public static void startMetastoreAndSpark() { SparkSession.builder() .master("local[2]") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java index d14b1a52cf82..e3934faa60ce 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java @@ -20,7 +20,9 @@ import static org.assertj.core.api.Assertions.assertThat; +import org.apache.iceberg.HistoryEntry; import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Table; import org.apache.iceberg.spark.CatalogTestBase; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; import org.apache.spark.sql.connector.catalog.CatalogManager; @@ -56,4 +58,58 @@ public void testTableEquality() throws NoSuchTableException { assertThat(table1).as("References must be different").isNotSameAs(table2); assertThat(table1).as("Tables must be equivalent").isEqualTo(table2); } + + @TestTemplate + public void testTableInequalityWithDifferentSnapshots() throws NoSuchTableException { + sql("INSERT INTO %s VALUES (1, 'a')", tableName); + sql("INSERT INTO %s VALUES (2, 'b')", tableName); + + CatalogManager catalogManager = spark.sessionState().catalogManager(); + TableCatalog catalog = (TableCatalog) catalogManager.catalog(catalogName); + Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name()); + SparkTable table = (SparkTable) catalog.loadTable(identifier); + + Table icebergTable = validationCatalog.loadTable(tableIdent); + long[] snapshotIds = + icebergTable.history().stream().mapToLong(HistoryEntry::snapshotId).toArray(); + + SparkTable tableAtSnapshot1 = table.copyWithSnapshotId(snapshotIds[0]); + SparkTable tableAtSnapshot2 = table.copyWithSnapshotId(snapshotIds[1]); + + 
assertThat(tableAtSnapshot1) + .as("Tables at different snapshots must not be equal") + .isNotEqualTo(tableAtSnapshot2); + assertThat(tableAtSnapshot1.hashCode()) + .as("Hash codes should differ for different snapshots") + .isNotEqualTo(tableAtSnapshot2.hashCode()); + } + + @TestTemplate + public void testTableInequalityWithDifferentBranches() throws NoSuchTableException { + sql("INSERT INTO %s VALUES (1, 'a')", tableName); + + CatalogManager catalogManager = spark.sessionState().catalogManager(); + TableCatalog catalog = (TableCatalog) catalogManager.catalog(catalogName); + Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name()); + + Table icebergTable = validationCatalog.loadTable(tableIdent); + icebergTable + .manageSnapshots() + .createBranch("testBranch", icebergTable.currentSnapshot().snapshotId()) + .commit(); + + // reload after branch creation so the table sees the new ref + SparkTable table = (SparkTable) catalog.loadTable(identifier); + table.table().refresh(); + + SparkTable tableOnMain = table.copyWithBranch("main"); + SparkTable tableOnBranch = table.copyWithBranch("testBranch"); + + assertThat(tableOnMain) + .as("Tables on different branches must not be equal") + .isNotEqualTo(tableOnBranch); + assertThat(tableOnMain.hashCode()) + .as("Hash codes should differ for different branches") + .isNotEqualTo(tableOnBranch.hashCode()); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java index 54048bbf218a..ab760010535b 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java @@ -29,11 +29,16 @@ import java.nio.file.Paths; import java.util.List; import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.FileScanTask; 
import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; @@ -69,6 +74,7 @@ public static void startSpark() { .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) .config("spark.sql.shuffle.partitions", 4) + .config(TestBase.DISABLE_UI) .getOrCreate(); } @@ -263,6 +269,50 @@ public void testStreamingWriteCompleteModeWithProjection() throws Exception { } } + @Test + public void testStreamingWriteDataFilesInTableSortOrder() throws Exception { + File parent = temp.resolve("parquet").toFile(); + File location = new File(parent, "test-table"); + File checkpoint = new File(parent, "checkpoint"); + + HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); + SortOrder sortOrder = SortOrder.builderFor(SCHEMA).asc("id").build(); + Table table = tables.create(SCHEMA, spec, sortOrder, ImmutableMap.of(), location.toString()); + + MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("append") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); + + try { + StreamingQuery query = streamWriter.start(); + List batch1 = Lists.newArrayList(1, 2); + send(batch1, inputStream); + query.processAllAvailable(); + query.stop(); + + table.refresh(); + + try 
(CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks) + .extracting(task -> task.file().sortOrderId()) + .as("All DataFiles are written with the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + } finally { + for (StreamingQuery query : spark.streams().active()) { + query.stop(); + } + } + } + @Test public void testStreamingWriteUpdateMode() throws Exception { File parent = temp.resolve("parquet").toFile(); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java index 5f54c832aa93..d97e6ec00d7f 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java @@ -31,13 +31,17 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.IntStream; import org.apache.iceberg.BaseTable; +import org.apache.iceberg.CatalogProperties; import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; import org.apache.iceberg.DataOperations; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; import org.apache.iceberg.Files; +import org.apache.iceberg.Parameter; import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; import org.apache.iceberg.RewriteFiles; import org.apache.iceberg.Schema; import org.apache.iceberg.Snapshot; @@ -50,15 +54,22 @@ import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import 
org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.CatalogTestBase; +import org.apache.iceberg.spark.SparkCatalogConfig; +import org.apache.iceberg.spark.SparkReadConf; import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.VoidFunction2; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Row; +import org.apache.spark.sql.connector.read.InputPartition; +import org.apache.spark.sql.connector.read.streaming.Offset; import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.streaming.DataStreamWriter; import org.apache.spark.sql.streaming.OutputMode; @@ -73,10 +84,73 @@ @ExtendWith(ParameterizedTestExtension.class) public final class TestStructuredStreamingRead3 extends CatalogTestBase { + @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}, async = {3}") + public static Object[][] parameters() { + return new Object[][] { + { + SparkCatalogConfig.HIVE.catalogName(), + SparkCatalogConfig.HIVE.implementation(), + SparkCatalogConfig.HIVE.properties(), + false + }, + { + SparkCatalogConfig.HIVE.catalogName(), + SparkCatalogConfig.HIVE.implementation(), + SparkCatalogConfig.HIVE.properties(), + true + }, + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties(), + false + }, + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties(), + true + }, + { + SparkCatalogConfig.REST.catalogName(), + SparkCatalogConfig.REST.implementation(), + ImmutableMap.builder() + .putAll(SparkCatalogConfig.REST.properties()) + .put(CatalogProperties.URI, restCatalog.properties().get(CatalogProperties.URI)) + .build(), + false + }, + { + 
SparkCatalogConfig.REST.catalogName(), + SparkCatalogConfig.REST.implementation(), + ImmutableMap.builder() + .putAll(SparkCatalogConfig.REST.properties()) + .put(CatalogProperties.URI, restCatalog.properties().get(CatalogProperties.URI)) + .build(), + true + }, + { + SparkCatalogConfig.SPARK_SESSION.catalogName(), + SparkCatalogConfig.SPARK_SESSION.implementation(), + SparkCatalogConfig.SPARK_SESSION.properties(), + false + }, + { + SparkCatalogConfig.SPARK_SESSION.catalogName(), + SparkCatalogConfig.SPARK_SESSION.implementation(), + SparkCatalogConfig.SPARK_SESSION.properties(), + true + } + }; + } + private Table table; private final AtomicInteger microBatches = new AtomicInteger(); + @Parameter(index = 3) + private Boolean async; + /** * test data to be used by multiple writes each write creates a snapshot and writes a list of * records @@ -250,15 +324,41 @@ public void testReadStreamWithCompositeReadLimit() throws Exception { Trigger.AvailableNow()); } + @TestTemplate + public void testReadStreamWithLowAsyncQueuePreload() throws Exception { + appendDataAsMultipleSnapshots(TEST_DATA_MULTIPLE_SNAPSHOTS); + // Set low preload limits to test async queue behavior - background thread should load + // remaining data + + StreamingQuery query = + startStream( + ImmutableMap.of( + SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT, + "5", + SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT, + "5")); + + List actual = rowsAvailable(query); + assertThat(actual) + .containsExactlyInAnyOrderElementsOf(Iterables.concat(TEST_DATA_MULTIPLE_SNAPSHOTS)); + } + @TestTemplate public void testAvailableNowStreamReadShouldNotHangOrReprocessData() throws Exception { File writerCheckpointFolder = temp.resolve("writer-checkpoint-folder").toFile(); File writerCheckpoint = new File(writerCheckpointFolder, "writer-checkpoint"); File output = temp.resolve("junit").toFile(); + Map options = Maps.newHashMap(); + options.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + 
if (async) { + options.put(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + DataStreamWriter querySource = spark .readStream() + .options(options) .format("iceberg") .load(tableName) .writeStream() @@ -313,10 +413,17 @@ public void testTriggerAvailableNowDoesNotProcessNewDataWhileRunning() throws Ex long expectedSnapshotId = table.currentSnapshot().snapshotId(); String sinkTable = "availablenow_sink"; + Map options = Maps.newHashMap(); + options.put(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1"); + options.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + options.put(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + StreamingQuery query = spark .readStream() - .option(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1") + .options(options) .format("iceberg") .load(tableName) .writeStream() @@ -358,6 +465,142 @@ public void testTriggerAvailableNowDoesNotProcessNewDataWhileRunning() throws Ex assertThat(actualResults).containsExactlyInAnyOrderElementsOf(Iterables.concat(expectedData)); } + @TestTemplate + public void testTriggerAvailableNowCapsAsyncPreloadAfterPrepare() { + List> initialData = + List.of(List.of(new SimpleRecord(1, "one")), List.of(new SimpleRecord(2, "two"))); + appendDataAsMultipleSnapshots(initialData); + + table.refresh(); + long expectedCapSnapshotId = table.currentSnapshot().snapshotId(); + + SparkMicroBatchStream stream = + new SparkMicroBatchStream( + JavaSparkContext.fromSparkContext(spark.sparkContext()), + table, + table::io, + new SparkReadConf( + spark, + table, + ImmutableMap.of( + SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, + async.toString(), + SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, + "1", + SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, + "1", + SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT, + "10", + SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT, + "10")), + table.schema(), + 
temp.resolve("available-now-cap-checkpoint").toString()); + + try { + stream.prepareForTriggerAvailableNow(); + + appendData(List.of(new SimpleRecord(3, "three"))); + + Offset startOffset = stream.initialOffset(); + Offset firstEndOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + assertThat(firstEndOffset).isNotNull(); + stream.planInputPartitions(startOffset, firstEndOffset); + + Offset secondEndOffset = stream.latestOffset(firstEndOffset, stream.getDefaultReadLimit()); + assertThat(secondEndOffset).isNotNull(); + stream.planInputPartitions(firstEndOffset, secondEndOffset); + + assertThat(stream.latestOffset(secondEndOffset, stream.getDefaultReadLimit())).isNull(); + assertThat(((StreamingOffset) secondEndOffset).snapshotId()).isEqualTo(expectedCapSnapshotId); + } finally { + stream.stop(); + } + } + + @TestTemplate + public void testLatestOffsetReturnsNullAfterFinalBatchIsConsumed() throws Exception { + appendDataAsMultipleSnapshots(TEST_DATA_MULTIPLE_SNAPSHOTS); + + table.refresh(); + int expectedBatchCount; + try (CloseableIterable tasks = table.newScan().planFiles()) { + expectedBatchCount = Iterables.size(tasks); + } + + SparkMicroBatchStream stream = + newMicroBatchStream( + ImmutableMap.of(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1"), + "drain-to-null-checkpoint"); + + try { + int plannedBatchCount = 0; + Offset startOffset = stream.initialOffset(); + Offset endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + while (endOffset != null) { + InputPartition[] partitions = stream.planInputPartitions(startOffset, endOffset); + assertThat(partitions).isNotEmpty(); + plannedBatchCount += 1; + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + } + + assertThat(endOffset).isNull(); + assertThat(plannedBatchCount).isEqualTo(expectedBatchCount); + } finally { + stream.stop(); + } + } + + @TestTemplate + public void 
testPlanInputPartitionsIsIdempotentForSameOffsets() { + appendDataAsMultipleSnapshots(TEST_DATA_MULTIPLE_SNAPSHOTS); + + SparkMicroBatchStream stream = + newMicroBatchStream( + ImmutableMap.of(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1"), + "idempotent-plan-files-checkpoint"); + + try { + Offset startOffset = stream.initialOffset(); + Offset endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + + assertThat(endOffset).isNotNull(); + + InputPartition[] firstPartitions = stream.planInputPartitions(startOffset, endOffset); + InputPartition[] secondPartitions = stream.planInputPartitions(startOffset, endOffset); + + List firstFileLocations = Lists.newArrayList(); + for (InputPartition partition : firstPartitions) { + SparkInputPartition sparkInputPartition = (SparkInputPartition) partition; + for (FileScanTask task : sparkInputPartition.taskGroup().tasks()) { + firstFileLocations.add(task.file().location()); + } + } + + List secondFileLocations = Lists.newArrayList(); + for (InputPartition partition : secondPartitions) { + SparkInputPartition sparkInputPartition = (SparkInputPartition) partition; + for (FileScanTask task : sparkInputPartition.taskGroup().tasks()) { + secondFileLocations.add(task.file().location()); + } + } + + assertThat(firstFileLocations).containsExactlyInAnyOrderElementsOf(secondFileLocations); + + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + while (endOffset != null) { + assertThat(stream.planInputPartitions(startOffset, endOffset)).isNotEmpty(); + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + } + + assertThat(endOffset).isNull(); + } finally { + stream.stop(); + } + } + @TestTemplate public void testReadStreamOnIcebergThenAddData() throws Exception { List> expected = TEST_DATA_MULTIPLE_SNAPSHOTS; @@ -425,6 +668,8 @@ public void testReadingStreamFromFutureTimetsamp() throws Exception { // Data 
appended after the timestamp should appear appendData(data); + // Allow async background thread to refresh, else test sometimes fails + Thread.sleep(50); actual = rowsAvailable(query); assertThat(actual).containsExactlyInAnyOrderElementsOf(data); } @@ -872,13 +1117,18 @@ private void appendData(List data, String format) { private static final String MEMORY_TABLE = "_stream_view_mem"; private StreamingQuery startStream(Map options) throws TimeoutException { + Map allOptions = Maps.newHashMap(options); + allOptions.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + allOptions.putIfAbsent(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } return spark .readStream() - .options(options) + .options(allOptions) .format("iceberg") .load(tableName) .writeStream() - .options(options) + .options(allOptions) .format("memory") .queryName(MEMORY_TABLE) .outputMode(OutputMode.Append()) @@ -903,11 +1153,17 @@ private void assertMicroBatchRecordSizes( private void assertMicroBatchRecordSizes( Map options, List expectedMicroBatchRecordSize, Trigger trigger) throws TimeoutException { - Dataset ds = spark.readStream().options(options).format("iceberg").load(tableName); + Map allOptions = Maps.newHashMap(options); + allOptions.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + allOptions.putIfAbsent(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + + Dataset ds = spark.readStream().options(allOptions).format("iceberg").load(tableName); List syncList = Collections.synchronizedList(Lists.newArrayList()); ds.writeStream() - .options(options) + .options(allOptions) .trigger(trigger) .foreachBatch( (VoidFunction2, Long>) @@ -929,4 +1185,21 @@ private List rowsAvailable(StreamingQuery query) { .as(Encoders.bean(SimpleRecord.class)) .collectAsList(); } + + private SparkMicroBatchStream newMicroBatchStream( + Map options, String checkpointDirName) { + Map allOptions = 
Maps.newHashMap(options); + allOptions.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + allOptions.putIfAbsent(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + + return new SparkMicroBatchStream( + JavaSparkContext.fromSparkContext(spark.sparkContext()), + table, + table::io, + new SparkReadConf(spark, table, allOptions), + table.schema(), + temp.resolve(checkpointDirName).toString()); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java index c3fac70dd3fc..45ff9184566b 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java @@ -41,6 +41,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; @@ -84,6 +85,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestWriteMetricsConfig.sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java index 5ce56b4feca7..946456fe2be8 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java @@ 
-63,6 +63,7 @@ public static void startMetastoreAndSpark() { SparkSession.builder() .master("local[2]") .config("spark.sql.iceberg.aggregate_pushdown", "true") + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.0/build.gradle b/spark/v4.0/build.gradle index 38c4bd785531..4b34396f4f49 100644 --- a/spark/v4.0/build.gradle +++ b/spark/v4.0/build.gradle @@ -112,14 +112,10 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') - testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - testImplementation libs.sqlite.jdbc + testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts') + testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.awaitility testImplementation(testFixtures(project(':iceberg-parquet'))) - // runtime dependencies for running REST Catalog based integration test - testRuntimeOnly libs.jetty.servlet } test { @@ -180,13 +176,7 @@ project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVer testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') testImplementation project(path: ":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}", configuration: 'testArtifacts') - testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - // runtime dependencies for running REST Catalog based integration test - testRuntimeOnly libs.jetty.servlet - testRuntimeOnly libs.sqlite.jdbc - 
+ testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.avro.avro testImplementation libs.parquet.hadoop testImplementation libs.awaitility @@ -278,11 +268,7 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio integrationRuntimeOnly project(':iceberg-hive-metastore') // runtime dependencies for running REST Catalog based integration test integrationRuntimeOnly project(path: ':iceberg-core', configuration: 'testArtifacts') - integrationRuntimeOnly (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - integrationRuntimeOnly libs.jetty.servlet - integrationRuntimeOnly libs.sqlite.jdbc + integrationRuntimeOnly (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) // Not allowed on our classpath, only the runtime jar is allowed integrationCompileOnly project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVersion}") @@ -345,5 +331,7 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java index 9375ca3a4f46..5287ccd514ab 100644 --- a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java +++ b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java @@ -31,6 +31,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions; import org.apache.iceberg.util.ThreadPools; import 
org.apache.spark.sql.SparkSession; @@ -205,7 +206,7 @@ private void initDataAndDVs() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java index 963daa2c364c..ea31b98f1ac9 100644 --- a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java +++ b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java @@ -155,7 +155,7 @@ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java index 34d9d70e6ccb..f9558240f8cb 100644 --- a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java +++ b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java @@ -215,7 +215,7 @@ public void localPlanningViaDistributedScanWithoutFilterWithStats(Blackhole blac private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", 
false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.driver.maxResultSize", "8G") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) diff --git a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java index 7c2def237874..e9c563b9b0ef 100644 --- a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java +++ b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java @@ -199,7 +199,7 @@ private void initDataAndDeletes() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java index d917eae5eb0f..caa23625fc44 100644 --- a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java +++ b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java @@ -138,7 +138,7 @@ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) 
.config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java index bfcb5af235d3..ef4f0090292c 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java @@ -49,7 +49,12 @@ public class TestExtendedParser { @BeforeAll public static void before() { - spark = SparkSession.builder().master("local").appName("TestExtendedParser").getOrCreate(); + spark = + SparkSession.builder() + .master("local") + .appName("TestExtendedParser") + .config(TestBase.DISABLE_UI) + .getOrCreate(); } @AfterAll diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java index 796c47b545cc..f23a5d9db3ad 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java @@ -59,6 +59,7 @@ public static void startMetastoreAndSpark() { .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") .config( SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), String.valueOf(RANDOM.nextBoolean())) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java index f7ded0c4d7d2..d39dff060c9a 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java +++ 
b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java @@ -162,6 +162,25 @@ public synchronized void testDeleteWithConcurrentTableRefresh() throws Exception assertThat(executorService.awaitTermination(2, TimeUnit.MINUTES)).as("Timeout").isTrue(); } + @TestTemplate + public void testCopyOnWriteDeleteSetsSortOrderIdOnRewrittenDataFiles() { + createAndInitTable( + "id INT, dep STRING", + "PARTITIONED BY (dep)", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + sql("DELETE FROM %s WHERE id = 1", commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("Rewritten data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + @TestTemplate public void testRuntimeFilteringWithPreservedDataGrouping() throws NoSuchTableException { createAndInitPartitionedTable(); diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java index fef8b28c689a..394dbbda1a3d 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java @@ -151,6 +151,34 @@ public synchronized void testMergeWithConcurrentTableRefresh() throws Exception assertThat(executorService.awaitTermination(2, TimeUnit.MINUTES)).as("Timeout").isTrue(); } + @TestTemplate + public void testCopyOnWriteMergeSetsSortOrderIdOnRewrittenDataFiles() { + createAndInitTable("id INT, dep STRING"); + sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); + 
sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + createBranchIfNeeded(); + + createOrReplaceView("source", Collections.singletonList(1), Encoders.INT()); + + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.value " + + "WHEN MATCHED THEN " + + " UPDATE SET dep = 'changed' " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, dep) VALUES (s.value, 'new')", + commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("Rewritten data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + @TestTemplate public void testRuntimeFilteringWithReportedPartitioning() { createAndInitTable("id INT, dep STRING"); diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java index 21d1377b2b98..b547218acbd4 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java @@ -149,6 +149,25 @@ public synchronized void testUpdateWithConcurrentTableRefresh() throws Exception assertThat(executorService.awaitTermination(2, TimeUnit.MINUTES)).as("Timeout").isTrue(); } + @TestTemplate + public void testCopyOnWriteUpdateSetsSortOrderIdOnRewrittenDataFiles() { + createAndInitTable( + "id INT, dep STRING", + "PARTITIONED BY (dep)", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + sql("UPDATE %s SET dep = 'changed' WHERE id = 1", commitTarget()); + + Table table = 
validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("Rewritten data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + @TestTemplate public void testRuntimeFilteringWithReportedPartitioning() { createAndInitTable("id INT, dep STRING"); diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java index fbf6ce3559a7..79d6bea12f67 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java @@ -1422,6 +1422,62 @@ public void testDeleteToCustomWapBranchWithoutWhereClause() throws NoSuchTableEx }); } + @TestTemplate + public void testDeleteToWapBranchCanDeleteWhereScansWapBranch() throws NoSuchTableException { + assumeThat(branch).as("WAP branch only works for table identifier without branch").isNull(); + + createAndInitPartitionedTable(); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'true')", + tableName, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED); + + append(tableName, new Employee(1, "hr")); + + spark.conf().set(SparkSQLProperties.WAP_BRANCH, "wap"); + try { + append(tableName, new Employee(0, "hr"), new Employee(1, "hr"), new Employee(2, "hr")); + + sql("DELETE FROM %s WHERE id = 1", tableName); + + assertThat(sql("SELECT id, dep FROM %s.branch_wap ORDER BY id", tableName)) + .as("DELETE should remove the matching rows from the WAP branch") + .containsExactly(row(0, "hr"), row(2, "hr")); + assertThat(sql("SELECT id, dep FROM %s.branch_main", tableName)) + .as("Main branch must not be modified by a WAP-targeted DELETE") + .containsExactly(row(1, "hr")); + } finally { + 
spark.conf().unset(SparkSQLProperties.WAP_BRANCH); + } + } + + @TestTemplate + public void testMetadataDeleteToWapBranchCommitsToWapBranch() throws NoSuchTableException { + assumeThat(branch).as("WAP branch only works for table identifier without branch").isNull(); + + createAndInitPartitionedTable(); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'true')", + tableName, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED); + + append(tableName, new Employee(1, "hr"), new Employee(5, "eng")); + + spark.conf().set(SparkSQLProperties.WAP_BRANCH, "wap"); + try { + append(tableName, new Employee(0, "hr"), new Employee(2, "eng")); + + sql("DELETE FROM %s WHERE dep = 'hr'", tableName); + + assertThat(sql("SELECT id, dep FROM %s.branch_wap ORDER BY id", tableName)) + .as("Metadata delete should remove the hr partition on the WAP branch") + .containsExactly(row(2, "eng"), row(5, "eng")); + assertThat(sql("SELECT id, dep FROM %s.branch_main ORDER BY id", tableName)) + .as("Metadata delete must not commit to main when WAP is set") + .containsExactly(row(1, "hr"), row(5, "eng")); + } finally { + spark.conf().unset(SparkSQLProperties.WAP_BRANCH); + } + } + @TestTemplate public void testDeleteWithFilterOnNestedColumn() { createAndInitNestedColumnsTable(); diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java index 737f19e86a95..9a42b58e3434 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java @@ -26,6 +26,7 @@ import java.util.Set; import java.util.stream.Collectors; import java.util.stream.IntStream; +import org.apache.iceberg.DataFile; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileFormat; import 
org.apache.iceberg.ParameterizedTestExtension; @@ -136,6 +137,34 @@ public void testMergeWithDVAndHistoricalPositionDeletes() { assertThat(dvs).allMatch(dv -> FileFormat.fromFileName(dv.location()) == FileFormat.PUFFIN); } + @TestTemplate + public void testMergeOnReadMergeSetsSortOrderIdOnNewDataFiles() { + createAndInitTable( + "id INT, dep STRING", + "PARTITIONED BY (dep)", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + createOrReplaceView("source", ImmutableList.of(1, 3), Encoders.INT()); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.value " + + "WHEN MATCHED THEN " + + " UPDATE SET id = id + 10 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, dep) VALUES (s.value, 'hr')", + commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("All new data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + private void checkMergeDeleteGranularity(DeleteGranularity deleteGranularity) { createTableWithDeleteGranularity( "id INT, dep STRING", "PARTITIONED BY (dep)", deleteGranularity); diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java index 2398bc45b19b..d1c336d5ddeb 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import org.apache.iceberg.DataFile; import org.apache.iceberg.DeleteFile; import 
org.apache.iceberg.FileFormat; import org.apache.iceberg.ParameterizedTestExtension; @@ -224,6 +225,25 @@ public void testUpdateWithDVAndHistoricalPositionDeletes() { assertThat(dvs).allMatch(dv -> FileFormat.fromFileName(dv.location()) == FileFormat.PUFFIN); } + @TestTemplate + public void testMergeOnReadUpdateSetsSortOrderIdOnNewDataFiles() { + createAndInitTable( + "id INT, dep STRING", + "PARTITIONED BY (dep)", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + sql("UPDATE %s SET id = id + 10 WHERE id = 1", commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("All new data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + private void initTable(String partitionedBy, DeleteGranularity deleteGranularity) { createTableWithDeleteGranularity("id INT, dep STRING", partitionedBy, deleteGranularity); diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java index 19800c2f4666..3f8b574126ba 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java @@ -375,4 +375,54 @@ public void testSnapshotPartitionedV1() throws IOException { } } } + + @TestTemplate + public void testSnapshotWithVariant() throws IOException { + assumeThat(catalogName) + .as("Variant type requires Hive 4 which is not yet supported") + .isNotEqualTo("testhive") + .isNotEqualTo("spark_catalog"); + String location = 
Files.createTempDirectory(temp, "junit").toFile().toString(); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data variant) USING parquet LOCATION '%s'", + SOURCE_NAME, location); + sql("INSERT INTO TABLE %s VALUES (1, parse_json('{\"key\": 123}'))", SOURCE_NAME); + + Object result = + scalarSql( + "CALL %s.system.snapshot('%s', '%s', properties => map('format-version','3'))", + catalogName, SOURCE_NAME, tableName); + assertThat(result).as("Should have added one file").isEqualTo(1L); + + assertEquals( + "Should have expected rows", + ImmutableList.of(row(1L, 123)), + sql("SELECT id, variant_get(data, '$.key', 'int') FROM %s", tableName)); + } + + @TestTemplate + public void testSnapshotPartitionedWithVariant() throws IOException { + assumeThat(catalogName) + .as("Variant type requires Hive 4 which is not yet supported") + .isNotEqualTo("testhive") + .isNotEqualTo("spark_catalog"); + String location = Files.createTempDirectory(temp, "junit").toFile().toString(); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data variant) USING parquet PARTITIONED BY (id) LOCATION '%s'", + SOURCE_NAME, location); + sql( + "INSERT INTO TABLE %s (id, data) VALUES (1, parse_json('{\"key\": 123}')), (2, parse_json('{\"key\": 456}'))", + SOURCE_NAME); + + Object result = + scalarSql( + "CALL %s.system.snapshot('%s', '%s', properties => map('format-version','3'))", + catalogName, SOURCE_NAME, tableName); + assertThat(result).as("Should have added two files").isEqualTo(2L); + + assertEquals( + "Should have expected rows", + ImmutableList.of(row(123, 1L), row(456, 2L)), + sql("SELECT variant_get(data, '$.key', 'int'), id FROM %s ORDER BY id", tableName)); + } } diff --git a/spark/v4.0/spark-runtime/LICENSE b/spark/v4.0/spark-runtime/LICENSE index a67296eb412c..50c91faf8edb 100644 --- a/spark/v4.0/spark-runtime/LICENSE +++ b/spark/v4.0/spark-runtime/LICENSE @@ -219,6 +219,94 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 
-------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). + +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). 
+ +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. 
Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles Apache Parquet. Copyright: 2014-2024 The Apache Software Foundation @@ -227,7 +315,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Thrift. +This product bundles Apache Thrift (bundled by Parquet). Copyright: 2006-2017 The Apache Software Foundation. Project URL: https://thrift.apache.org/ @@ -235,7 +323,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product includes code from Daniel Lemire's JavaFastPFOR project. +This product includes code from Daniel Lemire's JavaFastPFOR project (bundled by Parquet). 
Copyright: 2013 Daniel Lemire Project URL: https://github.com/lemire/JavaFastPFOR @@ -243,7 +331,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles fastutil. +This product bundles fastutil (bundled by Parquet). Copyright: 2002-2014 Sebastiano Vigna Project URL: http://fastutil.di.unimi.it/ @@ -251,6 +339,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles Zero-Allocation Hashing (bundled by Parquet). + +Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles Apache ORC. Copyright: 2013 and onwards The Apache Software Foundation. @@ -259,7 +354,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Hive. +This product bundles Apache Hive's Storage API (bundled by ORC). Copyright: 2008-2020 The Apache Software Foundation Project URL: https://hive.apache.org/ @@ -267,11 +362,12 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google protobuf. +This product bundles Google protobuf (bundled by ORC). Copyright: 2008 Google Inc. Project URL: https://developers.google.com/protocol-buffers License: BSD 3-Clause + | Copyright 2008 Google Inc. All rights reserved. 
| | Redistribution and use in source and binary forms, with or without @@ -339,6 +435,387 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles the Mozilla Public Suffix List, distributed by Apache HttpComponents. + +Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ + +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. "Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. "Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. 
"Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. "Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. "Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. 
Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. 
Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. 
Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. 
Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. 
However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. 
The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. 
Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. 
Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. + +-------------------------------------------------------------------------------- + This product bundles Google Error Prone Annotations. Copyright: Copyright 2011-2019 The Error Prone Authors @@ -352,6 +829,7 @@ This product bundles checkerframework checker-qual. Copyright: 2004-2019 the Checker Framework developers Project URL: https://github.com/typetools/checker-framework License: MIT license + | The annotations are licensed under the MIT License. (The text of this | license appears below.) More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -390,20 +868,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Perfmark. 
- -Project URL: https://github.com/perfmark/perfmark -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles org.json. - -Project URL: https://github.com/douglascrockford/JSON-java -License: Public Domain - https://github.com/stleary/JSON-java/blob/master/LICENSE - --------------------------------------------------------------------------------- - This product bundles Apache Arrow. Copyright: 2016-2019 The Apache Software Foundation. @@ -420,47 +884,18 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google FlatBuffers. +This product bundles JCTools (via Netty). -Copyright: 2013-2020 Google Inc. -Project URL: https://google.github.io/flatbuffers/ +Project URL: https://github.com/JCTools/JCTools License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles ThreeTen BP. +This product bundles Google FlatBuffers. -Project URL: https://www.threeten.org/threetenbp -License: BSD 3-Clause -| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| -| All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. 
-| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Copyright: 2013-2020 Google Inc. +Project URL: https://google.github.io/flatbuffers/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -469,6 +904,7 @@ This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. Project URL: https://www.threeten.org/threeten-extra/ License: BSD 3-Clause + | All rights reserved. | | * Redistribution and use in source and binary forms, with or without @@ -540,19 +976,11 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache HttpComponents (client and core). - -Copyright: 1999-2022 The Apache Software Foundation. 
-Project URL: https://hc.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Apache HttpComponents Client. +This product bundles and includes code from Apache HttpComponents (core/client). * retry and error handling logic in ExponentialHttpRequestRetryStrategy.java -Copyright: 1999-2022 The Apache Software Foundation. +Copyright: 1999-2022 The Apache Software Foundation Project URL: https://hc.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 @@ -573,16 +1001,46 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Datasketches. +This product bundles Eclipse Collections. -Project URL: https://datasketches.apache.org -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +Project URL: https://github.com/eclipse-collections/eclipse-collections +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php + +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| * Neither the name of the Eclipse Foundation, Inc. 
nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- -This product bundles Zero-Allocation Hashing. +This product bundles Apache Datasketches. -Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing +Project URL: https://datasketches.apache.org License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -590,70 +1048,37 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles JSpecify. 
- -Project URL: https://github.com/jspecify/jspecify -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Animal Sniffer Annotations. - -License: MIT -| The MIT License -| -| Copyright (c) 2009 codehaus.org. -| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This product bundles Android Annotations. - -Project URL: http://source.android.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Conscrypt (openjdk-uber). 
- -Project URL: https://conscrypt.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud BigQuery Client for Java. - -Project URL: https://github.com/googleapis/java-bigquery -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud Java Client Libraries. - -Project URL: https://github.com/googleapis/google-cloud-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| * Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- This product bundles RoaringBitmap. @@ -661,128 +1086,3 @@ This product bundles RoaringBitmap. Copyright: (c) 2013-... the RoaringBitmap authors Project URL: https://github.com/RoaringBitmap/RoaringBitmap License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Credentials for Java. - -Project URL: https://github.com/aliyun/credentials-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Tea for Java. - -Project URL: https://github.com/aliyun/tea-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google GAX. - -Project URL: https://github.com/googleapis/gax-java -License: BSD 3-Clause -| Copyright 2016, Google Inc. All rights reserved. 
-| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This product bundles Google Gson. - -Project URL: https://github.com/google/gson/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles gRPC. 
- -Project URL: https://github.com/grpc/grpc-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenCensus. - -Project URL: https://github.com/census-instrumentation/opencensus-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OkHttp. - -Project URL: https://github.com/square/okhttp -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Auto Value. - -Project URL: https://github.com/google/auto/tree/master/value -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JaCoCo. - -Project URL: https://github.com/jacoco/jacoco/ -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles OpenTelemetry. - -Project URL: https://opentelemetry.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Kotlin. - -Project URL: https://github.com/JetBrains/kotlin -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JAXB. 
- -Project URL: http://jaxb.java.net/ -License: CDDL 1.1 - https://glassfish.java.net/public/CDDL+GPL_1_1.html - --------------------------------------------------------------------------------- - -This product bundles EMMA runtime. - -Project URL: https://github.com/ehelms/Emma/ -License: Common Public License - v 1.0 - --------------------------------------------------------------------------------- - -This product bundles Google j2objc. - -Project URL: https://github.com/google/j2objc/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 diff --git a/spark/v4.0/spark-runtime/NOTICE b/spark/v4.0/spark-runtime/NOTICE index 68abd73906b1..797765628363 100644 --- a/spark/v4.0/spark-runtime/NOTICE +++ b/spark/v4.0/spark-runtime/NOTICE @@ -66,42 +66,6 @@ This product bundles Airlift Aircompressor with the following in its NOTICE file -------------------------------------------------------------------------------- -This product bundles Google Protobuf with the following in its NOTICE file: -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. 
-| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. - --------------------------------------------------------------------------------- - This product bundles Netty with the following in its NOTICE file: | The Netty Project | ================= @@ -394,66 +358,36 @@ This product bundles Eclipse MicroProfile OpenAPI with the following in its NOTI -------------------------------------------------------------------------------- -This product bundles gRPC with the following in its NOTICE file: -| Copyright 2014 The gRPC Authors -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. -| You may obtain a copy of the License at +This product bundles Jackson JSON Processor with the following in its NOTICE file: +| # Jackson JSON processor | -| http://www.apache.org/licenses/LICENSE-2.0 +| Jackson is a high-performance, Free/Open Source JSON processing library. 
+| It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has +| been in development since 2007. +| It is currently developed by a community of developers. | -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. +| ## Copyright | -| ----------------------------------------------------------------------- +| Copyright 2007-, Tatu Saloranta (tatu.saloranta@iki.fi) | -| This product contains a modified portion of 'OkHttp', an open source -| HTTP & SPDY client for Android and Java applications, which can be obtained -| at: +| ## Licensing | -| * LICENSE: -| * okhttp/third_party/okhttp/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/square/okhttp -| * LOCATION_IN_GRPC: -| * okhttp/third_party/okhttp +| Jackson 2.x core and extension components are licensed under Apache License 2.0 +| To find the details that apply to this artifact see the accompanying LICENSE file. | -| This product contains a modified portion of 'Envoy', an open source -| cloud-native high-performance edge/middle/service proxy, which can be -| obtained at: +| ## Credits | -| * LICENSE: -| * xds/third_party/envoy/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/envoy/NOTICE -| * HOMEPAGE: -| * https://www.envoyproxy.io -| * LOCATION_IN_GRPC: -| * xds/third_party/envoy +| A list of contributors may be found from CREDITS(-2.x) file, which is included +| in some artifacts (usually source distributions); but is always available +| from the source code management (SCM) system project uses. 
| -| This product contains a modified portion of 'protoc-gen-validate (PGV)', -| an open source protoc plugin to generate polyglot message validators, -| which can be obtained at: +| ## FastDoubleParser | -| * LICENSE: -| * xds/third_party/protoc-gen-validate/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/protoc-gen-validate/NOTICE -| * HOMEPAGE: -| * https://github.com/envoyproxy/protoc-gen-validate -| * LOCATION_IN_GRPC: -| * xds/third_party/protoc-gen-validate +| jackson-core bundles a shaded copy of FastDoubleParser . +| That code is available under an MIT license +| under the following copyright. | -| This product contains a modified portion of 'udpa', -| an open source universal data plane API, which can be obtained at: +| Copyright © 2023 Werner Randelshofer, Switzerland. MIT License. | -| * LICENSE: -| * xds/third_party/udpa/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/cncf/udpa -| * LOCATION_IN_GRPC: -| * xds/third_party/udpa +| See FastDoubleParser-NOTICE for details of other source code included in FastDoubleParser +| and the licenses and copyrights that apply to that code. 
diff --git a/spark/v4.0/spark-runtime/baseline-class-uniqueness.lock b/spark/v4.0/spark-runtime/baseline-class-uniqueness.lock index 35cad90d888f..6197975f3900 100644 --- a/spark/v4.0/spark-runtime/baseline-class-uniqueness.lock +++ b/spark/v4.0/spark-runtime/baseline-class-uniqueness.lock @@ -77,21 +77,21 @@ - io.netty.buffer.AdaptivePoolingAllocator$SizeClassChunkController - io.netty.buffer.AdaptivePoolingAllocator$SizeClassedChunk - io.netty.buffer.ByteBufUtil - - io.netty.buffer.ByteBufUtil$2 - - io.netty.buffer.ByteBufUtil$HexUtil - - io.netty.buffer.ByteBufUtil$ThreadLocalDirectByteBuf - - io.netty.buffer.ByteBufUtil$ThreadLocalDirectByteBuf$1 - - io.netty.buffer.ByteBufUtil$ThreadLocalUnsafeDirectByteBuf - - io.netty.buffer.ByteBufUtil$ThreadLocalUnsafeDirectByteBuf$1 - io.netty.buffer.CompositeByteBuf - io.netty.buffer.EmptyByteBuf - io.netty.buffer.PoolArena + - io.netty.buffer.PoolArena$DirectArena - io.netty.buffer.PoolThreadCache$FreeOnFinalize - io.netty.buffer.PooledByteBufAllocator + - io.netty.buffer.PooledByteBufAllocator$PoolThreadLocalCache - io.netty.buffer.ReadOnlyAbstractByteBuf - io.netty.buffer.SimpleLeakAwareByteBuf - io.netty.buffer.Unpooled + - io.netty.buffer.UnpooledByteBufAllocator$DecrementingCleanableDirectBuffer + - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledDirectByteBuf + - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeDirectByteBuf - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeNoCleanerDirectByteBuf + - io.netty.buffer.UnpooledByteBufAllocator$UnpooledByteBufAllocatorMetric - io.netty.buffer.UnpooledDirectByteBuf - io.netty.buffer.UnpooledHeapByteBuf - io.netty.buffer.UnpooledUnsafeDirectByteBuf @@ -99,11 +99,19 @@ - io.netty.buffer.UnsafeByteBufUtil [dev.vortex:vortex-jni (classifier=all), io.netty:netty-common] - io.netty.util.AbstractReferenceCounted + - io.netty.util.DefaultAttributeMap - io.netty.util.HashedWheelTimer - 
io.netty.util.HashedWheelTimer$HashedWheelBucket - io.netty.util.LeakPresenceDetector - io.netty.util.LeakPresenceDetector$LeakCreation - io.netty.util.LeakPresenceDetector$ResourceScope + - io.netty.util.Recycler + - io.netty.util.Recycler$BlockingMessageQueue + - io.netty.util.Recycler$DefaultHandle + - io.netty.util.Recycler$EnhancedHandle + - io.netty.util.Recycler$GuardedLocalPool + - io.netty.util.Recycler$LocalPool + - io.netty.util.Recycler$UnguardedLocalPool - io.netty.util.concurrent.AbstractScheduledEventExecutor - io.netty.util.concurrent.GlobalEventExecutor - io.netty.util.concurrent.GlobalEventExecutor$2 @@ -118,11 +126,23 @@ - io.netty.util.concurrent.SingleThreadEventExecutor$4 - io.netty.util.concurrent.SingleThreadEventExecutor$5 - io.netty.util.concurrent.SingleThreadEventExecutor$DefaultThreadProperties + - io.netty.util.internal.Cleaner - io.netty.util.internal.CleanerJava24Linker - io.netty.util.internal.CleanerJava24Linker$CleanableDirectBufferImpl - io.netty.util.internal.CleanerJava25 - io.netty.util.internal.CleanerJava25$CleanableDirectBufferImpl + - io.netty.util.internal.CleanerJava6 + - io.netty.util.internal.CleanerJava6$2 + - io.netty.util.internal.CleanerJava6$CleanableDirectBufferImpl + - io.netty.util.internal.CleanerJava9 + - io.netty.util.internal.CleanerJava9$2 + - io.netty.util.internal.CleanerJava9$CleanableDirectBufferImpl + - io.netty.util.internal.DirectCleaner + - io.netty.util.internal.DirectCleaner$CleanableDirectBufferImpl + - io.netty.util.internal.EmptyArrays - io.netty.util.internal.PlatformDependent + - io.netty.util.internal.PlatformDependent$1 + - io.netty.util.internal.PlatformDependent$1$1 - io.netty.util.internal.PlatformDependent$Mpsc - io.netty.util.internal.PlatformDependent$Mpsc$1 - io.netty.util.internal.PlatformDependent0 diff --git a/spark/v4.0/spark-runtime/runtime-deps.txt b/spark/v4.0/spark-runtime/runtime-deps.txt new file mode 100644 index 000000000000..ec5a5a3785fc --- /dev/null +++ 
b/spark/v4.0/spark-runtime/runtime-deps.txt @@ -0,0 +1,48 @@ +com.fasterxml.jackson.core:jackson-annotations:2.21 +com.fasterxml.jackson.core:jackson-core:2.15.2 +com.fasterxml.jackson.core:jackson-databind:2.15.2 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.21.3 +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.google.errorprone:error_prone_annotations:2.41.0 +com.google.flatbuffers:flatbuffers-java:25.2.10 +com.google.guava:failureaccess:1.0.3 +com.google.guava:guava:33.5.0-jre +com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava +com.google.j2objc:j2objc-annotations:3.1 +com.google.protobuf:protobuf-java:4.33.5 +dev.failsafe:failsafe:3.3.2 +dev.vortex:vortex-jni:0.67.0 +dev.vortex:vortex-spark_2.13:0.67.0 +io.airlift:aircompressor:2.0.3 +io.netty:netty-buffer:4.2.12.Final +io.netty:netty-common:4.2.12.Final +org.apache.arrow:arrow-c-data:18.3.0 +org.apache.arrow:arrow-format:18.3.0 +org.apache.arrow:arrow-memory-core:18.3.0 +org.apache.arrow:arrow-memory-netty-buffer-patch:18.3.0 +org.apache.arrow:arrow-memory-netty:18.3.0 +org.apache.arrow:arrow-vector:18.3.0 +org.apache.avro:avro:1.12.1 +org.apache.datasketches:datasketches-java:6.2.0 +org.apache.datasketches:datasketches-memory:3.0.2 +org.apache.httpcomponents.client5:httpclient5:5.6.1 +org.apache.httpcomponents.core5:httpcore5-h2:5.4 +org.apache.httpcomponents.core5:httpcore5:5.4 +org.apache.orc:orc-core:1.9.8 +org.apache.orc:orc-shims:1.9.8 +org.apache.parquet:parquet-avro:1.17.0 +org.apache.parquet:parquet-column:1.17.0 +org.apache.parquet:parquet-common:1.17.0 +org.apache.parquet:parquet-encoding:1.17.0 +org.apache.parquet:parquet-format-structures:1.17.0 +org.apache.parquet:parquet-hadoop:1.17.0 +org.apache.parquet:parquet-jackson:1.17.0 +org.apache.parquet:parquet-variant:1.17.0 +org.checkerframework:checker-qual:3.19.0 +org.eclipse.microprofile.openapi:microprofile-openapi-api:4.1.1 +org.jspecify:jspecify:1.0.0 +org.locationtech.jts:jts-core:1.20.0 
+org.projectnessie.nessie:nessie-client:0.107.5 +org.projectnessie.nessie:nessie-model:0.107.5 +org.roaringbitmap:RoaringBitmap:1.6.14 +org.threeten:threeten-extra:1.7.1 diff --git a/spark/v4.0/spark/baseline-class-uniqueness.lock b/spark/v4.0/spark/baseline-class-uniqueness.lock index 4a6e30c63973..72c0c24fb849 100644 --- a/spark/v4.0/spark/baseline-class-uniqueness.lock +++ b/spark/v4.0/spark/baseline-class-uniqueness.lock @@ -125,21 +125,21 @@ - io.netty.buffer.AdaptivePoolingAllocator$SizeClassChunkController - io.netty.buffer.AdaptivePoolingAllocator$SizeClassedChunk - io.netty.buffer.ByteBufUtil - - io.netty.buffer.ByteBufUtil$2 - - io.netty.buffer.ByteBufUtil$HexUtil - - io.netty.buffer.ByteBufUtil$ThreadLocalDirectByteBuf - - io.netty.buffer.ByteBufUtil$ThreadLocalDirectByteBuf$1 - - io.netty.buffer.ByteBufUtil$ThreadLocalUnsafeDirectByteBuf - - io.netty.buffer.ByteBufUtil$ThreadLocalUnsafeDirectByteBuf$1 - io.netty.buffer.CompositeByteBuf - io.netty.buffer.EmptyByteBuf - io.netty.buffer.PoolArena + - io.netty.buffer.PoolArena$DirectArena - io.netty.buffer.PoolThreadCache$FreeOnFinalize - io.netty.buffer.PooledByteBufAllocator + - io.netty.buffer.PooledByteBufAllocator$PoolThreadLocalCache - io.netty.buffer.ReadOnlyAbstractByteBuf - io.netty.buffer.SimpleLeakAwareByteBuf - io.netty.buffer.Unpooled + - io.netty.buffer.UnpooledByteBufAllocator$DecrementingCleanableDirectBuffer + - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledDirectByteBuf + - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeDirectByteBuf - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeNoCleanerDirectByteBuf + - io.netty.buffer.UnpooledByteBufAllocator$UnpooledByteBufAllocatorMetric - io.netty.buffer.UnpooledDirectByteBuf - io.netty.buffer.UnpooledHeapByteBuf - io.netty.buffer.UnpooledUnsafeDirectByteBuf @@ -147,11 +147,19 @@ - io.netty.buffer.UnsafeByteBufUtil [dev.vortex:vortex-jni (classifier=all), io.netty:netty-common] - 
io.netty.util.AbstractReferenceCounted + - io.netty.util.DefaultAttributeMap - io.netty.util.HashedWheelTimer - io.netty.util.HashedWheelTimer$HashedWheelBucket - io.netty.util.LeakPresenceDetector - io.netty.util.LeakPresenceDetector$LeakCreation - io.netty.util.LeakPresenceDetector$ResourceScope + - io.netty.util.Recycler + - io.netty.util.Recycler$BlockingMessageQueue + - io.netty.util.Recycler$DefaultHandle + - io.netty.util.Recycler$EnhancedHandle + - io.netty.util.Recycler$GuardedLocalPool + - io.netty.util.Recycler$LocalPool + - io.netty.util.Recycler$UnguardedLocalPool - io.netty.util.concurrent.AbstractScheduledEventExecutor - io.netty.util.concurrent.GlobalEventExecutor - io.netty.util.concurrent.GlobalEventExecutor$2 @@ -166,11 +174,23 @@ - io.netty.util.concurrent.SingleThreadEventExecutor$4 - io.netty.util.concurrent.SingleThreadEventExecutor$5 - io.netty.util.concurrent.SingleThreadEventExecutor$DefaultThreadProperties + - io.netty.util.internal.Cleaner - io.netty.util.internal.CleanerJava24Linker - io.netty.util.internal.CleanerJava24Linker$CleanableDirectBufferImpl - io.netty.util.internal.CleanerJava25 - io.netty.util.internal.CleanerJava25$CleanableDirectBufferImpl + - io.netty.util.internal.CleanerJava6 + - io.netty.util.internal.CleanerJava6$2 + - io.netty.util.internal.CleanerJava6$CleanableDirectBufferImpl + - io.netty.util.internal.CleanerJava9 + - io.netty.util.internal.CleanerJava9$2 + - io.netty.util.internal.CleanerJava9$CleanableDirectBufferImpl + - io.netty.util.internal.DirectCleaner + - io.netty.util.internal.DirectCleaner$CleanableDirectBufferImpl + - io.netty.util.internal.EmptyArrays - io.netty.util.internal.PlatformDependent + - io.netty.util.internal.PlatformDependent$1 + - io.netty.util.internal.PlatformDependent$1$1 - io.netty.util.internal.PlatformDependent$Mpsc - io.netty.util.internal.PlatformDependent$Mpsc$1 - io.netty.util.internal.PlatformDependent0 diff --git 
a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java index e1d9ac18dac1..ad4c0f3e67e4 100644 --- a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java +++ b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java @@ -37,6 +37,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.actions.SparkActions; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -179,6 +180,7 @@ private void setupSpark() { .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", catalogWarehouse()) + .config(TestBase.DISABLE_UI) .master("local"); spark = builder.getOrCreate(); } diff --git a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java index 4978961be641..683f6bb46d05 100644 --- a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java +++ b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java @@ -41,6 +41,7 @@ import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.actions.SparkActions; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; @@ -394,6 +395,7 @@ protected void setupSpark() { 
"spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", getCatalogWarehouse()) + .config(TestBase.DISABLE_UI) .master("local[*]"); spark = builder.getOrCreate(); Configuration sparkHadoopConf = spark.sessionState().newHadoopConf(); diff --git a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java index c6794e43c636..3f242ce228ca 100644 --- a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java +++ b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java @@ -49,6 +49,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.util.ContentFileUtil; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.InternalRow; @@ -234,7 +235,7 @@ private String generateDataFilePath() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java index ac74fb5a109c..db5789724056 100644 --- a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java +++ b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java @@ -43,6 +43,7 @@ import 
org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; @@ -218,7 +219,7 @@ private String generateDataFilePath() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java index 68c537e34a4a..debe37866ff7 100644 --- a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java +++ b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java @@ -30,6 +30,7 @@ import org.apache.iceberg.UpdateProperties; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; @@ -94,7 +95,7 @@ protected void cleanupFiles() throws IOException { } protected void setupSpark(boolean enableDictionaryEncoding) { - SparkSession.Builder builder = SparkSession.builder().config("spark.ui.enabled", false); + SparkSession.Builder builder = SparkSession.builder().config(TestBase.DISABLE_UI); if (!enableDictionaryEncoding) { builder .config("parquet.dictionary.page.size", "1") diff --git 
a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java index ea400a779235..cb9da3edc678 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java @@ -56,7 +56,7 @@ public static R withCommitProperties( ExceptionUtil.castAndThrow(e, exClass); return null; } finally { - COMMIT_PROPERTIES.set(ImmutableMap.of()); + COMMIT_PROPERTIES.remove(); } } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java index bc8a966488ee..f1709277525a 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java @@ -24,17 +24,17 @@ import java.util.function.Function; import org.apache.spark.sql.connector.catalog.Identifier; import org.apache.spark.sql.connector.catalog.StagedTable; -import org.apache.spark.sql.connector.catalog.SupportsDelete; +import org.apache.spark.sql.connector.catalog.SupportsDeleteV2; import org.apache.spark.sql.connector.catalog.SupportsRead; import org.apache.spark.sql.connector.catalog.SupportsWrite; import org.apache.spark.sql.connector.catalog.Table; import org.apache.spark.sql.connector.catalog.TableCapability; import org.apache.spark.sql.connector.catalog.TableCatalog; import org.apache.spark.sql.connector.expressions.Transform; +import org.apache.spark.sql.connector.expressions.filter.Predicate; import org.apache.spark.sql.connector.read.ScanBuilder; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.WriteBuilder; -import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.types.StructType; import 
org.apache.spark.sql.util.CaseInsensitiveStringMap; @@ -58,7 +58,7 @@ * #capabilities()}. */ public class RollbackStagedTable - implements StagedTable, SupportsRead, SupportsWrite, SupportsDelete { + implements StagedTable, SupportsRead, SupportsWrite, SupportsDeleteV2 { private final TableCatalog catalog; private final Identifier ident; private final Table table; @@ -106,8 +106,8 @@ public Set capabilities() { } @Override - public void deleteWhere(Filter[] filters) { - call(SupportsDelete.class, t -> t.deleteWhere(filters)); + public void deleteWhere(Predicate[] predicates) { + call(SupportsDeleteV2.class, t -> t.deleteWhere(predicates)); } @Override diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java index bad31d8d85f4..78d69eeaaf61 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java @@ -35,8 +35,6 @@ public abstract class SparkContentFile implements ContentFile { - private static final FileContent[] FILE_CONTENT_VALUES = FileContent.values(); - private final int fileContentPosition; private final int filePathPosition; private final int fileFormatPosition; @@ -139,7 +137,7 @@ public FileContent content() { if (wrapped.isNullAt(fileContentPosition)) { return null; } - return FILE_CONTENT_VALUES[wrapped.getInt(fileContentPosition)]; + return FileContent.fromId(wrapped.getInt(fileContentPosition)); } @Override diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java index 9a19aa7d1e62..c5fe276aaecb 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java @@ -261,6 +261,39 @@ public int maxRecordsPerMicroBatch() { 
.parse(); } + public boolean asyncMicroBatchPlanningEnabled() { + return confParser + .booleanConf() + .option(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED) + .sessionConf(SparkSQLProperties.ASYNC_MICRO_BATCH_PLANNING_ENABLED) + .defaultValue(SparkSQLProperties.ASYNC_MICRO_BATCH_PLANNING_ENABLED_DEFAULT) + .parse(); + } + + public long streamingSnapshotPollingIntervalMs() { + return confParser + .longConf() + .option(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS) + .defaultValue(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS_DEFAULT) + .parse(); + } + + public long asyncQueuePreloadFileLimit() { + return confParser + .longConf() + .option(SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT) + .defaultValue(SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT_DEFAULT) + .parse(); + } + + public long asyncQueuePreloadRowLimit() { + return confParser + .longConf() + .option(SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT) + .defaultValue(SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT_DEFAULT) + .parse(); + } + public boolean preserveDataGrouping() { return confParser .booleanConf() diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java index 17f2bfee69b8..5262310e2c5e 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java @@ -87,6 +87,21 @@ private SparkReadOptions() {} public static final String STREAMING_MAX_ROWS_PER_MICRO_BATCH = "streaming-max-rows-per-micro-batch"; + // Enable async micro batch planning + public static final String ASYNC_MICRO_BATCH_PLANNING_ENABLED = + "async-micro-batch-planning-enabled"; + + // Polling interval for async planner to refresh table metadata (ms) + public static final String STREAMING_SNAPSHOT_POLLING_INTERVAL_MS = + "streaming-snapshot-polling-interval-ms"; + public static final 
long STREAMING_SNAPSHOT_POLLING_INTERVAL_MS_DEFAULT = 30000L; + + // Initial queue preload limits for async micro batch planner + public static final String ASYNC_QUEUE_PRELOAD_FILE_LIMIT = "async-queue-preload-file-limit"; + public static final long ASYNC_QUEUE_PRELOAD_FILE_LIMIT_DEFAULT = 100L; + public static final String ASYNC_QUEUE_PRELOAD_ROW_LIMIT = "async-queue-preload-row-limit"; + public static final long ASYNC_QUEUE_PRELOAD_ROW_LIMIT_DEFAULT = 100000L; + // Table path public static final String PATH = "path"; diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java index 735ee4efbc35..336aadd73c48 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java @@ -104,6 +104,19 @@ private SparkSQLProperties() {} public static final String REPORT_COLUMN_STATS = "spark.sql.iceberg.report-column-stats"; public static final boolean REPORT_COLUMN_STATS_DEFAULT = true; + // Controls whether to enable async micro batch planning for session + public static final String ASYNC_MICRO_BATCH_PLANNING_ENABLED = + "spark.sql.iceberg.async-micro-batch-planning-enabled"; + public static final boolean ASYNC_MICRO_BATCH_PLANNING_ENABLED_DEFAULT = false; + // Prefix for custom snapshot properties public static final String SNAPSHOT_PROPERTY_PREFIX = "spark.sql.iceberg.snapshot-property."; + + // Controls whether to shred variant columns during write operations + public static final String SHRED_VARIANTS = "spark.sql.iceberg.shred-variants"; + + // Controls the buffer size for variant schema inference during writes + // This determines how many rows are buffered before inferring shredded schema + public static final String VARIANT_INFERENCE_BUFFER_SIZE = + "spark.sql.iceberg.variant-inference-buffer-size"; } diff --git 
a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java index 3b4fc8f48786..0e9edac3fbd5 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java @@ -330,15 +330,15 @@ private static List listPartition( private static SparkPartition toSparkPartition( CatalogTablePartition partition, CatalogTable table) { Option locationUri = partition.storage().locationUri(); - Option serde = partition.storage().serde(); + Option partitionSerde = partition.storage().serde(); Preconditions.checkArgument(locationUri.nonEmpty(), "Partition URI should be defined"); Preconditions.checkArgument( - serde.nonEmpty() || table.provider().nonEmpty(), "Partition format should be defined"); + partitionSerde.nonEmpty() || table.provider().nonEmpty(), + "Partition format should be defined"); String uri = Util.uriToString(locationUri.get()); - String format = serde.nonEmpty() ? serde.get() : table.provider().get(); - + String format = resolveFileFormat(partitionSerde.getOrElse(() -> null), table); Map partitionSpec = JavaConverters.mapAsJavaMapConverter(partition.spec()).asJava(); return new SparkPartition(partitionSpec, uri, format); @@ -683,11 +683,7 @@ private static void importUnpartitionedSparkTable( ExecutorService service) { try { CatalogTable sourceTable = spark.sessionState().catalog().getTableMetadata(sourceTableIdent); - Option format = - sourceTable.storage().serde().nonEmpty() - ? 
sourceTable.storage().serde() - : sourceTable.provider(); - Preconditions.checkArgument(format.nonEmpty(), "Could not determine table format"); + String format = resolveFileFormat(null, sourceTable); Map partition = Collections.emptyMap(); PartitionSpec spec = PartitionSpec.unpartitioned(); @@ -701,7 +697,7 @@ private static void importUnpartitionedSparkTable( TableMigrationUtil.listPartition( partition, Util.uriToString(sourceTable.location()), - format.get(), + format, spec, conf, metricsConfig, @@ -1051,6 +1047,30 @@ public static boolean wapEnabled(Table table) { Boolean.parseBoolean(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED_DEFAULT)); } + private static String resolveFileFormat(String partitionSerde, CatalogTable table) { + if (partitionSerde != null && isKnownFileFormat(partitionSerde)) { + return partitionSerde; + } + + Option serde = table.storage().serde(); + if (serde.nonEmpty() && isKnownFileFormat(serde.get())) { + return serde.get(); + } + + Preconditions.checkArgument( + table.provider().nonEmpty(), + "Could not determine table format from serde %s and no provider set", + serde.getOrElse(() -> "unknown")); + return table.provider().get(); + } + + private static boolean isKnownFileFormat(String serde) { + String lowerSerde = serde.toLowerCase(Locale.ROOT); + return lowerSerde.contains("parquet") + || lowerSerde.contains("avro") + || lowerSerde.contains("orc"); + } + /** Class representing a table partition. 
*/ public static class SparkPartition implements Serializable { private final Map values; diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java index 96131e0e56dd..add12e6040b0 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java @@ -33,6 +33,8 @@ import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL; +import static org.apache.iceberg.TableProperties.PARQUET_SHRED_VARIANTS; +import static org.apache.iceberg.TableProperties.PARQUET_VARIANT_BUFFER_SIZE; import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.DELETE; import java.util.Locale; @@ -42,6 +44,7 @@ import org.apache.iceberg.FileFormat; import org.apache.iceberg.IsolationLevel; import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; import org.apache.iceberg.TableUtil; @@ -164,6 +167,25 @@ public int outputSpecId() { return outputSpecId; } + public int outputSortOrderId(SparkWriteRequirements writeRequirements) { + Integer explicitId = + confParser.intConf().option(SparkWriteOptions.OUTPUT_SORT_ORDER_ID).parseOptional(); + + if (explicitId != null) { + Preconditions.checkArgument( + table.sortOrders().containsKey(explicitId), + "Cannot use output sort order id %s because the table does not contain a sort order with that id", + explicitId); + return explicitId; + } + + if (writeRequirements.hasOrdering()) { + return table.sortOrder().orderId(); + } + + return SortOrder.unsorted().orderId(); + } + public FileFormat dataFileFormat() { String valueAsString = confParser @@ -509,6 +531,14 @@ private Map 
dataWriteProperties() { if (parquetCompressionLevel != null) { writeProperties.put(PARQUET_COMPRESSION_LEVEL, parquetCompressionLevel); } + boolean shouldShredVariants = shredVariants(); + writeProperties.put(PARQUET_SHRED_VARIANTS, String.valueOf(shouldShredVariants)); + + // Add variant shredding configuration properties + if (shouldShredVariants) { + writeProperties.put( + PARQUET_VARIANT_BUFFER_SIZE, String.valueOf(variantInferenceBufferSize())); + } break; case AVRO: @@ -729,4 +759,24 @@ public DeleteGranularity deleteGranularity() { .defaultValue(DeleteGranularity.FILE) .parse(); } + + public boolean shredVariants() { + return confParser + .booleanConf() + .option(SparkWriteOptions.SHRED_VARIANTS) + .sessionConf(SparkSQLProperties.SHRED_VARIANTS) + .tableProperty(TableProperties.PARQUET_SHRED_VARIANTS) + .defaultValue(TableProperties.PARQUET_SHRED_VARIANTS_DEFAULT) + .parse(); + } + + public int variantInferenceBufferSize() { + return confParser + .intConf() + .option(SparkWriteOptions.VARIANT_INFERENCE_BUFFER_SIZE) + .sessionConf(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE) + .tableProperty(TableProperties.PARQUET_VARIANT_BUFFER_SIZE) + .defaultValue(TableProperties.PARQUET_VARIANT_BUFFER_SIZE_DEFAULT) + .parse(); + } } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java index 33db70bae587..6c76b5c873c5 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java @@ -54,6 +54,7 @@ private SparkWriteOptions() {} public static final String REWRITTEN_FILE_SCAN_TASK_SET_ID = "rewritten-file-scan-task-set-id"; public static final String OUTPUT_SPEC_ID = "output-spec-id"; + public static final String OUTPUT_SORT_ORDER_ID = "output-sort-order-id"; public static final String OVERWRITE_MODE = "overwrite-mode"; @@ -85,4 +86,10 @@ 
private SparkWriteOptions() {} // Overrides the delete granularity public static final String DELETE_GRANULARITY = "delete-granularity"; + + // Controls whether to shred variant columns during write operations + public static final String SHRED_VARIANTS = "shred-variants"; + + // Controls the buffer size for variant schema inference during writes + public static final String VARIANT_INFERENCE_BUFFER_SIZE = "variant-inference-buffer-size"; } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java index 569eb252cba5..f1d45a4b142b 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java @@ -47,10 +47,14 @@ import org.apache.spark.sql.connector.expressions.SortOrder; import org.apache.spark.sql.connector.write.RequiresDistributionAndOrdering; import org.apache.spark.sql.execution.datasources.v2.DistributionAndOrderingUtils$; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import scala.Option; abstract class SparkShufflingFileRewriteRunner extends SparkDataFileRewriteRunner { + private static final Logger LOG = LoggerFactory.getLogger(SparkShufflingFileRewriteRunner.class); + /** * The number of shuffle partitions to use for each output file. By default, this file rewriter * assumes each shuffle partition would become a separate output file. 
Attempting to generate @@ -119,6 +123,17 @@ public void doRewrite(String groupId, RewriteFileGroup fileGroup) { spec(fileGroup.outputSpecId()), fileGroup.expectedOutputFiles())); + org.apache.iceberg.SortOrder sortOrderInJobSpec = sortOrder(); + + org.apache.iceberg.SortOrder maybeMatchingTableSortOrder = + SortOrderUtil.findTableSortOrder(table(), sortOrder()); + + if (sortOrderInJobSpec.isSorted() && maybeMatchingTableSortOrder.isUnsorted()) { + LOG.warn( + "Sort order specified for job {} doesn't match any table sort orders, rewritten files will not be marked as sorted in the manifest files", + Spark3Util.describe(sortOrderInJobSpec)); + } + sortedDF .write() .format("iceberg") @@ -126,6 +141,7 @@ public void doRewrite(String groupId, RewriteFileGroup fileGroup) { .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, fileGroup.maxOutputFileSize()) .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false") .option(SparkWriteOptions.OUTPUT_SPEC_ID, fileGroup.outputSpecId()) + .option(SparkWriteOptions.OUTPUT_SORT_ORDER_ID, maybeMatchingTableSortOrder.orderId()) .mode("append") .save(groupId); } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java index d142e3fd1aee..cf9cc8fd511a 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java @@ -24,6 +24,8 @@ import java.nio.ByteBuffer; import java.nio.charset.CharsetEncoder; import java.nio.charset.StandardCharsets; +import java.time.LocalDateTime; +import org.apache.iceberg.util.DateTimeUtil; import org.apache.iceberg.util.ZOrderByteUtils; import org.apache.spark.sql.Column; import org.apache.spark.sql.expressions.UserDefinedFunction; @@ -40,6 +42,7 @@ import org.apache.spark.sql.types.LongType; import org.apache.spark.sql.types.ShortType; import 
org.apache.spark.sql.types.StringType; +import org.apache.spark.sql.types.TimestampNTZType; import org.apache.spark.sql.types.TimestampType; import scala.collection.JavaConverters; import scala.collection.Seq; @@ -180,6 +183,29 @@ value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) return udf; } + private UserDefinedFunction timestampNtzToOrderedBytesUDF() { + int position = inputCol; + UserDefinedFunction udf = + functions + .udf( + (LocalDateTime value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + long micros = DateTimeUtil.microsFromTimestamp(value); + return ZOrderByteUtils.longToOrderedBytes( + micros, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("TIMESTAMP_NTZ_ORDERED_BYTES"); + + this.inputCol++; + increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); + + return udf; + } + private UserDefinedFunction floatToOrderedBytesUDF() { int position = inputCol; UserDefinedFunction udf = @@ -309,6 +335,8 @@ Column sortedLexicographically(Column column, DataType type) { return booleanToOrderedBytesUDF().apply(column); } else if (type instanceof TimestampType) { return longToOrderedBytesUDF().apply(column.cast(DataTypes.LongType)); + } else if (type instanceof TimestampNTZType) { + return timestampNtzToOrderedBytesUDF().apply(column); } else if (type instanceof DateType) { return longToOrderedBytesUDF().apply(functions.unix_date(column).cast(DataTypes.LongType)); } else { diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java new file mode 100644 index 000000000000..3e442f9917d4 --- /dev/null +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java @@ -0,0 +1,543 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; +import java.util.LinkedList; +import java.util.List; +import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.MicroBatches; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.util.Pair; +import org.apache.spark.sql.connector.read.streaming.ReadAllAvailable; +import org.apache.spark.sql.connector.read.streaming.ReadLimit; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class AsyncSparkMicroBatchPlanner extends BaseSparkMicroBatchPlanner implements AutoCloseable { + private static final Logger LOG = 
LoggerFactory.getLogger(AsyncSparkMicroBatchPlanner.class); + private static final int PLAN_FILES_CACHE_MAX_SIZE = 10; + private static final long QUEUE_POLL_TIMEOUT_MS = 100L; // 100 ms + + private final long minQueuedFiles; + private final long minQueuedRows; + + // Cache for planFiles results to handle duplicate calls + private final Cache, List> planFilesCache; + + // Queue to buffer pre-fetched file scan tasks + private final LinkedBlockingDeque> queue; + + // Background executor for async operations + private final ScheduledExecutorService executor; + + // Error tracking + private volatile Throwable refreshFailedThrowable; + private volatile Throwable fillQueueFailedThrowable; + + // Tracking queue state + private final AtomicLong queuedFileCount = new AtomicLong(0); + private final AtomicLong queuedRowCount = new AtomicLong(0); + private Snapshot lastQueuedSnapshot; + private boolean stopped; + + // Cap for Trigger.AvailableNow - don't process beyond this offset + private final StreamingOffset lastOffsetForTriggerAvailableNow; + + /** + * This class manages a queue of FileScanTask + StreamingOffset. On creation, it starts up an + * asynchronous polling process which populates the queue when a new snapshot arrives or the + * minimum amount of queued data is too low. + * + *

      Note: this will capture the state of the table when snapshots are added to the queue. If a + * snapshot is expired after being added to the queue, the job will still process it. + */ + AsyncSparkMicroBatchPlanner( + Table table, + SparkReadConf readConf, + StreamingOffset initialOffset, + StreamingOffset maybeEndOffset, + StreamingOffset lastOffsetForTriggerAvailableNow) { + super(table, readConf); + this.minQueuedFiles = readConf().maxFilesPerMicroBatch(); + this.minQueuedRows = readConf().maxRecordsPerMicroBatch(); + this.lastOffsetForTriggerAvailableNow = lastOffsetForTriggerAvailableNow; + this.planFilesCache = Caffeine.newBuilder().maximumSize(PLAN_FILES_CACHE_MAX_SIZE).build(); + this.queue = new LinkedBlockingDeque<>(); + + table().refresh(); + + // Synchronously add data to the queue to meet our initial constraints. + // For Trigger.AvailableNow, constructor-time preload is normally initialized from + // latestOffset(...) with no explicit end offset, so bounded preload must stop at + // Trigger.AvailableNow snapshot. 
+ fillQueue(initialOffset, maybeEndOffset); + + this.executor = + Executors.newSingleThreadScheduledExecutor( + r -> { + Thread thread = new Thread(r, "iceberg-async-planner-" + table().name()); + thread.setDaemon(true); + return thread; + }); + // Schedule table refresh at configured interval + long pollingIntervalMs = readConf().streamingSnapshotPollingIntervalMs(); + this.executor.scheduleWithFixedDelay( + this::refreshAndTrapException, pollingIntervalMs, pollingIntervalMs, TimeUnit.MILLISECONDS); + // Schedule queue fill to run frequently (use polling interval for tests, cap at 100ms for + // production) + long queueFillIntervalMs = Math.min(QUEUE_POLL_TIMEOUT_MS, pollingIntervalMs); + executor.scheduleWithFixedDelay( + () -> fillQueueAndTrapException(lastQueuedSnapshot), + 0, + queueFillIntervalMs, + TimeUnit.MILLISECONDS); + + LOG.info( + "Started AsyncSparkMicroBatchPlanner for {} from initialOffset: {}", + table().name(), + initialOffset); + } + + @Override + public synchronized void stop() { + Preconditions.checkArgument( + !stopped, "AsyncSparkMicroBatchPlanner for {} was already stopped", table().name()); + stopped = true; + LOG.info("Stopping AsyncSparkMicroBatchPlanner for table: {}", table().name()); + executor.shutdownNow(); + boolean terminated = false; + try { + terminated = + executor.awaitTermination( + readConf().streamingSnapshotPollingIntervalMs() * 2, TimeUnit.MILLISECONDS); + } catch (InterruptedException ignored) { + // Restore interrupt status + Thread.currentThread().interrupt(); + } + LOG.info("AsyncSparkMicroBatchPlanner for table: {}, stopped: {}", table().name(), terminated); + } + + @Override + public void close() { + stop(); + } + + /** + * Spark can call this multiple times; it should produce the same answer every time. 
+ * + * @param startOffset the starting offset of this microbatch, position is inclusive + * @param endOffset the end offset of this microbatch, position is exclusive + * @return the list of files to scan between these offsets + */ + @Override + public synchronized List planFiles( + StreamingOffset startOffset, StreamingOffset endOffset) { + return planFilesCache.get( + Pair.of(startOffset, endOffset), + key -> { + LOG.info( + "running planFiles for {}, startOffset: {}, endOffset: {}", + table().name(), + startOffset, + endOffset); + List result = new LinkedList<>(); + Pair elem; + StreamingOffset currentOffset; + boolean shouldTerminate = false; + long filesInPlan = 0; + long rowsInPlan = 0; + + do { + try { + elem = queue.pollFirst(QUEUE_POLL_TIMEOUT_MS, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while polling queue", e); + } + + if (elem != null) { + currentOffset = elem.first(); + LOG.debug("planFiles consumed: {}", currentOffset); + FileScanTask currentTask = elem.second(); + filesInPlan += 1; + long elemRows = currentTask.file().recordCount(); + rowsInPlan += elemRows; + queuedFileCount.decrementAndGet(); + queuedRowCount.addAndGet(-elemRows); + result.add(currentTask); + + // try to peek at the next entry of the queue and see if we should stop + Pair nextElem = queue.peekFirst(); + boolean endOffsetPeek = false; + if (nextElem != null) { + endOffsetPeek = endOffset.equals(nextElem.first()); + } + // end offset may be synthetic and not exist in the queue + boolean endOffsetSynthetic = + currentOffset.snapshotId() == endOffset.snapshotId() + && (currentOffset.position() + 1) == endOffset.position(); + shouldTerminate = endOffsetPeek || endOffsetSynthetic; + } else { + LOG.trace("planFiles hasn't reached {}, waiting", endOffset); + } + } while (!shouldTerminate + && refreshFailedThrowable == null + && fillQueueFailedThrowable == null); + + if 
(refreshFailedThrowable != null) { + throw new RuntimeException("Table refresh failed", refreshFailedThrowable); + } + + if (fillQueueFailedThrowable != null) { + throw new RuntimeException("Queue filling failed", fillQueueFailedThrowable); + } + + LOG.info( + "completed planFiles for {}, startOffset: {}, endOffset: {}, files: {}, rows: {}", + table().name(), + startOffset, + endOffset, + filesInPlan, + rowsInPlan); + return result; + }); + } + + /** + * This needs to be non destructive on the queue as spark could call this multiple times. Each + * time, depending on the table state it could return something different + * + * @param startOffset the starting offset of the next microbatch + * @param limit a limit for how many files/bytes/rows the next microbatch should include + * @return The end offset to use for the next microbatch, null signals that no data is available + */ + @Override + public synchronized StreamingOffset latestOffset(StreamingOffset startOffset, ReadLimit limit) { + LOG.info( + "running latestOffset for {}, startOffset: {}, limit: {}", + table().name(), + startOffset, + limit); + + if (table().currentSnapshot() == null) { + LOG.info("latestOffset returning START_OFFSET, currentSnapshot() is null"); + return StreamingOffset.START_OFFSET; + } + + if (table().currentSnapshot().timestampMillis() < readConf().streamFromTimestamp()) { + LOG.info("latestOffset returning START_OFFSET, currentSnapshot() < fromTimestamp"); + return StreamingOffset.START_OFFSET; + } + + // if any exceptions were encountered in the background process, raise them here + if (refreshFailedThrowable != null) { + throw new RuntimeException(refreshFailedThrowable); + } + if (fillQueueFailedThrowable != null) { + throw new RuntimeException(fillQueueFailedThrowable); + } + + // if we want to read all available we don't need to scan files, just snapshots + if (limit instanceof ReadAllAvailable) { + // If Trigger.AvailableNow cap is set, return it directly + if 
(this.lastOffsetForTriggerAvailableNow != null) { + return this.lastOffsetForTriggerAvailableNow; + } + Snapshot lastValidSnapshot = table().snapshot(startOffset.snapshotId()); + Snapshot nextValidSnapshot; + do { + nextValidSnapshot = nextValidSnapshot(lastValidSnapshot); + if (nextValidSnapshot != null) { + lastValidSnapshot = nextValidSnapshot; + } + } while (nextValidSnapshot != null); + return new StreamingOffset( + lastValidSnapshot.snapshotId(), + MicroBatchUtils.addedFilesCount(table(), lastValidSnapshot), + false); + } + + return computeLimitedOffset(limit); + } + + private StreamingOffset computeLimitedOffset(ReadLimit limit) { + UnpackedLimits unpackedLimits = new UnpackedLimits(limit); + long rowsSeen = 0; + long filesSeen = 0; + LOG.debug( + "latestOffset queue status, queuedFiles: {}, queuedRows: {}", + queuedFileCount.get(), + queuedRowCount.get()); + + List> queueSnapshot = Lists.newArrayList(queue); + Pair queueTail = + queueSnapshot.isEmpty() ? null : queueSnapshot.get(queueSnapshot.size() - 1); + + for (int i = 0; i < queueSnapshot.size(); i++) { + Pair elem = queueSnapshot.get(i); + long fileRows = elem.second().file().recordCount(); + + // Hard limit on files - stop BEFORE exceeding + if (filesSeen + 1 > unpackedLimits.getMaxFiles()) { + if (filesSeen == 0) { + return null; + } + LOG.debug( + "latestOffset hit file limit at {}, rows: {}, files: {}", + elem.first(), + rowsSeen, + filesSeen); + return elem.first(); + } + + // Soft limit on rows - include file FIRST, then check + rowsSeen += fileRows; + filesSeen += 1; + + // Check if we've hit the row limit after including this file + if (rowsSeen >= unpackedLimits.getMaxRows()) { + if (filesSeen == 1 && rowsSeen > unpackedLimits.getMaxRows()) { + LOG.warn( + "File {} at offset {} contains {} records, exceeding maxRecordsPerMicroBatch limit of {}. " + + "This file will be processed entirely to guarantee forward progress. 
" + + "Consider increasing the limit or writing smaller files to avoid unexpected memory usage.", + elem.second().file().location(), + elem.first(), + fileRows, + unpackedLimits.getMaxRows()); + } + // Return the offset of the NEXT element (or synthesize tail+1) + if (i + 1 < queueSnapshot.size()) { + LOG.debug( + "latestOffset hit row limit at {}, rows: {}, files: {}", + queueSnapshot.get(i + 1).first(), + rowsSeen, + filesSeen); + return queueSnapshot.get(i + 1).first(); + } else { + // This is the last element - return tail+1 + StreamingOffset current = elem.first(); + StreamingOffset result = + new StreamingOffset( + current.snapshotId(), current.position() + 1, current.shouldScanAllFiles()); + LOG.debug( + "latestOffset hit row limit at tail {}, rows: {}, files: {}", + result, + rowsSeen, + filesSeen); + return result; + } + } + } + + // if we got here there aren't enough files to exceed our limits + if (queueTail != null) { + StreamingOffset tailOffset = queueTail.first(); + // we have to increment the position by 1 since we want to include the tail in the read and + // position is non-inclusive + StreamingOffset latestOffset = + new StreamingOffset( + tailOffset.snapshotId(), tailOffset.position() + 1, tailOffset.shouldScanAllFiles()); + LOG.debug("latestOffset returning all queued data {}", latestOffset); + return latestOffset; + } + + // if we got here the queue is empty + LOG.debug("latestOffset no data, returning null"); + return null; + } + + // Background task wrapper that traps exceptions + private void refreshAndTrapException() { + try { + table().refresh(); + } catch (Throwable t) { + LOG.error("Failed to refresh table {}", table().name(), t); + refreshFailedThrowable = t; + } + } + + // Background task wrapper that traps exceptions + private void fillQueueAndTrapException(Snapshot snapshot) { + try { + fillQueue(snapshot); + } catch (Throwable t) { + LOG.error("Failed to fill queue for table {}", table().name(), t); + fillQueueFailedThrowable = t; 
+ } + } + + /** Generate a MicroBatch based on input parameters and add to the queue */ + private void addMicroBatchToQueue( + Snapshot snapshot, long startFileIndex, long endFileIndex, boolean shouldScanAllFile) { + LOG.info("Adding MicroBatch for snapshot: {} to the queue", snapshot.snapshotId()); + MicroBatches.MicroBatch microBatch = + MicroBatches.from(snapshot, table().io()) + .caseSensitive(readConf().caseSensitive()) + .specsById(table().specs()) + .generate(startFileIndex, endFileIndex, Long.MAX_VALUE, shouldScanAllFile); + + long position = startFileIndex; + for (FileScanTask task : microBatch.tasks()) { + Pair elem = + Pair.of(new StreamingOffset(microBatch.snapshotId(), position, shouldScanAllFile), task); + queuedFileCount.incrementAndGet(); + queuedRowCount.addAndGet(task.file().recordCount()); + queue.addLast(elem); + position += 1; + } + if (LOG.isDebugEnabled()) { + StringBuilder sb = new StringBuilder("\n"); + for (Pair elem : queue) { + sb.append(elem.first()).append("\n"); + } + LOG.debug(sb.toString()); + } + lastQueuedSnapshot = snapshot; + } + + private void fillQueue(StreamingOffset fromOffset, StreamingOffset toOffset) { + LOG.debug("filling queue from {}, to: {}", fromOffset, toOffset); + Snapshot currentSnapshot = table().snapshot(fromOffset.snapshotId()); + // this could be a partial snapshot so add it outside the loop + if (currentSnapshot != null) { + addMicroBatchToQueue( + currentSnapshot, + fromOffset.position(), + MicroBatchUtils.addedFilesCount(table(), currentSnapshot), + fromOffset.shouldScanAllFiles()); + } + if (toOffset != null) { + if (currentSnapshot != null) { + while (currentSnapshot.snapshotId() != toOffset.snapshotId()) { + currentSnapshot = nextValidSnapshot(currentSnapshot); + if (currentSnapshot != null) { + addMicroBatchToQueue( + currentSnapshot, + 0, + MicroBatchUtils.addedFilesCount(table(), currentSnapshot), + false); + } else { + break; + } + } + } + // toOffset snapshot already added in loop when 
currentSnapshot == toOffset + } else { + fillQueueInitialBuffer(currentSnapshot); + } + } + + private void fillQueueInitialBuffer(Snapshot startSnapshot) { + // toOffset is null - fill initial buffer to prevent queue starvation before background + // thread starts. Use configured limits to avoid loading all snapshots + // (which could cause OOM on tables with thousands of snapshots). + long targetRows = readConf().asyncQueuePreloadRowLimit(); + long targetFiles = readConf().asyncQueuePreloadFileLimit(); + + Snapshot preloadEndSnapshot = initialPreloadEndSnapshot(); + if (preloadEndSnapshot == null) { + return; // Empty table + } + + // START_OFFSET case: initialize using nextValidSnapshot which respects timestamp filtering + Snapshot current = startSnapshot; + if (current == null) { + current = nextValidSnapshot(null); + if (current != null) { + addMicroBatchToQueue(current, 0, MicroBatchUtils.addedFilesCount(table(), current), false); + } + } + + // Continue loading more snapshots within safety limits + if (current != null) { + while ((queuedRowCount.get() < targetRows || queuedFileCount.get() < targetFiles) + && current.snapshotId() != preloadEndSnapshot.snapshotId()) { + current = nextValidSnapshot(current); + if (current != null) { + addMicroBatchToQueue( + current, 0, MicroBatchUtils.addedFilesCount(table(), current), false); + } else { + break; + } + } + } + } + + private Snapshot initialPreloadEndSnapshot() { + if (lastOffsetForTriggerAvailableNow != null) { + return table().snapshot(lastOffsetForTriggerAvailableNow.snapshotId()); + } + + return table().currentSnapshot(); + } + + @VisibleForTesting + static boolean reachedAvailableNowCap( + Snapshot readFrom, StreamingOffset lastOffsetForTriggerAvailableNow) { + return lastOffsetForTriggerAvailableNow != null + && readFrom != null + && readFrom.snapshotId() == lastOffsetForTriggerAvailableNow.snapshotId(); + } + + /** Try to populate the queue with data from unread snapshots */ + private void 
fillQueue(Snapshot readFrom) { + // Don't add beyond cap for Trigger.AvailableNow + if (reachedAvailableNowCap(readFrom, lastOffsetForTriggerAvailableNow)) { + LOG.debug( + "Reached cap snapshot {}, not adding more", + this.lastOffsetForTriggerAvailableNow.snapshotId()); + return; + } + + if ((queuedRowCount.get() > minQueuedRows) || (queuedFileCount.get() > minQueuedFiles)) { + // we have enough data buffered, check back shortly + LOG.debug( + "Buffer is full, {} > {} or {} > {}", + queuedRowCount.get(), + minQueuedRows, + queuedFileCount.get(), + minQueuedFiles); + } else { + // add an entire snapshot to the queue + Snapshot nextValidSnapshot = nextValidSnapshot(readFrom); + if (nextValidSnapshot != null) { + addMicroBatchToQueue( + nextValidSnapshot, + 0, + MicroBatchUtils.addedFilesCount(table(), nextValidSnapshot), + false); + } else { + LOG.debug("No snapshots ready to be read"); + } + } + } +} diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseSparkMicroBatchPlanner.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseSparkMicroBatchPlanner.java new file mode 100644 index 000000000000..9298c2bbdfcc --- /dev/null +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseSparkMicroBatchPlanner.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import java.util.Locale; +import org.apache.iceberg.DataOperations; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.spark.sql.connector.read.streaming.CompositeReadLimit; +import org.apache.spark.sql.connector.read.streaming.ReadLimit; +import org.apache.spark.sql.connector.read.streaming.ReadMaxFiles; +import org.apache.spark.sql.connector.read.streaming.ReadMaxRows; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +abstract class BaseSparkMicroBatchPlanner implements SparkMicroBatchPlanner { + private static final Logger LOG = LoggerFactory.getLogger(BaseSparkMicroBatchPlanner.class); + private final Table table; + private final SparkReadConf readConf; + + BaseSparkMicroBatchPlanner(Table table, SparkReadConf readConf) { + this.table = table; + this.readConf = readConf; + } + + protected Table table() { + return table; + } + + protected SparkReadConf readConf() { + return readConf; + } + + protected boolean shouldProcess(Snapshot snapshot) { + String op = snapshot.operation(); + switch (op) { + case DataOperations.APPEND: + return true; + case DataOperations.REPLACE: + return false; + case DataOperations.DELETE: + Preconditions.checkState( + readConf.streamingSkipDeleteSnapshots(), + "Cannot process delete snapshot: %s, to 
ignore deletes, set %s=true", + snapshot.snapshotId(), + SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS); + return false; + case DataOperations.OVERWRITE: + Preconditions.checkState( + readConf.streamingSkipOverwriteSnapshots(), + "Cannot process overwrite snapshot: %s, to ignore overwrites, set %s=true", + snapshot.snapshotId(), + SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS); + return false; + default: + throw new IllegalStateException( + String.format( + "Cannot process unknown snapshot operation: %s (snapshot id %s)", + op.toLowerCase(Locale.ROOT), snapshot.snapshotId())); + } + } + + /** + * Get the next snapshot skipping over rewrite and delete snapshots. Async must handle nulls. + * + * @param curSnapshot the current snapshot + * @return the next valid snapshot (not a rewrite or delete snapshot), returns null if all + * remaining snapshots should be skipped. + */ + protected Snapshot nextValidSnapshot(Snapshot curSnapshot) { + Snapshot nextSnapshot; + // if there were no valid snapshots, check for an initialOffset again + if (curSnapshot == null) { + StreamingOffset startingOffset = + MicroBatchUtils.determineStartingOffset(table, readConf.streamFromTimestamp()); + LOG.debug("determineStartingOffset picked startingOffset: {}", startingOffset); + if (StreamingOffset.START_OFFSET.equals(startingOffset)) { + return null; + } + nextSnapshot = table.snapshot(startingOffset.snapshotId()); + } else { + if (curSnapshot.snapshotId() == table.currentSnapshot().snapshotId()) { + return null; + } + nextSnapshot = SnapshotUtil.snapshotAfter(table, curSnapshot.snapshotId()); + } + // skip over rewrite and delete snapshots + while (!shouldProcess(nextSnapshot)) { + LOG.debug("Skipping snapshot: {}", nextSnapshot); + // if the currentSnapShot was also the mostRecentSnapshot then break + // avoids snapshotAfter throwing exception since there are no more snapshots to process + if (nextSnapshot.snapshotId() == table.currentSnapshot().snapshotId()) { + return null; + } 
+ nextSnapshot = SnapshotUtil.snapshotAfter(table, nextSnapshot.snapshotId()); + } + return nextSnapshot; + } + + static class UnpackedLimits { + private long maxRows = Integer.MAX_VALUE; + private long maxFiles = Integer.MAX_VALUE; + + UnpackedLimits(ReadLimit limit) { + if (limit instanceof CompositeReadLimit) { + ReadLimit[] compositeLimits = ((CompositeReadLimit) limit).getReadLimits(); + for (ReadLimit individualLimit : compositeLimits) { + if (individualLimit instanceof ReadMaxRows) { + ReadMaxRows readMaxRows = (ReadMaxRows) individualLimit; + this.maxRows = Math.min(this.maxRows, readMaxRows.maxRows()); + } else if (individualLimit instanceof ReadMaxFiles) { + ReadMaxFiles readMaxFiles = (ReadMaxFiles) individualLimit; + this.maxFiles = Math.min(this.maxFiles, readMaxFiles.maxFiles()); + } + } + } else if (limit instanceof ReadMaxRows) { + this.maxRows = ((ReadMaxRows) limit).maxRows(); + } else if (limit instanceof ReadMaxFiles) { + this.maxFiles = ((ReadMaxFiles) limit).maxFiles(); + } + } + + public long getMaxRows() { + return maxRows; + } + + public long getMaxFiles() { + return maxFiles; + } + } +} diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/MicroBatchUtils.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/MicroBatchUtils.java new file mode 100644 index 000000000000..7c73e3f416e3 --- /dev/null +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/MicroBatchUtils.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.SnapshotChanges; +import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.Table; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.util.PropertyUtil; +import org.apache.iceberg.util.SnapshotUtil; + +class MicroBatchUtils { + + private MicroBatchUtils() {} + + static StreamingOffset determineStartingOffset(Table table, long fromTimestamp) { + if (table.currentSnapshot() == null) { + return StreamingOffset.START_OFFSET; + } + + if (fromTimestamp == Long.MIN_VALUE) { + // start from the oldest snapshot, since default value is MIN_VALUE + // avoids looping to find first snapshot + return new StreamingOffset(SnapshotUtil.oldestAncestor(table).snapshotId(), 0, false); + } + + if (table.currentSnapshot().timestampMillis() < fromTimestamp) { + return StreamingOffset.START_OFFSET; + } + + try { + Snapshot snapshot = SnapshotUtil.oldestAncestorAfter(table, fromTimestamp); + if (snapshot != null) { + return new StreamingOffset(snapshot.snapshotId(), 0, false); + } else { + return StreamingOffset.START_OFFSET; + } + } catch (IllegalStateException e) { + // could not determine the first snapshot after the timestamp. 
use the oldest ancestor instead + return new StreamingOffset(SnapshotUtil.oldestAncestor(table).snapshotId(), 0, false); + } + } + + static long addedFilesCount(Table table, Snapshot snapshot) { + long addedFilesCount = + PropertyUtil.propertyAsLong(snapshot.summary(), SnapshotSummary.ADDED_FILES_PROP, -1); + return addedFilesCount == -1 + ? Iterables.size( + SnapshotChanges.builderFor(table).snapshot(snapshot).build().addedDataFiles()) + : addedFilesCount; + } +} diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java index 3f2e0f1af08f..5c973ae711b9 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java @@ -55,7 +55,9 @@ public static void register() { StructType.class, SparkParquetWriters::buildWriter, (icebergSchema, fileSchema, engineSchema, idToConstant) -> - SparkParquetReaders.buildReader(icebergSchema, fileSchema, idToConstant))); + SparkParquetReaders.buildReader(icebergSchema, fileSchema, idToConstant), + new SparkVariantShreddingAnalyzer(), + InternalRow::copy)); FormatModelRegistry.register( ParquetFormatModel.create( diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchPlanner.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchPlanner.java new file mode 100644 index 000000000000..1986ddac5d8e --- /dev/null +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchPlanner.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import java.util.List; +import org.apache.iceberg.FileScanTask; +import org.apache.spark.sql.connector.read.streaming.ReadLimit; + +interface SparkMicroBatchPlanner { + /** + * Return the {@link FileScanTask}s for data added between the start and end offsets. + * + * @param startOffset the offset to start planning from + * @param endOffset the offset to plan up to + * @return file scan tasks for data in the offset range + */ + List planFiles(StreamingOffset startOffset, StreamingOffset endOffset); + + /** + * Return the latest offset the stream can advance to from {@code startOffset}, respecting the + * given {@link ReadLimit}. + * + * @param startOffset the current offset of the stream + * @param limit the read limit bounding how far ahead to advance + * @return the latest available offset, or {@code null} if no new data is available + */ + StreamingOffset latestOffset(StreamingOffset startOffset, ReadLimit limit); + + /** Stop the planner and release any resources. 
*/ + void stop(); +} diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java index d54246e6d513..a1ff767fe2a0 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java @@ -26,48 +26,32 @@ import java.io.UncheckedIOException; import java.nio.charset.StandardCharsets; import java.util.List; -import java.util.Locale; import java.util.function.Supplier; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DataOperations; import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.MicroBatches; -import org.apache.iceberg.MicroBatches.MicroBatch; import org.apache.iceberg.Schema; import org.apache.iceberg.SchemaParser; import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotChanges; -import org.apache.iceberg.SnapshotSummary; import org.apache.iceberg.Table; import org.apache.iceberg.hadoop.HadoopFileIO; import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.relocated.com.google.common.base.Joiner; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadConf; -import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.Pair; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.SnapshotUtil; import 
org.apache.iceberg.util.TableScanUtil; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.broadcast.Broadcast; import org.apache.spark.sql.connector.read.InputPartition; import org.apache.spark.sql.connector.read.PartitionReaderFactory; -import org.apache.spark.sql.connector.read.streaming.CompositeReadLimit; import org.apache.spark.sql.connector.read.streaming.MicroBatchStream; import org.apache.spark.sql.connector.read.streaming.Offset; import org.apache.spark.sql.connector.read.streaming.ReadLimit; -import org.apache.spark.sql.connector.read.streaming.ReadMaxFiles; -import org.apache.spark.sql.connector.read.streaming.ReadMaxRows; import org.apache.spark.sql.connector.read.streaming.SupportsTriggerAvailableNow; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -79,6 +63,7 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsTriggerA private final Table table; private final Supplier fileIO; + private final SparkReadConf readConf; private final String branch; private final boolean caseSensitive; private final String expectedSchema; @@ -89,12 +74,11 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsTriggerA private final long splitOpenFileCost; private final boolean localityPreferred; private final StreamingOffset initialOffset; - private final boolean skipDelete; - private final boolean skipOverwrite; private final long fromTimestamp; private final int maxFilesPerMicroBatch; private final int maxRecordsPerMicroBatch; private final boolean cacheDeleteFilesOnExecutors; + private SparkMicroBatchPlanner planner; private StreamingOffset lastOffsetForTriggerAvailableNow; SparkMicroBatchStream( @@ -106,6 +90,7 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsTriggerA String checkpointLocation) { this.table = table; this.fileIO = fileIO; + this.readConf = readConf; this.branch = readConf.branch(); this.caseSensitive = readConf.caseSensitive(); this.expectedSchema = 
SchemaParser.toJson(expectedSchema); @@ -124,9 +109,6 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsTriggerA new InitialOffsetStore( table, checkpointLocation, fromTimestamp, sparkContext.hadoopConfiguration()); this.initialOffset = initialOffsetStore.initialOffset(); - - this.skipDelete = readConf.streamingSkipDeleteSnapshots(); - this.skipOverwrite = readConf.streamingSkipOverwriteSnapshots(); } @Override @@ -141,8 +123,8 @@ public Offset latestOffset() { } Snapshot latestSnapshot = table.currentSnapshot(); - - return new StreamingOffset(latestSnapshot.snapshotId(), addedFilesCount(latestSnapshot), false); + return new StreamingOffset( + latestSnapshot.snapshotId(), MicroBatchUtils.addedFilesCount(table, latestSnapshot), false); } @Override @@ -161,7 +143,11 @@ public InputPartition[] planInputPartitions(Offset start, Offset end) { StreamingOffset endOffset = (StreamingOffset) end; StreamingOffset startOffset = (StreamingOffset) start; - List fileScanTasks = planFiles(startOffset, endOffset); + if (planner == null) { + initializePlanner(startOffset, endOffset); + } + + List fileScanTasks = planner.planFiles(startOffset, endOffset); CloseableIterable splitTasks = TableScanUtil.splitFiles(CloseableIterable.withNoopClose(fileScanTasks), splitSize); @@ -171,7 +157,6 @@ public InputPartition[] planInputPartitions(Offset start, Offset end) { String[][] locations = computePreferredLocations(combinedScanTasks); InputPartition[] partitions = new InputPartition[combinedScanTasks.size()]; - for (int index = 0; index < combinedScanTasks.size(); index++) { partitions[index] = new SparkInputPartition( @@ -214,318 +199,35 @@ public Offset deserializeOffset(String json) { public void commit(Offset end) {} @Override - public void stop() {} - - private List planFiles(StreamingOffset startOffset, StreamingOffset endOffset) { - List fileScanTasks = Lists.newArrayList(); - StreamingOffset batchStartOffset = - 
StreamingOffset.START_OFFSET.equals(startOffset) - ? determineStartingOffset(table, fromTimestamp) - : startOffset; - - StreamingOffset currentOffset = null; - - // [(startOffset : startFileIndex), (endOffset : endFileIndex) ) - do { - long endFileIndex; - if (currentOffset == null) { - currentOffset = batchStartOffset; - } else { - Snapshot snapshotAfter = SnapshotUtil.snapshotAfter(table, currentOffset.snapshotId()); - // it may happen that we need to read this snapshot partially in case it's equal to - // endOffset. - if (currentOffset.snapshotId() != endOffset.snapshotId()) { - currentOffset = new StreamingOffset(snapshotAfter.snapshotId(), 0L, false); - } else { - currentOffset = endOffset; - } - } - - Snapshot snapshot = table.snapshot(currentOffset.snapshotId()); - - validateCurrentSnapshotExists(snapshot, currentOffset); - - if (!shouldProcess(snapshot)) { - LOG.debug("Skipping snapshot: {} of table {}", currentOffset.snapshotId(), table.name()); - continue; - } - - Snapshot currentSnapshot = table.snapshot(currentOffset.snapshotId()); - if (currentOffset.snapshotId() == endOffset.snapshotId()) { - endFileIndex = endOffset.position(); - } else { - endFileIndex = addedFilesCount(currentSnapshot); - } - - MicroBatch latestMicroBatch = - MicroBatches.from(currentSnapshot, table.io()) - .caseSensitive(caseSensitive) - .specsById(table.specs()) - .generate( - currentOffset.position(), - endFileIndex, - Long.MAX_VALUE, - currentOffset.shouldScanAllFiles()); - - fileScanTasks.addAll(latestMicroBatch.tasks()); - } while (currentOffset.snapshotId() != endOffset.snapshotId()); - - return fileScanTasks; - } - - private boolean shouldProcess(Snapshot snapshot) { - String op = snapshot.operation(); - switch (op) { - case DataOperations.APPEND: - return true; - case DataOperations.REPLACE: - return false; - case DataOperations.DELETE: - Preconditions.checkState( - skipDelete, - "Cannot process delete snapshot: %s, to ignore deletes, set %s=true", - snapshot.snapshotId(), 
- SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS); - return false; - case DataOperations.OVERWRITE: - Preconditions.checkState( - skipOverwrite, - "Cannot process overwrite snapshot: %s, to ignore overwrites, set %s=true", - snapshot.snapshotId(), - SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS); - return false; - default: - throw new IllegalStateException( - String.format( - "Cannot process unknown snapshot operation: %s (snapshot id %s)", - op.toLowerCase(Locale.ROOT), snapshot.snapshotId())); - } - } - - private static StreamingOffset determineStartingOffset(Table table, Long fromTimestamp) { - if (table.currentSnapshot() == null) { - return StreamingOffset.START_OFFSET; - } - - if (fromTimestamp == null) { - // match existing behavior and start from the oldest snapshot - return new StreamingOffset(SnapshotUtil.oldestAncestor(table).snapshotId(), 0, false); - } - - if (table.currentSnapshot().timestampMillis() < fromTimestamp) { - return StreamingOffset.START_OFFSET; - } - - try { - Snapshot snapshot = SnapshotUtil.oldestAncestorAfter(table, fromTimestamp); - if (snapshot != null) { - return new StreamingOffset(snapshot.snapshotId(), 0, false); - } else { - return StreamingOffset.START_OFFSET; - } - } catch (IllegalStateException e) { - // could not determine the first snapshot after the timestamp. use the oldest ancestor instead - return new StreamingOffset(SnapshotUtil.oldestAncestor(table).snapshotId(), 0, false); + public void stop() { + if (planner != null) { + planner.stop(); } } - private static int getMaxFiles(ReadLimit readLimit) { - if (readLimit instanceof ReadMaxFiles) { - return ((ReadMaxFiles) readLimit).maxFiles(); - } - - if (readLimit instanceof CompositeReadLimit) { - // We do not expect a CompositeReadLimit to contain a nested CompositeReadLimit. - // In fact, it should only be a composite of two or more of ReadMinRows, ReadMaxRows and - // ReadMaxFiles, with no more than one of each. 
- ReadLimit[] limits = ((CompositeReadLimit) readLimit).getReadLimits(); - for (ReadLimit limit : limits) { - if (limit instanceof ReadMaxFiles) { - return ((ReadMaxFiles) limit).maxFiles(); - } - } - } - - // there is no ReadMaxFiles, so return the default - return Integer.MAX_VALUE; - } - - private static int getMaxRows(ReadLimit readLimit) { - if (readLimit instanceof ReadMaxRows) { - long maxRows = ((ReadMaxRows) readLimit).maxRows(); - return Math.toIntExact(maxRows); - } - - if (readLimit instanceof CompositeReadLimit) { - ReadLimit[] limits = ((CompositeReadLimit) readLimit).getReadLimits(); - for (ReadLimit limit : limits) { - if (limit instanceof ReadMaxRows) { - long maxRows = ((ReadMaxRows) limit).maxRows(); - return Math.toIntExact(maxRows); - } - } + private void initializePlanner(StreamingOffset startOffset, StreamingOffset endOffset) { + if (readConf.asyncMicroBatchPlanningEnabled()) { + this.planner = + new AsyncSparkMicroBatchPlanner( + table, readConf, startOffset, endOffset, lastOffsetForTriggerAvailableNow); + } else { + this.planner = + new SyncSparkMicroBatchPlanner(table, readConf, lastOffsetForTriggerAvailableNow); } - - // there is no ReadMaxRows, so return the default - return Integer.MAX_VALUE; } @Override - @SuppressWarnings("checkstyle:CyclomaticComplexity") public Offset latestOffset(Offset startOffset, ReadLimit limit) { - // calculate end offset get snapshotId from the startOffset Preconditions.checkArgument( startOffset instanceof StreamingOffset, "Invalid start offset: %s is not a StreamingOffset", startOffset); - table.refresh(); - if (table.currentSnapshot() == null) { - return StreamingOffset.START_OFFSET; - } - - if (table.currentSnapshot().timestampMillis() < fromTimestamp) { - return StreamingOffset.START_OFFSET; + if (planner == null) { + initializePlanner((StreamingOffset) startOffset, null); } - // end offset can expand to multiple snapshots - StreamingOffset startingOffset = (StreamingOffset) startOffset; - - if 
(startOffset.equals(StreamingOffset.START_OFFSET)) { - startingOffset = determineStartingOffset(table, fromTimestamp); - } - - Snapshot curSnapshot = table.snapshot(startingOffset.snapshotId()); - validateCurrentSnapshotExists(curSnapshot, startingOffset); - - // Use the pre-computed snapshotId when Trigger.AvailableNow is enabled. - long latestSnapshotId = - lastOffsetForTriggerAvailableNow != null - ? lastOffsetForTriggerAvailableNow.snapshotId() - : table.currentSnapshot().snapshotId(); - - int startPosOfSnapOffset = (int) startingOffset.position(); - - boolean scanAllFiles = startingOffset.shouldScanAllFiles(); - - boolean shouldContinueReading = true; - int curFilesAdded = 0; - long curRecordCount = 0; - int curPos = 0; - - // Note : we produce nextOffset with pos as non-inclusive - while (shouldContinueReading) { - // generate manifest index for the curSnapshot - List> indexedManifests = - MicroBatches.skippedManifestIndexesFromSnapshot( - table.io(), curSnapshot, startPosOfSnapOffset, scanAllFiles); - // this is under assumption we will be able to add at-least 1 file in the new offset - for (int idx = 0; idx < indexedManifests.size() && shouldContinueReading; idx++) { - // be rest assured curPos >= startFileIndex - curPos = indexedManifests.get(idx).second(); - try (CloseableIterable taskIterable = - MicroBatches.openManifestFile( - table.io(), - table.specs(), - caseSensitive, - curSnapshot, - indexedManifests.get(idx).first(), - scanAllFiles); - CloseableIterator taskIter = taskIterable.iterator()) { - while (taskIter.hasNext()) { - FileScanTask task = taskIter.next(); - if (curPos >= startPosOfSnapOffset) { - if ((curFilesAdded + 1) > getMaxFiles(limit)) { - // On including the file it might happen that we might exceed, the configured - // soft limit on the number of records, since this is a soft limit its acceptable. 
- shouldContinueReading = false; - break; - } - - curFilesAdded += 1; - curRecordCount += task.file().recordCount(); - - if (curRecordCount >= getMaxRows(limit)) { - // we included the file, so increment the number of files - // read in the current snapshot. - ++curPos; - shouldContinueReading = false; - break; - } - } - ++curPos; - } - } catch (IOException ioe) { - LOG.warn("Failed to close task iterable", ioe); - } - } - // if the currentSnapShot was also the latestSnapshot then break - if (curSnapshot.snapshotId() == latestSnapshotId) { - break; - } - - // if everything was OK and we consumed complete snapshot then move to next snapshot - if (shouldContinueReading) { - Snapshot nextValid = nextValidSnapshot(curSnapshot); - if (nextValid == null) { - // nextValid implies all the remaining snapshots should be skipped. - break; - } - // we found the next available snapshot, continue from there. - curSnapshot = nextValid; - startPosOfSnapOffset = -1; - // if anyhow we are moving to next snapshot we should only scan addedFiles - scanAllFiles = false; - } - } - - StreamingOffset latestStreamingOffset = - new StreamingOffset(curSnapshot.snapshotId(), curPos, scanAllFiles); - - // if no new data arrived, then return null. - return latestStreamingOffset.equals(startingOffset) ? null : latestStreamingOffset; - } - - /** - * Get the next snapshot skiping over rewrite and delete snapshots. - * - * @param curSnapshot the current snapshot - * @return the next valid snapshot (not a rewrite or delete snapshot), returns null if all - * remaining snapshots should be skipped. 
- */ - private Snapshot nextValidSnapshot(Snapshot curSnapshot) { - Snapshot nextSnapshot = SnapshotUtil.snapshotAfter(table, curSnapshot.snapshotId()); - // skip over rewrite and delete snapshots - while (!shouldProcess(nextSnapshot)) { - LOG.debug("Skipping snapshot: {} of table {}", nextSnapshot.snapshotId(), table.name()); - // if the currentSnapShot was also the mostRecentSnapshot then break - if (nextSnapshot.snapshotId() == table.currentSnapshot().snapshotId()) { - return null; - } - nextSnapshot = SnapshotUtil.snapshotAfter(table, nextSnapshot.snapshotId()); - } - return nextSnapshot; - } - - private long addedFilesCount(Snapshot snapshot) { - long addedFilesCount = - PropertyUtil.propertyAsLong(snapshot.summary(), SnapshotSummary.ADDED_FILES_PROP, -1); - // If snapshotSummary doesn't have SnapshotSummary.ADDED_FILES_PROP, - // iterate through addedFiles iterator to find addedFilesCount. - return addedFilesCount == -1 - ? Iterables.size( - SnapshotChanges.builderFor(table).snapshot(snapshot).build().addedDataFiles()) - : addedFilesCount; - } - - private void validateCurrentSnapshotExists(Snapshot snapshot, StreamingOffset currentOffset) { - if (snapshot == null) { - throw new IllegalStateException( - String.format( - Locale.ROOT, - "Cannot load current offset at snapshot %d, the snapshot was expired or removed", - currentOffset.snapshotId())); - } + return planner.latestOffset((StreamingOffset) startOffset, limit); } @Override @@ -553,6 +255,11 @@ public void prepareForTriggerAvailableNow() { (StreamingOffset) latestOffset(initialOffset, ReadLimit.allAvailable()); LOG.info("lastOffset for Trigger.AvailableNow is {}", lastOffsetForTriggerAvailableNow.json()); + + if (planner != null) { + planner.stop(); + planner = null; + } } private static class InitialOffsetStore { @@ -576,7 +283,7 @@ public StreamingOffset initialOffset() { } table.refresh(); - StreamingOffset offset = determineStartingOffset(table, fromTimestamp); + StreamingOffset offset = 
MicroBatchUtils.determineStartingOffset(table, fromTimestamp); OutputFile outputFile = io.newOutputFile(initialOffsetLocation); writeOffset(offset, outputFile); diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java index a1cb31bd3720..f0a58fc42107 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java @@ -117,6 +117,7 @@ class SparkPositionDeltaWrite implements DeltaWrite, RequiresDistributionAndOrde private final String branch; private final Map extraSnapshotMetadata; private final SparkWriteRequirements writeRequirements; + private final int sortOrderId; private final Context context; private final Map writeProperties; @@ -143,6 +144,7 @@ class SparkPositionDeltaWrite implements DeltaWrite, RequiresDistributionAndOrde this.branch = writeConf.branch(); this.extraSnapshotMetadata = writeConf.extraSnapshotMetadata(); this.writeRequirements = writeConf.positionDeltaRequirements(command); + this.sortOrderId = writeConf.outputSortOrderId(writeRequirements); this.context = new Context(dataSchema, writeConf, info, writeRequirements); this.writeProperties = writeConf.writeProperties(); @@ -203,7 +205,8 @@ public DeltaWriterFactory createBatchWriterFactory(PhysicalWriteInfo info) { broadcastRewritableDeletes(), command, context, - writeProperties); + writeProperties, + sortOrderId); } private Broadcast> broadcastRewritableDeletes() { @@ -413,18 +416,21 @@ private static class PositionDeltaWriteFactory implements DeltaWriterFactory { private final Command command; private final Context context; private final Map writeProperties; + private final int sortOrderId; PositionDeltaWriteFactory( Broadcast

      tableBroadcast, Broadcast> rewritableDeletesBroadcast, Command command, Context context, - Map writeProperties) { + Map writeProperties, + int sortOrderId) { this.tableBroadcast = tableBroadcast; this.rewritableDeletesBroadcast = rewritableDeletesBroadcast; this.command = command; this.context = context; this.writeProperties = writeProperties; + this.sortOrderId = sortOrderId; } @Override @@ -451,6 +457,7 @@ public DeltaWriter createWriter(int partitionId, long taskId) { .deleteFileFormat(context.deleteFileFormat()) .positionDeleteSparkType(context.deleteSparkType()) .writeProperties(writeProperties) + .dataSortOrder(table.sortOrders().get(sortOrderId)) .build(); if (command == DELETE) { diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java index 1ee9e9b08074..9e3c9a7e69e6 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.util.Map; +import java.util.Objects; import java.util.Set; import org.apache.iceberg.BaseMetadataTable; import org.apache.iceberg.BaseTable; @@ -57,6 +58,7 @@ import org.apache.iceberg.spark.CommitMetadata; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.iceberg.spark.SparkSQLProperties; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkTableUtil; import org.apache.iceberg.spark.SparkUtil; @@ -375,11 +377,31 @@ public boolean canDeleteWhere(Predicate[] predicates) { } } - return canDeleteUsingMetadata(deleteExpr); + return canDeleteUsingMetadata(deleteExpr, scanBranchForDelete()); + } + + // Resolves the branch to scan during canDeleteWhere so it matches the branch deleteWhere + // will commit to. 
Falls back to main when WAP is configured but the WAP branch does not + // exist yet, since this is a read scan. + private String scanBranchForDelete() { + if (branch != null) { + return branch; + } + + if (!SparkTableUtil.wapEnabled(table())) { + return null; + } + + String wapBranch = sparkSession().conf().get(SparkSQLProperties.WAP_BRANCH, null); + if (wapBranch != null && table().refs().containsKey(wapBranch)) { + return wapBranch; + } + + return null; } // a metadata delete is possible iff matching files can be deleted entirely - private boolean canDeleteUsingMetadata(Expression deleteExpr) { + private boolean canDeleteUsingMetadata(Expression deleteExpr, String scanBranch) { boolean caseSensitive = SparkUtil.caseSensitive(sparkSession()); if (ExpressionUtil.selectsPartitions(deleteExpr, table(), caseSensitive)) { @@ -394,14 +416,14 @@ private boolean canDeleteUsingMetadata(Expression deleteExpr) { .includeColumnStats() .ignoreResiduals(); - if (branch != null) { - scan = scan.useRef(branch); + if (scanBranch != null) { + scan = scan.useRef(scanBranch); } try (CloseableIterable tasks = scan.planFiles()) { Map evaluators = Maps.newHashMap(); StrictMetricsEvaluator metricsEvaluator = - new StrictMetricsEvaluator(SnapshotUtil.schemaFor(table(), branch), deleteExpr); + new StrictMetricsEvaluator(SnapshotUtil.schemaFor(table(), scanBranch), deleteExpr); return Iterables.all( tasks, @@ -438,12 +460,13 @@ public void deleteWhere(Predicate[] predicates) { .set("spark.app.id", sparkSession().sparkContext().applicationId()) .deleteFromRowFilter(deleteExpr); + String writeBranch = branch; if (SparkTableUtil.wapEnabled(table())) { - branch = SparkTableUtil.determineWriteBranch(sparkSession(), branch); + writeBranch = SparkTableUtil.determineWriteBranch(sparkSession(), branch); } - if (branch != null) { - deleteFiles.toBranch(branch); + if (writeBranch != null) { + deleteFiles.toBranch(writeBranch); } if (!CommitMetadata.commitProperties().isEmpty()) { @@ -466,15 +489,16 
@@ public boolean equals(Object other) { return false; } - // use only name in order to correctly invalidate Spark cache SparkTable that = (SparkTable) other; - return icebergTable.name().equals(that.icebergTable.name()); + return icebergTable.name().equals(that.icebergTable.name()) + && Objects.equals(table().uuid(), that.table().uuid()) + && Objects.equals(snapshotId, that.snapshotId) + && Objects.equals(branch, that.branch); } @Override public int hashCode() { - // use only name in order to correctly invalidate Spark cache - return icebergTable.name().hashCode(); + return Objects.hash(icebergTable.name(), table().uuid(), snapshotId, branch); } private static CaseInsensitiveStringMap addSnapshotId( diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkVariantShreddingAnalyzer.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkVariantShreddingAnalyzer.java new file mode 100644 index 000000000000..2c08c662c9da --- /dev/null +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkVariantShreddingAnalyzer.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.source; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.List; +import org.apache.iceberg.parquet.VariantShreddingAnalyzer; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.variants.VariantMetadata; +import org.apache.iceberg.variants.VariantValue; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.VariantVal; + +/** + * Spark-specific implementation that extracts variant values from {@link InternalRow} instances. + */ +class SparkVariantShreddingAnalyzer extends VariantShreddingAnalyzer { + + SparkVariantShreddingAnalyzer() {} + + @Override + protected int resolveColumnIndex(StructType sparkSchema, String columnName) { + try { + return sparkSchema.fieldIndex(columnName); + } catch (IllegalArgumentException e) { + return -1; + } + } + + @Override + protected List extractVariantValues( + List bufferedRows, int variantFieldIndex) { + List values = Lists.newArrayList(); + + for (InternalRow row : bufferedRows) { + if (!row.isNullAt(variantFieldIndex)) { + VariantVal variantVal = row.getVariant(variantFieldIndex); + if (variantVal != null) { + VariantValue variantValue = + VariantValue.from( + VariantMetadata.from( + ByteBuffer.wrap(variantVal.getMetadata()).order(ByteOrder.LITTLE_ENDIAN)), + ByteBuffer.wrap(variantVal.getValue()).order(ByteOrder.LITTLE_ENDIAN)); + values.add(variantValue); + } + } + } + + return values; + } +} diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java index 5f81689f41ed..c73a37ba3426 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java @@ -212,6 +212,7 @@ private WriterFactory createWriterFactory() { // 
broadcast the table metadata as the writer factory will be sent to executors Broadcast
      tableBroadcast = sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); + int sortOrderId = writeConf.outputSortOrderId(writeRequirements); return new WriterFactory( tableBroadcast, queryId, @@ -221,7 +222,8 @@ private WriterFactory createWriterFactory() { writeSchema, dsSchema, useFanoutWriter, - writeProperties); + writeProperties, + sortOrderId); } private void commitOperation(SnapshotUpdate operation, String description) { @@ -696,6 +698,7 @@ private static class WriterFactory implements DataWriterFactory, StreamingDataWr private final boolean useFanoutWriter; private final String queryId; private final Map writeProperties; + private final int sortOrderId; protected WriterFactory( Broadcast
      tableBroadcast, @@ -706,7 +709,8 @@ protected WriterFactory( Schema writeSchema, StructType dsSchema, boolean useFanoutWriter, - Map writeProperties) { + Map writeProperties, + int sortOrderId) { this.tableBroadcast = tableBroadcast; this.format = format; this.outputSpecId = outputSpecId; @@ -716,6 +720,7 @@ protected WriterFactory( this.useFanoutWriter = useFanoutWriter; this.queryId = queryId; this.writeProperties = writeProperties; + this.sortOrderId = sortOrderId; } @Override @@ -740,6 +745,7 @@ public DataWriter createWriter(int partitionId, long taskId, long e .dataSchema(writeSchema) .dataSparkType(dsSchema) .writeProperties(writeProperties) + .dataSortOrder(table.sortOrders().get(sortOrderId)) .build(); Function rowLineageExtractor = new ExtractRowLineage(writeSchema); diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java index 89af7740d988..c1867433fd8d 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java @@ -27,27 +27,27 @@ import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.spark.SparkFilters; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkUtil; +import org.apache.iceberg.spark.SparkV2Filters; import org.apache.iceberg.spark.SparkWriteConf; import org.apache.iceberg.spark.SparkWriteRequirements; import org.apache.iceberg.types.TypeUtil; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.connector.expressions.filter.Predicate; import org.apache.spark.sql.connector.read.Scan; import org.apache.spark.sql.connector.write.BatchWrite; import 
org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.RowLevelOperation.Command; import org.apache.spark.sql.connector.write.SupportsDynamicOverwrite; -import org.apache.spark.sql.connector.write.SupportsOverwrite; +import org.apache.spark.sql.connector.write.SupportsOverwriteV2; import org.apache.spark.sql.connector.write.Write; import org.apache.spark.sql.connector.write.WriteBuilder; import org.apache.spark.sql.connector.write.streaming.StreamingWrite; -import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.types.LongType$; import org.apache.spark.sql.types.StructType; -class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, SupportsOverwrite { +class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, SupportsOverwriteV2 { private final SparkSession spark; private final Table table; private final SparkWriteConf writeConf; @@ -100,12 +100,12 @@ public WriteBuilder overwriteDynamicPartitions() { } @Override - public WriteBuilder overwrite(Filter[] filters) { + public WriteBuilder overwrite(Predicate[] predicates) { Preconditions.checkState( !overwriteFiles, "Cannot overwrite individual files and using filters"); Preconditions.checkState(rewrittenFileSetId == null, "Cannot overwrite and rewrite"); - this.overwriteExpr = SparkFilters.convert(filters); + this.overwriteExpr = SparkV2Filters.convert(predicates); if (overwriteExpr == Expressions.alwaysTrue() && "dynamic".equals(overwriteMode)) { // use the write option to override truncating the table. use dynamic overwrite instead. 
this.overwriteDynamic = true; diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SyncSparkMicroBatchPlanner.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SyncSparkMicroBatchPlanner.java new file mode 100644 index 000000000000..f1b0029c5432 --- /dev/null +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SyncSparkMicroBatchPlanner.java @@ -0,0 +1,249 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.source; + +import java.io.IOException; +import java.util.List; +import java.util.Locale; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.MicroBatches; +import org.apache.iceberg.MicroBatches.MicroBatch; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.util.Pair; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.spark.sql.connector.read.streaming.ReadLimit; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class SyncSparkMicroBatchPlanner extends BaseSparkMicroBatchPlanner { + private static final Logger LOG = LoggerFactory.getLogger(SyncSparkMicroBatchPlanner.class); + + private final boolean caseSensitive; + private final long fromTimestamp; + private final StreamingOffset lastOffsetForTriggerAvailableNow; + + SyncSparkMicroBatchPlanner( + Table table, SparkReadConf readConf, StreamingOffset lastOffsetForTriggerAvailableNow) { + super(table, readConf); + this.caseSensitive = readConf().caseSensitive(); + this.fromTimestamp = readConf().streamFromTimestamp(); + this.lastOffsetForTriggerAvailableNow = lastOffsetForTriggerAvailableNow; + } + + @Override + public List planFiles(StreamingOffset startOffset, StreamingOffset endOffset) { + List fileScanTasks = Lists.newArrayList(); + StreamingOffset batchStartOffset = + StreamingOffset.START_OFFSET.equals(startOffset) + ? 
MicroBatchUtils.determineStartingOffset(table(), fromTimestamp) + : startOffset; + + StreamingOffset currentOffset = null; + + // [(startOffset : startFileIndex), (endOffset : endFileIndex) ) + do { + long endFileIndex; + if (currentOffset == null) { + currentOffset = batchStartOffset; + } else { + Snapshot snapshotAfter = SnapshotUtil.snapshotAfter(table(), currentOffset.snapshotId()); + // it may happen that we need to read this snapshot partially in case it's equal to + // endOffset. + if (currentOffset.snapshotId() != endOffset.snapshotId()) { + currentOffset = new StreamingOffset(snapshotAfter.snapshotId(), 0L, false); + } else { + currentOffset = endOffset; + } + } + + Snapshot snapshot = table().snapshot(currentOffset.snapshotId()); + + validateCurrentSnapshotExists(snapshot, currentOffset); + + if (!shouldProcess(snapshot)) { + LOG.debug("Skipping snapshot: {} of table {}", currentOffset.snapshotId(), table().name()); + continue; + } + + Snapshot currentSnapshot = table().snapshot(currentOffset.snapshotId()); + if (currentOffset.snapshotId() == endOffset.snapshotId()) { + endFileIndex = endOffset.position(); + } else { + endFileIndex = MicroBatchUtils.addedFilesCount(table(), currentSnapshot); + } + + MicroBatch latestMicroBatch = + MicroBatches.from(currentSnapshot, table().io()) + .caseSensitive(caseSensitive) + .specsById(table().specs()) + .generate( + currentOffset.position(), + endFileIndex, + Long.MAX_VALUE, + currentOffset.shouldScanAllFiles()); + + fileScanTasks.addAll(latestMicroBatch.tasks()); + } while (currentOffset.snapshotId() != endOffset.snapshotId()); + + return fileScanTasks; + } + + @Override + @SuppressWarnings("checkstyle:CyclomaticComplexity") + public StreamingOffset latestOffset(StreamingOffset startOffset, ReadLimit limit) { + table().refresh(); + if (table().currentSnapshot() == null) { + return StreamingOffset.START_OFFSET; + } + + if (table().currentSnapshot().timestampMillis() < fromTimestamp) { + return 
StreamingOffset.START_OFFSET; + } + + // end offset can expand to multiple snapshots + StreamingOffset startingOffset = startOffset; + + if (startOffset.equals(StreamingOffset.START_OFFSET)) { + startingOffset = MicroBatchUtils.determineStartingOffset(table(), fromTimestamp); + } + + Snapshot curSnapshot = table().snapshot(startingOffset.snapshotId()); + validateCurrentSnapshotExists(curSnapshot, startingOffset); + + // Use the pre-computed snapshotId when Trigger.AvailableNow is enabled. + long latestSnapshotId = + lastOffsetForTriggerAvailableNow != null + ? lastOffsetForTriggerAvailableNow.snapshotId() + : table().currentSnapshot().snapshotId(); + + int startPosOfSnapOffset = (int) startingOffset.position(); + + boolean scanAllFiles = startingOffset.shouldScanAllFiles(); + + boolean shouldContinueReading = true; + int curFilesAdded = 0; + long curRecordCount = 0; + int curPos = 0; + + // Extract limits once to avoid repeated calls in tight loop + UnpackedLimits unpackedLimits = new UnpackedLimits(limit); + long maxFiles = unpackedLimits.getMaxFiles(); + long maxRows = unpackedLimits.getMaxRows(); + + // Note : we produce nextOffset with pos as non-inclusive + while (shouldContinueReading) { + // generate manifest index for the curSnapshot + List> indexedManifests = + MicroBatches.skippedManifestIndexesFromSnapshot( + table().io(), curSnapshot, startPosOfSnapOffset, scanAllFiles); + // this is under assumption we will be able to add at-least 1 file in the new offset + for (int idx = 0; idx < indexedManifests.size() && shouldContinueReading; idx++) { + // be rest assured curPos >= startFileIndex + curPos = indexedManifests.get(idx).second(); + try (CloseableIterable taskIterable = + MicroBatches.openManifestFile( + table().io(), + table().specs(), + caseSensitive, + curSnapshot, + indexedManifests.get(idx).first(), + scanAllFiles); + CloseableIterator taskIter = taskIterable.iterator()) { + while (taskIter.hasNext()) { + FileScanTask task = taskIter.next(); + if 
(curPos >= startPosOfSnapOffset) { + if ((curFilesAdded + 1) > maxFiles) { + // On including the file it might happen that we might exceed, the configured + // soft limit on the number of records, since this is a soft limit its acceptable. + shouldContinueReading = false; + break; + } + + curFilesAdded += 1; + curRecordCount += task.file().recordCount(); + + if (curRecordCount >= maxRows) { + // we included the file, so increment the number of files + // read in the current snapshot. + if (curFilesAdded == 1 && curRecordCount > maxRows) { + LOG.warn( + "File {} contains {} records, exceeding maxRecordsPerMicroBatch limit of {}. " + + "This file will be processed entirely to guarantee forward progress. " + + "Consider increasing the limit or writing smaller files to avoid unexpected memory usage.", + task.file().location(), + task.file().recordCount(), + maxRows); + } + ++curPos; + shouldContinueReading = false; + break; + } + } + ++curPos; + } + } catch (IOException ioe) { + LOG.warn("Failed to close task iterable", ioe); + } + } + // if the currentSnapShot was also the latestSnapshot then break + if (curSnapshot.snapshotId() == latestSnapshotId) { + break; + } + + // if everything was OK and we consumed complete snapshot then move to next snapshot + if (shouldContinueReading) { + Snapshot nextValid = nextValidSnapshot(curSnapshot); + if (nextValid == null) { + // nextValid implies all the remaining snapshots should be skipped. + break; + } + // we found the next available snapshot, continue from there. + curSnapshot = nextValid; + startPosOfSnapOffset = -1; + // if anyhow we are moving to next snapshot we should only scan addedFiles + scanAllFiles = false; + } + } + + StreamingOffset latestStreamingOffset = + new StreamingOffset(curSnapshot.snapshotId(), curPos, scanAllFiles); + + // if no new data arrived, then return null. + return latestStreamingOffset.equals(startingOffset) ? 
null : latestStreamingOffset; + } + + @Override + public void stop() {} + + private void validateCurrentSnapshotExists(Snapshot snapshot, StreamingOffset currentOffset) { + if (snapshot == null) { + throw new IllegalStateException( + String.format( + Locale.ROOT, + "Cannot load current offset at snapshot %d, the snapshot was expired or removed", + currentOffset.snapshotId())); + } + } +} diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java index 404ba7284606..9b08d6f7ab1e 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.BeforeEach; @@ -90,6 +91,7 @@ protected static SparkSession initSpark(String serializer) { .master("local[2]") .config("spark.serializer", serializer) .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java index 659507e4c5e3..e28603c0b43a 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import 
org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -73,6 +74,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java index a218f965ea65..eae640528f9e 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java @@ -23,6 +23,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -62,6 +63,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java index 2665d7ba8d3b..4f789d2c5ae9 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import 
org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -63,6 +64,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java new file mode 100644 index 000000000000..ee1f29e56fb3 --- /dev/null +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark; + +import com.codahale.metrics.MetricRegistry; +import java.util.Properties; +import org.apache.spark.SparkConf; +import org.apache.spark.metrics.sink.MetricsServlet; +import org.sparkproject.jetty.servlet.ServletContextHandler; + +/** + * A dummy implementation of {@link MetricsServlet} that does not start a server or report metrics. 
+ * This is used in tests to avoid conflicts with Spark's jetty dependencies. + */ +public class DummyMetricsServlet extends MetricsServlet { + + /** + * Constructor required by Spark's reflection-based instantiation. + * + * @param properties Metrics properties + * @param registry Metric registry + */ + public DummyMetricsServlet(Properties properties, MetricRegistry registry) { + super(properties, registry); + } + + @Override + public ServletContextHandler[] getHandlers(SparkConf conf) { + return new ServletContextHandler[] {}; + } + + @Override + public void start() { + // No-op for tests + } + + @Override + public void stop() { + // No-op for tests + } + + @Override + public void report() { + // No-op for tests + } +} diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestBase.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestBase.java index daf4e29ac075..5e7e1a1f6193 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestBase.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestBase.java @@ -65,6 +65,13 @@ public abstract class TestBase extends SparkTestHelperBase { protected static SparkSession spark = null; protected static JavaSparkContext sparkContext = null; protected static HiveCatalog catalog = null; + // disable Spark UI and use dummy servlet to avoid dependency conflicts with Spark's Jetty version + public static final Map DISABLE_UI = + ImmutableMap.of( + "spark.ui.enabled", + "false", + "spark.metrics.conf.*.sink.servlet.class", + "org.apache.iceberg.spark.DummyMetricsServlet"); @BeforeAll public static void startMetastoreAndSpark() { @@ -79,6 +86,7 @@ public static void startMetastoreAndSpark() { .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .config(DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java index 61aacfa4589d..c5cfbe62b1be 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java @@ -34,6 +34,7 @@ import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL; +import static org.apache.iceberg.TableProperties.PARQUET_SHRED_VARIANTS; import static org.apache.iceberg.TableProperties.UPDATE_DISTRIBUTION_MODE; import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH; @@ -45,6 +46,7 @@ import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.MERGE; import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.UPDATE; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException; import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.time.Duration; @@ -60,6 +62,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.spark.sql.internal.SQLConf; +import org.apache.spark.sql.util.CaseInsensitiveStringMap; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.TestTemplate; @@ -339,6 +342,8 @@ public void testSparkConfOverride() { 
TableProperties.DELETE_PARQUET_COMPRESSION, "snappy"), ImmutableMap.of( + PARQUET_SHRED_VARIANTS, + "false", DELETE_PARQUET_COMPRESSION, "zstd", PARQUET_COMPRESSION, @@ -460,6 +465,8 @@ public void testDataPropsDefaultsAsDeleteProps() { PARQUET_COMPRESSION_LEVEL, "5"), ImmutableMap.of( + PARQUET_SHRED_VARIANTS, + "false", DELETE_PARQUET_COMPRESSION, "zstd", PARQUET_COMPRESSION, @@ -531,6 +538,8 @@ public void testDeleteFileWriteConf() { DELETE_PARQUET_COMPRESSION_LEVEL, "6"), ImmutableMap.of( + PARQUET_SHRED_VARIANTS, + "false", DELETE_PARQUET_COMPRESSION, "zstd", PARQUET_COMPRESSION, @@ -600,6 +609,51 @@ public void testDVWriteConf() { assertThat(writeConf.deleteFileFormat()).isEqualTo(FileFormat.PUFFIN); } + @TestTemplate + public void testSortOrderWriteConf() { + Table table = validationCatalog.loadTable(tableIdent); + + table.replaceSortOrder().asc("id").commit(); + + SparkWriteConf writeConf = + new SparkWriteConf( + spark, table, ImmutableMap.of(SparkWriteOptions.OUTPUT_SORT_ORDER_ID, "1")); + + assertThat(writeConf.outputSortOrderId(SparkWriteRequirements.EMPTY)) + .isEqualTo(table.sortOrder().orderId()); + } + + @TestTemplate + public void testSortOrderWriteConfWithInvalidId() { + Table table = validationCatalog.loadTable(tableIdent); + + table.replaceSortOrder().asc("id").commit(); + + SparkWriteConf writeConfForUnknownSortOrder = + new SparkWriteConf( + spark, table, ImmutableMap.of(SparkWriteOptions.OUTPUT_SORT_ORDER_ID, "999")); + + assertThatIllegalArgumentException() + .isThrownBy( + () -> writeConfForUnknownSortOrder.outputSortOrderId(SparkWriteRequirements.EMPTY)) + .withMessage( + "Cannot use output sort order id 999 because the table does not contain a sort order with that id"); + } + + @TestTemplate + public void testSortOrderWriteConfWithNoOption() { + Table table = validationCatalog.loadTable(tableIdent); + + table.replaceSortOrder().asc("id").commit(); + + SparkWriteConf writeConfNoOption = new SparkWriteConf(spark, table, ImmutableMap.of()); 
+ + assertThat(writeConfNoOption.outputSortOrderId(writeConfNoOption.writeRequirements())) + .isEqualTo(table.sortOrder().orderId()); + + assertThat(writeConfNoOption.outputSortOrderId(SparkWriteRequirements.EMPTY)).isEqualTo(0); + } + private void testWriteProperties(List> propertiesSuite) { withSQLConf( propertiesSuite.get(0), @@ -640,4 +694,81 @@ private void checkMode(DistributionMode expectedMode, SparkWriteConf writeConf) assertThat(writeConf.copyOnWriteDistributionMode(MERGE)).isEqualTo(expectedMode); assertThat(writeConf.positionDeltaDistributionMode(MERGE)).isEqualTo(expectedMode); } + + @TestTemplate + public void testShredVariantsDefault() { + Table table = validationCatalog.loadTable(tableIdent); + SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); + assertThat(writeConf.shredVariants()).isFalse(); + } + + @TestTemplate + public void testVariantInferenceBufferSizeDefault() { + Table table = validationCatalog.loadTable(tableIdent); + SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); + assertThat(writeConf.variantInferenceBufferSize()) + .isEqualTo(TableProperties.PARQUET_VARIANT_BUFFER_SIZE_DEFAULT); + } + + @TestTemplate + public void testVariantInferenceBufferSizeTableProperty() { + Table table = validationCatalog.loadTable(tableIdent); + + table.updateProperties().set(TableProperties.PARQUET_VARIANT_BUFFER_SIZE, "500").commit(); + + SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); + assertThat(writeConf.variantInferenceBufferSize()).isEqualTo(500); + } + + @TestTemplate + public void testShredVariantsSessionOverridesTableProperty() { + Table table = validationCatalog.loadTable(tableIdent); + table.updateProperties().set(TableProperties.PARQUET_SHRED_VARIANTS, "false").commit(); + + withSQLConf( + ImmutableMap.of(SparkSQLProperties.SHRED_VARIANTS, "true"), + () -> { + SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); + 
assertThat(writeConf.shredVariants()).isTrue(); + }); + } + + @TestTemplate + public void testShredVariantsWriteOptionOverridesSessionConf() { + withSQLConf( + ImmutableMap.of(SparkSQLProperties.SHRED_VARIANTS, "false"), + () -> { + Table table = validationCatalog.loadTable(tableIdent); + SparkWriteConf writeConf = + new SparkWriteConf( + spark, + table, + new CaseInsensitiveStringMap( + ImmutableMap.of(SparkWriteOptions.SHRED_VARIANTS, "true"))); + assertThat(writeConf.shredVariants()).isTrue(); + }); + } + + @TestTemplate + public void testVariantInferenceBufferSizeSessionConf() { + withSQLConf( + ImmutableMap.of(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "250"), + () -> { + Table table = validationCatalog.loadTable(tableIdent); + SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); + assertThat(writeConf.variantInferenceBufferSize()).isEqualTo(250); + }); + } + + @TestTemplate + public void testWritePropertiesIncludeVariantShredding() { + Table table = validationCatalog.loadTable(tableIdent); + table.updateProperties().set(TableProperties.PARQUET_SHRED_VARIANTS, "true").commit(); + table.updateProperties().set(TableProperties.PARQUET_VARIANT_BUFFER_SIZE, "200").commit(); + + SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); + Map writeProperties = writeConf.writeProperties(); + assertThat(writeProperties).containsEntry(PARQUET_SHRED_VARIANTS, "true"); + assertThat(writeProperties).containsEntry(TableProperties.PARQUET_VARIANT_BUFFER_SIZE, "200"); + } } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java index eb89b0a23274..50afb53e0539 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java @@ -662,7 +662,7 @@ public void 
schemaEvolutionTestWithSparkSQL() throws Exception { "CAST(id AS FLOAT) col1", "CAST(id AS STRING) col2", "CAST(id AS INT) col3") - .registerTempTable("tempdata"); + .createOrReplaceTempView("tempdata"); sql("INSERT INTO TABLE %s SELECT * FROM tempdata", tblName); List expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName); List expectedAfterAddColumn = diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index 1645d0c84e35..38ddefd26a45 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -1517,7 +1517,7 @@ public void testSortMultipleGroups() { } @TestTemplate - public void testSimpleSort() { + public void testSimpleSort() throws IOException { Table table = createTable(20); shouldHaveFiles(table, 20); table.replaceSortOrder().asc("c2").commit(); @@ -1545,10 +1545,11 @@ public void testSimpleSort() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesSortOrderShouldMatchTableSortOrder(table); } @TestTemplate - public void testSortAfterPartitionChange() { + public void testSortAfterPartitionChange() throws IOException { Table table = createTable(20); shouldHaveFiles(table, 20); table.updateSpec().addField(Expressions.bucket("c1", 4)).commit(); @@ -1579,10 +1580,11 @@ public void testSortAfterPartitionChange() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesSortOrderShouldMatchTableSortOrder(table); } @TestTemplate - public void testSortCustomSortOrder() { + public void testSortCustomSortOrder() throws IOException { Table table = createTable(20); shouldHaveLastCommitUnsorted(table, "c2"); shouldHaveFiles(table, 
20); @@ -1608,10 +1610,11 @@ public void testSortCustomSortOrder() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesShouldHaveSortOrderIdMatching(table, SortOrder.unsorted()); } @TestTemplate - public void testSortCustomSortOrderRequiresRepartition() { + public void testSortCustomSortOrderRequiresRepartition() throws IOException { int partitions = 4; Table table = createTable(); writeRecords(20, SCALE, partitions); @@ -1647,10 +1650,40 @@ public void testSortCustomSortOrderRequiresRepartition() { shouldHaveMultipleFiles(table); shouldHaveLastCommitUnsorted(table, "c2"); shouldHaveLastCommitSorted(table, "c3"); + dataFilesShouldHaveSortOrderIdMatching(table, SortOrder.unsorted()); } @TestTemplate - public void testAutoSortShuffleOutput() { + public void testSortPastTableSortOrderGetsAppliedToFiles() throws IOException { + Table table = createTable(1); + + table.replaceSortOrder().asc("c3").commit(); + SortOrder c3SortOrder = table.sortOrder(); + + table.replaceSortOrder().asc("c2").commit(); + + List originalData = currentData(); + + RewriteDataFiles.Result result = + basicRewrite(table) + .sort(SortOrder.builderFor(table.schema()).asc("c3").build()) + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") + .execute(); + + assertThat(result.rewriteResults()).as("Should have 1 fileGroups").hasSize(1); + + table.refresh(); + + List postRewriteData = currentData(); + assertEquals("We shouldn't have changed the data", originalData, postRewriteData); + + shouldHaveSnapshots(table, 2); + shouldHaveACleanCache(table); + dataFilesShouldHaveSortOrderIdMatching(table, c3SortOrder); + } + + @TestTemplate + public void testAutoSortShuffleOutput() throws IOException { Table table = createTable(20); shouldHaveLastCommitUnsorted(table, "c2"); shouldHaveFiles(table, 20); @@ -1685,6 +1718,7 @@ public void testAutoSortShuffleOutput() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); 
shouldHaveLastCommitSorted(table, "c2"); + dataFilesShouldHaveSortOrderIdMatching(table, SortOrder.unsorted()); } @TestTemplate @@ -2095,6 +2129,23 @@ public void testZOrderUDFWithDateType() { assertThat(zorderBytes).isNotNull().isNotEmpty(); } + @TestTemplate + public void testZOrderUDFWithTimestampNTZType() { + SparkZOrderUDF zorderUDF = new SparkZOrderUDF(1, 16, 1024); + Dataset result = + spark + .sql("SELECT timestamp_ntz '2025-01-01 12:00:00' as test_col") + .withColumn( + "zorder_result", + zorderUDF.sortedLexicographically(col("test_col"), DataTypes.TimestampNTZType)); + + assertThat(result.schema().apply("zorder_result").dataType()).isEqualTo(DataTypes.BinaryType); + List rows = result.collectAsList(); + Row row = rows.get(0); + byte[] zorderBytes = row.getAs("zorder_result"); + assertThat(zorderBytes).isNotNull().isNotEmpty(); + } + protected void shouldRewriteDataFilesWithPartitionSpec(Table table, int outputSpecId) { List rewrittenFiles = currentDataFiles(table); assertThat(rewrittenFiles).allMatch(file -> file.specId() == outputSpecId); @@ -2623,4 +2674,17 @@ public boolean matches(RewriteFileGroup argument) { return groupIDs.contains(argument.info().globalIndex()); } } + + private void dataFilesSortOrderShouldMatchTableSortOrder(Table table) throws IOException { + dataFilesShouldHaveSortOrderIdMatching(table, table.sortOrder()); + } + + private void dataFilesShouldHaveSortOrderIdMatching(Table table, SortOrder sortOrder) + throws IOException { + try (CloseableIterable files = table.newScan().planFiles()) { + assertThat(files) + .extracting(fileScanTask -> fileScanTask.file().sortOrderId()) + .containsOnly(sortOrder.orderId()); + } + } } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java index c18e4c053f50..291bb2bca4f5 100644 --- 
a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java @@ -25,6 +25,8 @@ import org.apache.iceberg.data.BaseFormatModelTests; import org.apache.iceberg.data.Record; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.SparkUtil; +import org.apache.iceberg.types.Type; import org.apache.spark.sql.catalyst.InternalRow; public class TestSparkFormatModel extends BaseFormatModelTests { @@ -51,4 +53,9 @@ protected void assertEquals(Schema schema, List expected, List expected = Lists.newArrayListWithCapacity(10); + for (int i = 0; i < 10; i++) { + expected.add(new SimpleRecord(i, "a")); + } + + Dataset df = spark.createDataFrame(expected, SimpleRecord.class); + + df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + Dataset result = spark.read().format("iceberg").load(location.toString()); + + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + assertThat(actual).hasSameSizeAs(expected).isEqualTo(expected); + + try (CloseableIterable fileScanTasks = table.newScan().planFiles()) { + assertThat(fileScanTasks) + .extracting(task -> task.file().sortOrderId()) + .as("All DataFiles are written with the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + } + + @TestTemplate + public void testWriteDataFilesUnsortedTable() throws IOException { + File parent = temp.resolve(format.toString()).toFile(); + File location = new File(parent, "test"); + + HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.unpartitioned(); + Table table = tables.create(SCHEMA, spec, location.toString()); + + List expected = Lists.newArrayList(new SimpleRecord(1, "a")); + Dataset df = spark.createDataFrame(expected, SimpleRecord.class); + + 
df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + try (CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks) + .extracting(task -> task.file().sortOrderId()) + .as("All DataFiles should have unsorted sort order id") + .containsOnly(SortOrder.unsorted().orderId()); + } + } + + @TestTemplate + public void testWriteDataFilesAfterSortOrderChange() throws IOException { + File parent = temp.resolve(format.toString()).toFile(); + File location = new File(parent, "test"); + + HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.unpartitioned(); + Table table = tables.create(SCHEMA, spec, location.toString()); + + List records = Lists.newArrayList(new SimpleRecord(1, "a")); + Dataset df = spark.createDataFrame(records, SimpleRecord.class); + + df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + table.refresh(); + int unsortedId = SortOrder.unsorted().orderId(); + + try (CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks).extracting(task -> task.file().sortOrderId()).containsOnly(unsortedId); + } + + table.replaceSortOrder().asc("id").commit(); + int sortedId = table.sortOrder().orderId(); + + df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + table.refresh(); + + try (CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks) + .extracting(task -> task.file().sortOrderId()) + .as("Should contain both unsorted and sorted files") + .containsOnly(unsortedId, sortedId); + } + } + public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType option) { File parent = 
temp.resolve(format.toString()).toFile(); File location = new File(parent, "test"); diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java index 8ccea303d0c1..de6a5e59029c 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java @@ -50,6 +50,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkValueConverter; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; @@ -88,6 +89,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); ImmutableMap config = ImmutableMap.of( diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java index b8adfaa0ff97..b3106951eaea 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java @@ -76,6 +76,7 @@ import org.apache.iceberg.spark.ParquetBatchReadConf; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkStructLike; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.VortexBatchReadConf; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.SparkParquetWriters; @@ -139,6 +140,7 @@ public static void startMetastoreAndSpark() 
{ .config("spark.ui.liveUpdate.period", 0) .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java index d22ecb02d483..cb2f866fab10 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java @@ -64,6 +64,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkValueConverter; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.PropertyUtil; import org.apache.spark.sql.Dataset; @@ -182,6 +183,7 @@ public static void startMetastoreAndSpark() { SparkSession.builder() .master("local[2]") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java index d14b1a52cf82..e3934faa60ce 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java @@ -20,7 +20,9 @@ import static org.assertj.core.api.Assertions.assertThat; +import org.apache.iceberg.HistoryEntry; import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Table; import org.apache.iceberg.spark.CatalogTestBase; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; import org.apache.spark.sql.connector.catalog.CatalogManager; @@ -56,4 +58,58 @@ public void testTableEquality() throws NoSuchTableException { assertThat(table1).as("References must be different").isNotSameAs(table2); assertThat(table1).as("Tables must be equivalent").isEqualTo(table2); } + + @TestTemplate + public void testTableInequalityWithDifferentSnapshots() throws NoSuchTableException { + sql("INSERT INTO %s VALUES (1, 'a')", tableName); + sql("INSERT INTO %s VALUES (2, 'b')", tableName); + + CatalogManager catalogManager = spark.sessionState().catalogManager(); + TableCatalog catalog = (TableCatalog) catalogManager.catalog(catalogName); + Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name()); + SparkTable table = (SparkTable) catalog.loadTable(identifier); + + Table icebergTable = validationCatalog.loadTable(tableIdent); + long[] snapshotIds = + icebergTable.history().stream().mapToLong(HistoryEntry::snapshotId).toArray(); + + SparkTable tableAtSnapshot1 = table.copyWithSnapshotId(snapshotIds[0]); + SparkTable tableAtSnapshot2 = table.copyWithSnapshotId(snapshotIds[1]); + + 
assertThat(tableAtSnapshot1) + .as("Tables at different snapshots must not be equal") + .isNotEqualTo(tableAtSnapshot2); + assertThat(tableAtSnapshot1.hashCode()) + .as("Hash codes should differ for different snapshots") + .isNotEqualTo(tableAtSnapshot2.hashCode()); + } + + @TestTemplate + public void testTableInequalityWithDifferentBranches() throws NoSuchTableException { + sql("INSERT INTO %s VALUES (1, 'a')", tableName); + + CatalogManager catalogManager = spark.sessionState().catalogManager(); + TableCatalog catalog = (TableCatalog) catalogManager.catalog(catalogName); + Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name()); + + Table icebergTable = validationCatalog.loadTable(tableIdent); + icebergTable + .manageSnapshots() + .createBranch("testBranch", icebergTable.currentSnapshot().snapshotId()) + .commit(); + + // reload after branch creation so the table sees the new ref + SparkTable table = (SparkTable) catalog.loadTable(identifier); + table.table().refresh(); + + SparkTable tableOnMain = table.copyWithBranch("main"); + SparkTable tableOnBranch = table.copyWithBranch("testBranch"); + + assertThat(tableOnMain) + .as("Tables on different branches must not be equal") + .isNotEqualTo(tableOnBranch); + assertThat(tableOnMain.hashCode()) + .as("Hash codes should differ for different branches") + .isNotEqualTo(tableOnBranch.hashCode()); + } } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java index 54048bbf218a..ab760010535b 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java @@ -29,11 +29,16 @@ import java.nio.file.Paths; import java.util.List; import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.FileScanTask; 
import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; @@ -69,6 +74,7 @@ public static void startSpark() { .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) .config("spark.sql.shuffle.partitions", 4) + .config(TestBase.DISABLE_UI) .getOrCreate(); } @@ -263,6 +269,50 @@ public void testStreamingWriteCompleteModeWithProjection() throws Exception { } } + @Test + public void testStreamingWriteDataFilesInTableSortOrder() throws Exception { + File parent = temp.resolve("parquet").toFile(); + File location = new File(parent, "test-table"); + File checkpoint = new File(parent, "checkpoint"); + + HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); + SortOrder sortOrder = SortOrder.builderFor(SCHEMA).asc("id").build(); + Table table = tables.create(SCHEMA, spec, sortOrder, ImmutableMap.of(), location.toString()); + + MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("append") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); + + try { + StreamingQuery query = streamWriter.start(); + List batch1 = Lists.newArrayList(1, 2); + send(batch1, inputStream); + query.processAllAvailable(); + query.stop(); + + table.refresh(); + + try 
(CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks) + .extracting(task -> task.file().sortOrderId()) + .as("All DataFiles are written with the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + } finally { + for (StreamingQuery query : spark.streams().active()) { + query.stop(); + } + } + } + @Test public void testStreamingWriteUpdateMode() throws Exception { File parent = temp.resolve("parquet").toFile(); diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java index 80f2c6864051..5f9b460f3707 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java @@ -31,13 +31,17 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.IntStream; import org.apache.iceberg.BaseTable; +import org.apache.iceberg.CatalogProperties; import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; import org.apache.iceberg.DataOperations; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; import org.apache.iceberg.Files; +import org.apache.iceberg.Parameter; import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; import org.apache.iceberg.RewriteFiles; import org.apache.iceberg.Schema; import org.apache.iceberg.Snapshot; @@ -50,15 +54,22 @@ import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import 
org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.CatalogTestBase; +import org.apache.iceberg.spark.SparkCatalogConfig; +import org.apache.iceberg.spark.SparkReadConf; import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.VoidFunction2; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Row; +import org.apache.spark.sql.connector.read.InputPartition; +import org.apache.spark.sql.connector.read.streaming.Offset; import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.streaming.DataStreamWriter; import org.apache.spark.sql.streaming.OutputMode; @@ -73,10 +84,73 @@ @ExtendWith(ParameterizedTestExtension.class) public final class TestStructuredStreamingRead3 extends CatalogTestBase { + @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}, async = {3}") + public static Object[][] parameters() { + return new Object[][] { + { + SparkCatalogConfig.HIVE.catalogName(), + SparkCatalogConfig.HIVE.implementation(), + SparkCatalogConfig.HIVE.properties(), + false + }, + { + SparkCatalogConfig.HIVE.catalogName(), + SparkCatalogConfig.HIVE.implementation(), + SparkCatalogConfig.HIVE.properties(), + true + }, + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties(), + false + }, + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties(), + true + }, + { + SparkCatalogConfig.REST.catalogName(), + SparkCatalogConfig.REST.implementation(), + ImmutableMap.builder() + .putAll(SparkCatalogConfig.REST.properties()) + .put(CatalogProperties.URI, restCatalog.properties().get(CatalogProperties.URI)) + .build(), + false + }, + { + 
SparkCatalogConfig.REST.catalogName(), + SparkCatalogConfig.REST.implementation(), + ImmutableMap.builder() + .putAll(SparkCatalogConfig.REST.properties()) + .put(CatalogProperties.URI, restCatalog.properties().get(CatalogProperties.URI)) + .build(), + true + }, + { + SparkCatalogConfig.SPARK_SESSION.catalogName(), + SparkCatalogConfig.SPARK_SESSION.implementation(), + SparkCatalogConfig.SPARK_SESSION.properties(), + false + }, + { + SparkCatalogConfig.SPARK_SESSION.catalogName(), + SparkCatalogConfig.SPARK_SESSION.implementation(), + SparkCatalogConfig.SPARK_SESSION.properties(), + true + } + }; + } + private Table table; private final AtomicInteger microBatches = new AtomicInteger(); + @Parameter(index = 3) + private Boolean async; + /** * test data to be used by multiple writes each write creates a snapshot and writes a list of * records @@ -197,8 +271,7 @@ public void testReadStreamWithMaxRows1() throws Exception { Trigger.AvailableNow()); // soft limit of 1 is being enforced, the stream is not blocked. 
- StreamingQuery query = - startStream(ImmutableMap.of(SparkReadOptions.STREAMING_MAX_ROWS_PER_MICRO_BATCH, "1")); + StreamingQuery query = startStream(SparkReadOptions.STREAMING_MAX_ROWS_PER_MICRO_BATCH, "1"); // check answer correctness only 1 record read the micro-batch will be stuck List actual = rowsAvailable(query); @@ -258,15 +331,41 @@ public void testReadStreamWithCompositeReadLimit() throws Exception { Trigger.AvailableNow()); } + @TestTemplate + public void testReadStreamWithLowAsyncQueuePreload() throws Exception { + appendDataAsMultipleSnapshots(TEST_DATA_MULTIPLE_SNAPSHOTS); + // Set low preload limits to test async queue behavior - background thread should load + // remaining data + + StreamingQuery query = + startStream( + ImmutableMap.of( + SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT, + "5", + SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT, + "5")); + + List actual = rowsAvailable(query); + assertThat(actual) + .containsExactlyInAnyOrderElementsOf(Iterables.concat(TEST_DATA_MULTIPLE_SNAPSHOTS)); + } + @TestTemplate public void testAvailableNowStreamReadShouldNotHangOrReprocessData() throws Exception { File writerCheckpointFolder = temp.resolve("writer-checkpoint-folder").toFile(); File writerCheckpoint = new File(writerCheckpointFolder, "writer-checkpoint"); File output = temp.resolve("junit").toFile(); + Map options = Maps.newHashMap(); + options.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + options.put(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + DataStreamWriter querySource = spark .readStream() + .options(options) .format("iceberg") .load(tableName) .writeStream() @@ -321,10 +420,17 @@ public void testTriggerAvailableNowDoesNotProcessNewDataWhileRunning() throws Ex long expectedSnapshotId = table.currentSnapshot().snapshotId(); String sinkTable = "availablenow_sink"; + Map options = Maps.newHashMap(); + options.put(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, 
"1"); + options.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + options.put(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + StreamingQuery query = spark .readStream() - .option(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1") + .options(options) .format("iceberg") .load(tableName) .writeStream() @@ -366,6 +472,142 @@ public void testTriggerAvailableNowDoesNotProcessNewDataWhileRunning() throws Ex assertThat(actualResults).containsExactlyInAnyOrderElementsOf(Iterables.concat(expectedData)); } + @TestTemplate + public void testTriggerAvailableNowCapsAsyncPreloadAfterPrepare() { + List> initialData = + List.of(List.of(new SimpleRecord(1, "one")), List.of(new SimpleRecord(2, "two"))); + appendDataAsMultipleSnapshots(initialData); + + table.refresh(); + long expectedCapSnapshotId = table.currentSnapshot().snapshotId(); + + SparkMicroBatchStream stream = + new SparkMicroBatchStream( + JavaSparkContext.fromSparkContext(spark.sparkContext()), + table, + table::io, + new SparkReadConf( + spark, + table, + ImmutableMap.of( + SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, + async.toString(), + SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, + "1", + SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, + "1", + SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT, + "10", + SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT, + "10")), + table.schema(), + temp.resolve("available-now-cap-checkpoint").toString()); + + try { + stream.prepareForTriggerAvailableNow(); + + appendData(List.of(new SimpleRecord(3, "three"))); + + Offset startOffset = stream.initialOffset(); + Offset firstEndOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + assertThat(firstEndOffset).isNotNull(); + stream.planInputPartitions(startOffset, firstEndOffset); + + Offset secondEndOffset = stream.latestOffset(firstEndOffset, stream.getDefaultReadLimit()); + assertThat(secondEndOffset).isNotNull(); + 
stream.planInputPartitions(firstEndOffset, secondEndOffset); + + assertThat(stream.latestOffset(secondEndOffset, stream.getDefaultReadLimit())).isNull(); + assertThat(((StreamingOffset) secondEndOffset).snapshotId()).isEqualTo(expectedCapSnapshotId); + } finally { + stream.stop(); + } + } + + @TestTemplate + public void testLatestOffsetReturnsNullAfterFinalBatchIsConsumed() throws Exception { + appendDataAsMultipleSnapshots(TEST_DATA_MULTIPLE_SNAPSHOTS); + + table.refresh(); + int expectedBatchCount; + try (CloseableIterable tasks = table.newScan().planFiles()) { + expectedBatchCount = Iterables.size(tasks); + } + + SparkMicroBatchStream stream = + newMicroBatchStream( + ImmutableMap.of(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1"), + "drain-to-null-checkpoint"); + + try { + int plannedBatchCount = 0; + Offset startOffset = stream.initialOffset(); + Offset endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + while (endOffset != null) { + InputPartition[] partitions = stream.planInputPartitions(startOffset, endOffset); + assertThat(partitions).isNotEmpty(); + plannedBatchCount += 1; + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + } + + assertThat(endOffset).isNull(); + assertThat(plannedBatchCount).isEqualTo(expectedBatchCount); + } finally { + stream.stop(); + } + } + + @TestTemplate + public void testPlanInputPartitionsIsIdempotentForSameOffsets() { + appendDataAsMultipleSnapshots(TEST_DATA_MULTIPLE_SNAPSHOTS); + + SparkMicroBatchStream stream = + newMicroBatchStream( + ImmutableMap.of(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1"), + "idempotent-plan-files-checkpoint"); + + try { + Offset startOffset = stream.initialOffset(); + Offset endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + + assertThat(endOffset).isNotNull(); + + InputPartition[] firstPartitions = stream.planInputPartitions(startOffset, endOffset); + InputPartition[] 
secondPartitions = stream.planInputPartitions(startOffset, endOffset); + + List firstFileLocations = Lists.newArrayList(); + for (InputPartition partition : firstPartitions) { + SparkInputPartition sparkInputPartition = (SparkInputPartition) partition; + for (FileScanTask task : sparkInputPartition.taskGroup().tasks()) { + firstFileLocations.add(task.file().location()); + } + } + + List secondFileLocations = Lists.newArrayList(); + for (InputPartition partition : secondPartitions) { + SparkInputPartition sparkInputPartition = (SparkInputPartition) partition; + for (FileScanTask task : sparkInputPartition.taskGroup().tasks()) { + secondFileLocations.add(task.file().location()); + } + } + + assertThat(firstFileLocations).containsExactlyInAnyOrderElementsOf(secondFileLocations); + + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + while (endOffset != null) { + assertThat(stream.planInputPartitions(startOffset, endOffset)).isNotEmpty(); + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + } + + assertThat(endOffset).isNull(); + } finally { + stream.stop(); + } + } + @TestTemplate public void testReadStreamOnIcebergThenAddData() throws Exception { List> expected = TEST_DATA_MULTIPLE_SNAPSHOTS; @@ -433,6 +675,8 @@ public void testReadingStreamFromFutureTimetsamp() throws Exception { // Data appended after the timestamp should appear appendData(data); + // Allow async background thread to refresh, else test sometimes fails + Thread.sleep(50); actual = rowsAvailable(query); assertThat(actual).containsExactlyInAnyOrderElementsOf(data); } @@ -885,13 +1129,18 @@ private void appendData(List data, String format) { private static final String MEMORY_TABLE = "_stream_view_mem"; private StreamingQuery startStream(Map options) throws TimeoutException { + Map allOptions = Maps.newHashMap(options); + allOptions.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, 
async.toString()); + if (async) { + allOptions.putIfAbsent(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } return spark .readStream() - .options(options) + .options(allOptions) .format("iceberg") .load(tableName) .writeStream() - .options(options) + .options(allOptions) .format("memory") .queryName(MEMORY_TABLE) .outputMode(OutputMode.Append()) @@ -916,11 +1165,17 @@ private void assertMicroBatchRecordSizes( private void assertMicroBatchRecordSizes( Map options, List expectedMicroBatchRecordSize, Trigger trigger) throws TimeoutException { - Dataset ds = spark.readStream().options(options).format("iceberg").load(tableName); + Map allOptions = Maps.newHashMap(options); + allOptions.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + allOptions.putIfAbsent(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + + Dataset ds = spark.readStream().options(allOptions).format("iceberg").load(tableName); List syncList = Collections.synchronizedList(Lists.newArrayList()); ds.writeStream() - .options(options) + .options(allOptions) .trigger(trigger) .foreachBatch( (VoidFunction2, Long>) @@ -941,4 +1196,21 @@ private List rowsAvailable(StreamingQuery query) { .as(Encoders.bean(SimpleRecord.class)) .collectAsList(); } + + private SparkMicroBatchStream newMicroBatchStream( + Map options, String checkpointDirName) { + Map allOptions = Maps.newHashMap(options); + allOptions.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + allOptions.putIfAbsent(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + + return new SparkMicroBatchStream( + JavaSparkContext.fromSparkContext(spark.sparkContext()), + table, + table::io, + new SparkReadConf(spark, table, allOptions), + table.schema(), + temp.resolve(checkpointDirName).toString()); + } } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java 
b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java index e2b5d8920e9f..ab2479d61058 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java @@ -42,6 +42,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; @@ -85,6 +86,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestWriteMetricsConfig.sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java index ce0a0f26a096..e1d2b19f890c 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java @@ -63,6 +63,7 @@ public static void startMetastoreAndSpark() { SparkSession.builder() .master("local[2]") .config("spark.sql.iceberg.aggregate_pushdown", "true") + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkVariantRead.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkVariantRead.java index 599bf591e9a4..2d6e919a91ee 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkVariantRead.java +++ 
b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkVariantRead.java @@ -302,6 +302,55 @@ public void testNestedMapVariant(boolean vectorized) { sql("DROP TABLE IF EXISTS %s", mapTable); } + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void testMergeIntoWithVariant(boolean vectorized) { + // Variant columns are not vectorized yet, but MERGE INTO should not crash regardless of the + // vectorization setting. The reader falls back to non-vectorized for variant columns. + String mergeTable = CATALOG + ".default.var_merge"; + sql("DROP TABLE IF EXISTS %s", mergeTable); + sql( + "CREATE TABLE %s (id BIGINT, data VARIANT) USING iceberg " + + "TBLPROPERTIES ('format-version'='3')", + mergeTable); + setVectorization(mergeTable, vectorized); + + sql( + "INSERT INTO %s VALUES " + + "(1, parse_json('{\"name\":\"alice\",\"age\":30}')), " + + "(2, parse_json('{\"name\":\"bob\",\"age\":25}'))", + mergeTable); + + sql( + "MERGE INTO %s AS target " + + "USING (SELECT 1 AS id, parse_json('{\"name\":\"alice\",\"age\":31}') AS data) AS source " + + "ON target.id = source.id " + + "WHEN MATCHED THEN UPDATE SET target.data = source.data " + + "WHEN NOT MATCHED THEN INSERT *", + mergeTable); + + List rows = spark.table(mergeTable).select("id", "data").orderBy("id").collectAsList(); + + assertThat(rows).hasSize(2); + assertThat(rows.get(0).getLong(0)).isEqualTo(1L); + Variant v1 = + new Variant( + ((VariantVal) rows.get(0).get(1)).getValue(), + ((VariantVal) rows.get(0).get(1)).getMetadata()); + assertThat(v1.getFieldByKey("name").getString()).describedAs("v1.name").isEqualTo("alice"); + assertThat(v1.getFieldByKey("age").getLong()).describedAs("v1.age").isEqualTo(31L); + + assertThat(rows.get(1).getLong(0)).isEqualTo(2L); + Variant v2 = + new Variant( + ((VariantVal) rows.get(1).get(1)).getValue(), + ((VariantVal) rows.get(1).get(1)).getMetadata()); + assertThat(v2.getFieldByKey("name").getString()).describedAs("v2.name").isEqualTo("bob"); + 
assertThat(v2.getFieldByKey("age").getLong()).describedAs("v2.age").isEqualTo(25L); + + sql("DROP TABLE IF EXISTS %s", mergeTable); + } + private void setVectorization(boolean on) { sql( "ALTER TABLE %s SET TBLPROPERTIES ('read.parquet.vectorization.enabled'='%s')", diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestTableEncryption.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestTableEncryption.java index a38506d621f9..3b36b7bb0a25 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestTableEncryption.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestTableEncryption.java @@ -56,6 +56,7 @@ import org.apache.iceberg.spark.SparkCatalogConfig; import org.apache.iceberg.types.Types; import org.apache.parquet.crypto.ParquetCryptoRuntimeException; +import org.apache.spark.SparkException; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.TestTemplate; @@ -248,14 +249,28 @@ public void testMetadataTamperproofing() throws IOException { public void testKeyDelete() { assertThatThrownBy( () -> sql("ALTER TABLE %s UNSET TBLPROPERTIES (`encryption.key-id`)", tableName)) - .hasMessageContaining("Cannot remove key in encrypted table"); + .isInstanceOf(SparkException.class) + .hasMessage("Unsupported table change: Cannot remove key ID from an encrypted table"); } @TestTemplate public void testKeyAlter() { assertThatThrownBy( () -> sql("ALTER TABLE %s SET TBLPROPERTIES ('encryption.key-id'='abcd')", tableName)) - .hasMessageContaining("Cannot modify key in encrypted table"); + .isInstanceOf(SparkException.class) + .hasMessage("Unsupported table change: Cannot modify key ID of an encrypted table"); + } + + @TestTemplate + public void testReplaceKeyChange() { + // Replacing a table with a different encryption key is disallowed + assertThatThrownBy( + () -> + sql( + "REPLACE TABLE %s (id bigint) USING iceberg TBLPROPERTIES 
('encryption.key-id'='%s')", + tableName, UnitestKMS.MASTER_KEY_NAME2)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot modify key ID of an encrypted table"); } @TestTemplate diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/variant/TestVariantShredding.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/variant/TestVariantShredding.java new file mode 100644 index 000000000000..8cdcf22e5817 --- /dev/null +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/variant/TestVariantShredding.java @@ -0,0 +1,1101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.variant; + +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.parquet.schema.Types.optional; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.math.BigDecimal; +import java.net.InetAddress; +import java.util.List; +import java.util.Map; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.spark.CatalogTestBase; +import org.apache.iceberg.spark.SparkCatalogConfig; +import org.apache.iceberg.spark.SparkSQLProperties; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.variants.Variant; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; +import org.apache.spark.sql.internal.SQLConf; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; + +public class TestVariantShredding extends CatalogTestBase { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", 
Types.IntegerType.get()), + Types.NestedField.optional(2, "address", Types.VariantType.get())); + + private static final Schema SCHEMA2 = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "address", Types.VariantType.get()), + Types.NestedField.optional(3, "metadata", Types.VariantType.get())); + + @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") + protected static Object[][] parameters() { + return new Object[][] { + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties() + }, + }; + } + + @BeforeAll + public static void startMetastoreAndSpark() { + // First call parent to initialize metastore and spark with local[2] + CatalogTestBase.startMetastoreAndSpark(); + + // Now stop and recreate spark with local[1] to write all rows to a single file + if (spark != null) { + spark.stop(); + } + + spark = + SparkSession.builder() + .master("local[1]") // Use one thread to write the rows to a single parquet file + .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .config(DISABLE_UI) + .enableHiveSupport() + .getOrCreate(); + + sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); + } + + @BeforeEach + public void before() { + super.before(); + validationCatalog.createTable( + tableIdent, SCHEMA, null, Map.of(TableProperties.FORMAT_VERSION, "3")); + } + + @AfterEach + public void after() { + spark.conf().unset(SparkSQLProperties.SHRED_VARIANTS); + spark.conf().unset(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE); + validationCatalog.dropTable(tableIdent, true); + } + + @TestTemplate + public void testVariantShreddingDisabled() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "false"); + + String values = "(1, parse_json('{\"city\": \"NYC\", \"zip\": 10001}')), (2, null)"; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType address = variant("address", 2, Type.Repetition.OPTIONAL); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testExcludingNullValue() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"name": "Alice", "age": 30, "dummy": null}')),\ + (2, parse_json('{"name": "Bob", "age": 25}')),\ + (3, parse_json('{"name": "Charlie", "age": 35}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType 
expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testInconsistentType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"age": "25"}')),\ + (2, parse_json('{"age": 30}')),\ + (3, parse_json('{"age": "35"}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + List rows = + sql("SELECT variant_get(address, '$.age', 'int') FROM %s WHERE id = 2", tableName); + assertThat(rows).hasSize(1); + assertThat(rows.get(0)[0]).isEqualTo(30); + } + + @TestTemplate + public void testPrimitiveType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = "(1, parse_json('123')), (2, parse_json('456')), (3, parse_json('789'))"; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType address = + variant( + "address", + 2, + Type.Repetition.REQUIRED, + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(16, true))); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testPrimitiveDecimalType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + "(1, parse_json('123.56')), (2, parse_json('\"abc\"')), (3, parse_json('12.56'))"; + sql("INSERT INTO %s VALUES %s", tableName, values); + + 
GroupType address = + variant( + "address", + 2, + Type.Repetition.REQUIRED, + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.decimalType(2, 5))); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testBooleanType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"active": true}')),\ + (2, parse_json('{"active": false}')),\ + (3, parse_json('{"active": true}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType active = field("active", shreddedPrimitive(PrimitiveType.PrimitiveTypeName.BOOLEAN)); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(active)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testDecimalTypeWithInconsistentScales() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"price": 123.456789}')),\ + (2, parse_json('{"price": 678.90}')),\ + (3, parse_json('{"price": 999.99}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType price = + field( + "price", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.decimalType(6, 9))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(price)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testDecimalTypeWithConsistentScales() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + 
(1, parse_json('{"price": 123.45}')),\ + (2, parse_json('{"price": 678.90}')),\ + (3, parse_json('{"price": 999.99}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType price = + field( + "price", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.decimalType(2, 5))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(price)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testArrayType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('["java", "scala", "python"]')),\ + (2, parse_json('["rust", "go"]')),\ + (3, parse_json('["javascript"]'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType arr = + list( + element( + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType()))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, arr); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testNestedArrayType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"tags": ["java", "scala", "python"]}')),\ + (2, parse_json('{"tags": ["rust", "go"]}')),\ + (3, parse_json('{"tags": ["javascript"]}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType tags = + field( + "tags", + list( + element( + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, + LogicalTypeAnnotation.stringType())))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(tags)); + MessageType expectedSchema = 
parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testNestedObjectType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"location": {"city": "Seattle", "zip": 98101}, "tags": ["java", "scala", "python"]}')),\ + (2, parse_json('{"location": {"city": "Portland", "zip": 97201}}')),\ + (3, parse_json('{"location": {"city": "NYC", "zip": 10001}}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType city = + field( + "city", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType zip = + field( + "zip", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(32, true))); + GroupType location = field("location", objectFields(city, zip)); + GroupType tags = + field( + "tags", + list( + element( + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, + LogicalTypeAnnotation.stringType())))); + + GroupType address = + variant("address", 2, Type.Repetition.REQUIRED, objectFields(location, tags)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testLazyInitializationWithBufferedRows() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "5"); + + String values = + """ + (1, parse_json('{"name": "Alice", "age": 30}')),\ + (2, parse_json('{"name": "Bob", "age": 25}')),\ + (3, parse_json('{"name": "Charlie", "age": 35}')),\ + (4, parse_json('{"name": "David", "age": 28}')),\ + (5, parse_json('{"name": "Eve", "age": 32}')),\ + (6, parse_json('{"name": "Frank", "age": 40}')),\ + (7, parse_json('{"name": "Grace", "age": 
27}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + long rowCount = spark.read().format("iceberg").load(tableName).count(); + assertThat(rowCount).isEqualTo(7); + } + + @TestTemplate + public void testMultipleRowGroups() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + int numRows = 1000; + StringBuilder valuesBuilder = new StringBuilder(); + for (int i = 1; i <= numRows; i++) { + if (i > 1) { + valuesBuilder.append(", "); + } + valuesBuilder.append( + String.format("(%d, parse_json('{\"name\": \"User%d\", \"age\": %d}'))", i, i, 20 + i)); + } + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 1024); + sql("INSERT INTO %s VALUES %s", tableName, valuesBuilder.toString()); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + long rowCount = 
spark.read().format("iceberg").load(tableName).count(); + assertThat(rowCount).isEqualTo(numRows); + } + + @TestTemplate + public void testColumnIndexTruncateLength() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + int customTruncateLength = 10; + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, "parquet.columnindex.truncate.length", customTruncateLength); + + StringBuilder valuesBuilder = new StringBuilder(); + for (int i = 1; i <= 10; i++) { + if (i > 1) { + valuesBuilder.append(", "); + } + String longValue = "A".repeat(20); + valuesBuilder.append( + String.format( + "(%d, parse_json('{\"description\": \"%s\", \"id\": %d}'))", i, longValue, i)); + } + sql("INSERT INTO %s VALUES %s", tableName, valuesBuilder.toString()); + + GroupType description = + field( + "description", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType id = + field( + "id", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = + variant("address", 2, Type.Repetition.REQUIRED, objectFields(description, id)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + long rowCount = spark.read().format("iceberg").load(tableName).count(); + assertThat(rowCount).isEqualTo(10); + } + + @TestTemplate + public void testIntegerFamilyPromotion() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + // Mix of INT8, INT16, INT32, INT64 - should promote to INT64 + String values = + """ + (1, parse_json('{"value": 10}')),\ + (2, parse_json('{"value": 1000}')),\ + (3, parse_json('{"value": 100000}')),\ + (4, parse_json('{"value": 10000000000}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); 
+ + GroupType value = + field( + "value", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT64, LogicalTypeAnnotation.intType(64, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(value)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testDecimalFamilyPromotion() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + // Test that they get promoted to the most capable decimal type observed + String values = + """ + (1, parse_json('{"value": 1.5}')),\ + (2, parse_json('{"value": 123.456789}')),\ + (3, parse_json('{"value": 123456789123456.789}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType value = + field( + "value", + optional(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) + .length(16) + .as(LogicalTypeAnnotation.decimalType(6, 21)) + .named("typed_value")); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(value)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testDataRoundTripWithShredding() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"name": "Alice", "age": 30}')),\ + (2, parse_json('{"name": "Bob", "age": 25}')),\ + (3, parse_json('{"name": "Charlie", "age": 35}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = 
variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + // Verify that we can read the data back correctly + List rows = + sql( + "SELECT id, variant_get(address, '$.name', 'string')," + + " variant_get(address, '$.age', 'int')" + + " FROM %s ORDER BY id", + tableName); + assertThat(rows).hasSize(3); + assertThat(rows.get(0)[0]).isEqualTo(1); + assertThat(rows.get(0)[1]).isEqualTo("Alice"); + assertThat(rows.get(0)[2]).isEqualTo(30); + assertThat(rows.get(1)[0]).isEqualTo(2); + assertThat(rows.get(1)[1]).isEqualTo("Bob"); + assertThat(rows.get(1)[2]).isEqualTo(25); + assertThat(rows.get(2)[0]).isEqualTo(3); + assertThat(rows.get(2)[1]).isEqualTo("Charlie"); + assertThat(rows.get(2)[2]).isEqualTo(35); + } + + @TestTemplate + public void testMultipleVariantsWithShredding() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + // Recreate table with SCHEMA2 (address + metadata variant columns) + validationCatalog.dropTable(tableIdent, true); + validationCatalog.createTable( + tableIdent, SCHEMA2, null, Map.of(TableProperties.FORMAT_VERSION, "3")); + + String values = + """ + (1, parse_json('{"city": "NYC"}'), parse_json('{"source": "web"}')),\ + (2, parse_json('{"city": "LA"}'), parse_json('{"source": "app"}')),\ + (3, parse_json('{"city": "SF"}'), parse_json('{"source": "api"}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType city = + field( + "city", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(city)); + + GroupType source = + field( + "source", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType metadata = variant("metadata", 3, 
Type.Repetition.REQUIRED, objectFields(source)); + MessageType expectedSchema = parquetSchema(address, metadata); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testVariantWithNullValues() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('null')),\ + (2, parse_json('null')),\ + (3, parse_json('null'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType address = variant("address", 2, Type.Repetition.REQUIRED); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testArrayOfNullElementsWithShredding() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + sql( + "INSERT INTO %s VALUES (1, parse_json('[null, null, null]')), " + + "(2, parse_json('[null]'))", + tableName); + + // Array elements are all null, element type is null, falls back to unshredded + GroupType address = variant("address", 2, Type.Repetition.REQUIRED); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testMixedNullAndNonNullVariantValues() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"name": "Alice", "age": 30}')),\ + (2, null),\ + (3, parse_json('{"name": "Charlie", "age": 35}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, 
LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.OPTIONAL, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + long rowCount = spark.read().format("iceberg").load(tableName).count(); + assertThat(rowCount).isEqualTo(3); + } + + @TestTemplate + public void testWriteOptionOverridesSessionConfig() throws IOException, NoSuchTableException { + // Disable shredding at session level + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "false"); + + // Enable shredding via per-write option + String query = + "SELECT 1 as id, parse_json('{\"name\": \"Alice\", \"age\": 30}') as address" + + " UNION ALL SELECT 2, parse_json('{\"name\": \"Bob\", \"age\": 25}')" + + " UNION ALL SELECT 3, parse_json('{\"name\": \"Charlie\", \"age\": 35}')"; + spark.sql(query).writeTo(tableName).option("shred-variants", "true").append(); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testInfrequentFieldPruning() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "11"); + + StringBuilder valuesBuilder = new StringBuilder(); + for (int i = 1; i <= 11; i++) { + if (i > 1) { + valuesBuilder.append(", "); + } + if (i == 1) { + // Only the first row has rare_field + valuesBuilder.append( + 
String.format( + "(%d, parse_json('{\"name\": \"User%d\", \"rare_field\": \"rare\"}'))", i, i)); + } else { + valuesBuilder.append(String.format("(%d, parse_json('{\"name\": \"User%d\"}'))", i, i)); + } + } + sql("INSERT INTO %s VALUES %s", tableName, valuesBuilder.toString()); + + // rare_field appears in 1/11 rows, should be pruned + // name appears in 11/11 rows and should be kept + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testMixedTypeTieBreaking() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "10"); + + StringBuilder valuesBuilder = new StringBuilder(); + for (int i = 1; i <= 10; i++) { + if (i > 1) { + valuesBuilder.append(", "); + } + if (i <= 5) { + valuesBuilder.append(String.format("(%d, parse_json('{\"val\": %d}'))", i, i)); + } else { + valuesBuilder.append(String.format("(%d, parse_json('{\"val\": \"text%d\"}'))", i, i)); + } + } + sql("INSERT INTO %s VALUES %s", tableName, valuesBuilder.toString()); + + // 5 ints + 5 strings is a tie so STRING wins (higher TIE_BREAK_PRIORITY) + GroupType val = + field( + "val", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(val)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + // Verify data round-trips correctly + List rows = + sql("SELECT id, variant_get(address, '$.val', 'string') FROM 
%s ORDER BY id", tableName); + assertThat(rows).hasSize(10); + assertThat(rows.get(0)[1]).isEqualTo("1"); + assertThat(rows.get(5)[1]).isEqualTo("text6"); + } + + @TestTemplate + public void testFieldOnlyAfterBuffer() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + String values = + """ + (1, parse_json('{"name": "Alice"}')),\ + (2, parse_json('{"name": "Bob"}')),\ + (3, parse_json('{"name": "Charlie"}')),\ + (4, parse_json('{"name": "David", "score": 95}')),\ + (5, parse_json('{"name": "Eve", "score": 88}')),\ + (6, parse_json('{"name": "Frank", "score": 72}')),\ + (7, parse_json('{"name": "Grace", "score": 91}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + // Schema is determined from buffer (rows 1-3) which only has "name". + // "score" is not shredded + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + // Verify all data round-trips despite "score" not being shredded + List rows = + sql( + "SELECT id, variant_get(address, '$.name', 'string')," + + " variant_get(address, '$.score', 'int')" + + " FROM %s ORDER BY id", + tableName); + assertThat(rows).hasSize(7); + assertThat(rows.get(0)[1]).isEqualTo("Alice"); + assertThat(rows.get(0)[2]).isNull(); + assertThat(rows.get(3)[1]).isEqualTo("David"); + assertThat(rows.get(3)[2]).isEqualTo(95); + assertThat(rows.get(6)[1]).isEqualTo("Grace"); + assertThat(rows.get(6)[2]).isEqualTo(91); + } + + @TestTemplate + public void testCrossFileDifferentShreddedType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + 
spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + // File 1: "score" is always integer → shredded as INT8 + String batch1 = + """ + (1, parse_json('{"score": 95}')),\ + (2, parse_json('{"score": 88}')),\ + (3, parse_json('{"score": 72}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, batch1); + + // Verify file 1 schema: score shredded as INT8 + Table table = validationCatalog.loadTable(tableIdent); + GroupType scoreInt = + field( + "score", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + MessageType expectedSchema1 = + parquetSchema(variant("address", 2, Type.Repetition.REQUIRED, objectFields(scoreInt))); + verifyParquetSchema(table, expectedSchema1); + + // File 2: "score" is always string → shredded as STRING + String batch2 = + """ + (4, parse_json('{"score": "high"}')),\ + (5, parse_json('{"score": "medium"}')),\ + (6, parse_json('{"score": "low"}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, batch2); + + // Query across both files, reader must handle different shredded types + List rows = + sql("SELECT id, variant_get(address, '$.score', 'string') FROM %s ORDER BY id", tableName); + assertThat(rows).hasSize(6); + assertThat(rows.get(0)[1]).isEqualTo("95"); + assertThat(rows.get(1)[1]).isEqualTo("88"); + assertThat(rows.get(3)[1]).isEqualTo("high"); + assertThat(rows.get(5)[1]).isEqualTo("low"); + } + + @TestTemplate + public void testAllNullVariantColumn() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + sql("INSERT INTO %s VALUES (1, null), (2, null), (3, null)", tableName); + + // All variant values are SQL NULL, so no shredding should occur + Table table = validationCatalog.loadTable(tableIdent); + MessageType expectedSchema = parquetSchema(variant("address", 2, Type.Repetition.OPTIONAL)); + verifyParquetSchema(table, expectedSchema); + + List rows = sql("SELECT id, address FROM %s ORDER BY id", tableName); + 
assertThat(rows).hasSize(3); + assertThat(rows.get(0)[1]).isNull(); + assertThat(rows.get(1)[1]).isNull(); + assertThat(rows.get(2)[1]).isNull(); + } + + @TestTemplate + public void testBufferSizeOne() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "1"); + + sql( + """ + INSERT INTO %s VALUES + (1, parse_json('{"name": "Alice", "age": 30}')), + (2, parse_json('{"name": "Bob", "age": 25}')), + (3, parse_json('{"name": "Charlie", "age": 35}')) + """, + tableName); + + // Schema inferred from first row only, should still shred name and age + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + List rows = + sql("SELECT id, variant_get(address, '$.name', 'string') FROM %s ORDER BY id", tableName); + assertThat(rows).hasSize(3); + assertThat(rows.get(0)[1]).isEqualTo("Alice"); + assertThat(rows.get(2)[1]).isEqualTo("Charlie"); + } + + @TestTemplate + public void testDecimalFallbackAfterBuffer() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + // Buffer: scale=2, 3 integer digits -> DECIMAL(5,2) + // Row 4: precision overflow -> fallback to value field + // Row 5: scale overflow -> fallback to value field + // Row 6: fits typed column, scale widened from 1 to 2 via setScale + String values = + """ + (1, parse_json('{"val": 123.45}')),\ + (2, parse_json('{"val": 678.90}')),\ + (3, 
parse_json('{"val": 999.99}')),\ + (4, parse_json('{"val": 123456.78}')),\ + (5, parse_json('{"val": 1.2345}')),\ + (6, parse_json('{"val": 12.3}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + List rows = + sql( + "SELECT id, variant_get(address, '$.val', 'decimal(10,4)') FROM %s ORDER BY id", + tableName); + assertThat(rows).hasSize(6); + assertThat(rows.get(0)[1]).isEqualTo(new BigDecimal("123.4500")); + assertThat(rows.get(3)[1]).isEqualTo(new BigDecimal("123456.7800")); + assertThat(rows.get(4)[1]).isEqualTo(new BigDecimal("1.2345")); + assertThat(rows.get(5)[1]).isEqualTo(new BigDecimal("12.3000")); + } + + private void verifyParquetSchema(Table table, MessageType expectedSchema) throws IOException { + try (CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks).isNotEmpty(); + + for (FileScanTask task : tasks) { + String path = task.file().location(); + + HadoopInputFile inputFile = HadoopInputFile.fromPath(new Path(path), new Configuration()); + + try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) { + MessageType actualSchema = reader.getFileMetaData().getSchema(); + assertThat(actualSchema).isEqualTo(expectedSchema); + } + } + } + } + + private static MessageType parquetSchema(Type... 
variantTypes) { + return org.apache.parquet.schema.Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.INT32) + .id(1) + .named("id") + .addFields(variantTypes) + .named("table"); + } + + private static GroupType variant(String name, int fieldId, Type.Repetition repetition) { + return org.apache.parquet.schema.Types.buildGroup(repetition) + .id(fieldId) + .as(LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION)) + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .named("metadata") + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .named("value") + .named(name); + } + + private static GroupType variant( + String name, int fieldId, Type.Repetition repetition, Type shreddedType) { + checkShreddedType(shreddedType); + return org.apache.parquet.schema.Types.buildGroup(repetition) + .id(fieldId) + .as(LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION)) + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .named("metadata") + .optional(PrimitiveType.PrimitiveTypeName.BINARY) + .named("value") + .addField(shreddedType) + .named(name); + } + + private static Type shreddedPrimitive(PrimitiveType.PrimitiveTypeName primitive) { + return optional(primitive).named("typed_value"); + } + + private static Type shreddedPrimitive( + PrimitiveType.PrimitiveTypeName primitive, LogicalTypeAnnotation annotation) { + return optional(primitive).as(annotation).named("typed_value"); + } + + private static GroupType objectFields(GroupType... 
fields) { + for (GroupType fieldType : fields) { + checkField(fieldType); + } + + return org.apache.parquet.schema.Types.buildGroup(Type.Repetition.OPTIONAL) + .addFields(fields) + .named("typed_value"); + } + + private static GroupType field(String name, Type shreddedType) { + checkShreddedType(shreddedType); + return org.apache.parquet.schema.Types.buildGroup(Type.Repetition.REQUIRED) + .optional(PrimitiveType.PrimitiveTypeName.BINARY) + .named("value") + .addField(shreddedType) + .named(name); + } + + private static GroupType element(Type shreddedType) { + return field("element", shreddedType); + } + + private static GroupType list(GroupType elementType) { + return org.apache.parquet.schema.Types.optionalList().element(elementType).named("typed_value"); + } + + private static void checkShreddedType(Type shreddedType) { + Preconditions.checkArgument( + shreddedType.getName().equals("typed_value"), + "Invalid shredded type name: %s should be typed_value", + shreddedType.getName()); + Preconditions.checkArgument( + shreddedType.isRepetition(Type.Repetition.OPTIONAL), + "Invalid shredded type repetition: %s should be OPTIONAL", + shreddedType.getRepetition()); + } + + private static void checkField(GroupType fieldType) { + Preconditions.checkArgument( + fieldType.isRepetition(Type.Repetition.REQUIRED), + "Invalid field type repetition: %s should be REQUIRED", + fieldType.getRepetition()); + } +} diff --git a/spark/v4.1/build.gradle b/spark/v4.1/build.gradle index 8e74a79cba49..11d10469fce6 100644 --- a/spark/v4.1/build.gradle +++ b/spark/v4.1/build.gradle @@ -112,14 +112,10 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') - testImplementation (project(path: ':iceberg-open-api', configuration: 
'testFixturesRuntimeElements')) { - transitive = false - } - testImplementation libs.sqlite.jdbc + testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts') + testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.awaitility testImplementation(testFixtures(project(':iceberg-parquet'))) - // runtime dependencies for running REST Catalog based integration test - testRuntimeOnly libs.jetty.servlet } test { @@ -180,13 +176,7 @@ project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVer testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') testImplementation project(path: ":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}", configuration: 'testArtifacts') - testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - // runtime dependencies for running REST Catalog based integration test - testRuntimeOnly libs.jetty.servlet - testRuntimeOnly libs.sqlite.jdbc - + testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.avro.avro testImplementation libs.parquet.hadoop testImplementation libs.awaitility @@ -278,11 +268,7 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio integrationRuntimeOnly project(':iceberg-hive-metastore') // runtime dependencies for running REST Catalog based integration test integrationRuntimeOnly project(path: ':iceberg-core', configuration: 'testArtifacts') - integrationRuntimeOnly (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - integrationRuntimeOnly libs.jetty.servlet - integrationRuntimeOnly libs.sqlite.jdbc + integrationRuntimeOnly (project(path: 
':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) // Not allowed on our classpath, only the runtime jar is allowed integrationCompileOnly project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVersion}") @@ -345,5 +331,7 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java index a8b226ea1e37..a468a1cc8717 100644 --- a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java +++ b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java @@ -31,6 +31,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions; import org.apache.iceberg.util.ThreadPools; import org.apache.spark.sql.SparkSession; @@ -205,7 +206,7 @@ private void initDataAndDVs() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java index eeea81634596..bc34bf33e35e 100644 --- 
a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java +++ b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java @@ -155,7 +155,7 @@ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java index c50a3fd406d7..0df55de933cf 100644 --- a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java +++ b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java @@ -215,7 +215,7 @@ public void localPlanningViaDistributedScanWithoutFilterWithStats(Blackhole blac private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.driver.maxResultSize", "8G") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) diff --git a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java index 8a8097834ef8..fd3eab4d9df6 100644 --- a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java +++ b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java @@ -198,7 +198,7 @@ 
private void initDataAndDeletes() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java index d917eae5eb0f..caa23625fc44 100644 --- a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java +++ b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java @@ -138,7 +138,7 @@ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java index bfcb5af235d3..ef4f0090292c 100644 --- a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java +++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java @@ -49,7 +49,12 @@ public class TestExtendedParser { @BeforeAll public static void before() { - spark = SparkSession.builder().master("local").appName("TestExtendedParser").getOrCreate(); + spark = + SparkSession.builder() + .master("local") + 
.appName("TestExtendedParser") + .config(TestBase.DISABLE_UI) + .getOrCreate(); } @AfterAll diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java index 6de4e0d6461e..f766fbb79aff 100644 --- a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java +++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java @@ -71,6 +71,7 @@ public static void startMetastoreAndSpark() { .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") .config( SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), String.valueOf(RANDOM.nextBoolean())) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java index 296564e20d4a..2db56fa844bb 100644 --- a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java +++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java @@ -647,4 +647,17 @@ public void deleteAfterDroppingPartitionAndSourceColumn() { sql("DELETE FROM %s WHERE id >= 1", tableName); assertThat(sql("SELECT * FROM %s WHERE id >= 1", tableName)).isEmpty(); } + + @TestTemplate + public void testReaddColumnAfterIdentityPartitionDrop() { + createTable("id bigint NOT NULL, category string, data string", "category"); + + sql("ALTER TABLE %s DROP PARTITION FIELD category", tableName); + sql("ALTER TABLE %s DROP COLUMN category", tableName); + sql("ALTER TABLE %s ADD COLUMN category string", tableName); + + sql("INSERT INTO %s (id, category, data) VALUES (1, 'books', 
'a')", tableName); + assertThat(sql("SELECT id, category, data FROM %s ORDER BY id", tableName)) + .containsExactly(row(1L, "books", "a")); + } } diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java index 7e0f6207edc9..9e9d751691be 100644 --- a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java +++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java @@ -1424,6 +1424,62 @@ public void testDeleteToCustomWapBranchWithoutWhereClause() throws NoSuchTableEx }); } + @TestTemplate + public void testDeleteToWapBranchCanDeleteWhereScansWapBranch() throws NoSuchTableException { + assumeThat(branch).as("WAP branch only works for table identifier without branch").isNull(); + + createAndInitPartitionedTable(); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'true')", + tableName, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED); + + append(tableName, new Employee(1, "hr")); + + spark.conf().set(SparkSQLProperties.WAP_BRANCH, "wap"); + try { + append(tableName, new Employee(0, "hr"), new Employee(1, "hr"), new Employee(2, "hr")); + + sql("DELETE FROM %s WHERE id = 1", tableName); + + assertThat(sql("SELECT id, dep FROM %s.branch_wap ORDER BY id", tableName)) + .as("DELETE should remove the matching rows from the WAP branch") + .containsExactly(row(0, "hr"), row(2, "hr")); + assertThat(sql("SELECT id, dep FROM %s.branch_main", tableName)) + .as("Main branch must not be modified by a WAP-targeted DELETE") + .containsExactly(row(1, "hr")); + } finally { + spark.conf().unset(SparkSQLProperties.WAP_BRANCH); + } + } + + @TestTemplate + public void testMetadataDeleteToWapBranchCommitsToWapBranch() throws NoSuchTableException { + assumeThat(branch).as("WAP branch only works for table identifier without branch").isNull(); + + 
createAndInitPartitionedTable(); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'true')", + tableName, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED); + + append(tableName, new Employee(1, "hr"), new Employee(5, "eng")); + + spark.conf().set(SparkSQLProperties.WAP_BRANCH, "wap"); + try { + append(tableName, new Employee(0, "hr"), new Employee(2, "eng")); + + sql("DELETE FROM %s WHERE dep = 'hr'", tableName); + + assertThat(sql("SELECT id, dep FROM %s.branch_wap ORDER BY id", tableName)) + .as("Metadata delete should remove the hr partition on the WAP branch") + .containsExactly(row(2, "eng"), row(5, "eng")); + assertThat(sql("SELECT id, dep FROM %s.branch_main ORDER BY id", tableName)) + .as("Metadata delete must not commit to main when WAP is set") + .containsExactly(row(1, "hr"), row(5, "eng")); + } finally { + spark.conf().unset(SparkSQLProperties.WAP_BRANCH); + } + } + @TestTemplate public void testDeleteWithFilterOnNestedColumn() { createAndInitNestedColumnsTable(); diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeSchemaEvolution.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeSchemaEvolution.java index d760d56b7a1d..782321b588a7 100644 --- a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeSchemaEvolution.java +++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeSchemaEvolution.java @@ -19,10 +19,12 @@ package org.apache.iceberg.spark.extensions; import static org.apache.spark.sql.functions.col; +import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assumptions.assumeThat; import java.util.Map; import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.TableProperties; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import 
org.junit.jupiter.api.AfterEach; @@ -260,6 +262,49 @@ public void testMergeWithSchemaEvolutionTypeWidening() { sql("SELECT id, value FROM %s ORDER BY id", selectTarget())); } + @TestTemplate + public void testMergeWithSchemaEvolutionDisabledByTableProperty() { + assumeThat(branch).as("Schema evolution does not work for branches currently").isNull(); + + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"software\" }"); + + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'false')", + tableName, TableProperties.SPARK_WRITE_AUTO_SCHEMA_EVOLUTION); + + createOrReplaceView( + "source", + "id INT, dep STRING, salary INT", + "{ \"id\": 1, \"dep\": \"hr\", \"salary\": 100 }\n" + + "{ \"id\": 3, \"dep\": \"finance\", \"salary\": 300 }"); + + sql( + "MERGE WITH SCHEMA EVOLUTION INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN " + + " UPDATE SET * " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + commitTarget()); + + // Schema should NOT be evolved - 'salary' column should not be added + assertThat(sql("SELECT * FROM %s", selectTarget()).get(0).length) + .as("Table should still have only 2 columns (id, dep)") + .isEqualTo(2); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "hr"), // updated without salary + row(2, "software"), // kept + row(3, "finance")); // new without salary + assertEquals( + "Should have expected rows without schema evolution", + expectedRows, + sql("SELECT id, dep FROM %s ORDER BY id", selectTarget())); + } + @Override protected Map extraTableProperties() { return Map.of(); diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPartitionedWritesToWapBranch.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPartitionedWritesToWapBranch.java index 1db18f3a857d..af065451ab69 100644 --- 
a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPartitionedWritesToWapBranch.java +++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPartitionedWritesToWapBranch.java @@ -69,6 +69,7 @@ public static void startMetastoreAndSpark() { .config("spark.sql.shuffle.partitions", "4") .config("spark.sql.hive.metastorePartitionPruningFallbackOnException", "true") .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRowLevelOperationsWithLineage.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRowLevelOperationsWithLineage.java index 77303685235d..f38178a8e883 100644 --- a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRowLevelOperationsWithLineage.java +++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRowLevelOperationsWithLineage.java @@ -95,6 +95,18 @@ record -> + " fanout = {6}, branch = {7}, planningMode = {8}, formatVersion = {9}") public static Object[][] parameters() { return new Object[][] { + { + "testhadoop", + SparkCatalog.class.getName(), + ImmutableMap.of("type", "hadoop"), + FileFormat.ORC, + false, + WRITE_DISTRIBUTION_MODE_HASH, + true, + null, + LOCAL, + 3 + }, { "testhadoop", SparkCatalog.class.getName(), diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java index 19800c2f4666..3f8b574126ba 100644 --- a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java +++ 
b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java @@ -375,4 +375,54 @@ public void testSnapshotPartitionedV1() throws IOException { } } } + + @TestTemplate + public void testSnapshotWithVariant() throws IOException { + assumeThat(catalogName) + .as("Variant type requires Hive 4 which is not yet supported") + .isNotEqualTo("testhive") + .isNotEqualTo("spark_catalog"); + String location = Files.createTempDirectory(temp, "junit").toFile().toString(); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data variant) USING parquet LOCATION '%s'", + SOURCE_NAME, location); + sql("INSERT INTO TABLE %s VALUES (1, parse_json('{\"key\": 123}'))", SOURCE_NAME); + + Object result = + scalarSql( + "CALL %s.system.snapshot('%s', '%s', properties => map('format-version','3'))", + catalogName, SOURCE_NAME, tableName); + assertThat(result).as("Should have added one file").isEqualTo(1L); + + assertEquals( + "Should have expected rows", + ImmutableList.of(row(1L, 123)), + sql("SELECT id, variant_get(data, '$.key', 'int') FROM %s", tableName)); + } + + @TestTemplate + public void testSnapshotPartitionedWithVariant() throws IOException { + assumeThat(catalogName) + .as("Variant type requires Hive 4 which is not yet supported") + .isNotEqualTo("testhive") + .isNotEqualTo("spark_catalog"); + String location = Files.createTempDirectory(temp, "junit").toFile().toString(); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data variant) USING parquet PARTITIONED BY (id) LOCATION '%s'", + SOURCE_NAME, location); + sql( + "INSERT INTO TABLE %s (id, data) VALUES (1, parse_json('{\"key\": 123}')), (2, parse_json('{\"key\": 456}'))", + SOURCE_NAME); + + Object result = + scalarSql( + "CALL %s.system.snapshot('%s', '%s', properties => map('format-version','3'))", + catalogName, SOURCE_NAME, tableName); + assertThat(result).as("Should have added two files").isEqualTo(2L); + + assertEquals( + "Should have expected rows", + 
ImmutableList.of(row(123, 1L), row(456, 2L)), + sql("SELECT variant_get(data, '$.key', 'int'), id FROM %s ORDER BY id", tableName)); + } } diff --git a/spark/v4.1/spark-runtime/LICENSE b/spark/v4.1/spark-runtime/LICENSE index a67296eb412c..50c91faf8edb 100644 --- a/spark/v4.1/spark-runtime/LICENSE +++ b/spark/v4.1/spark-runtime/LICENSE @@ -219,6 +219,94 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). + +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. 
+ +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). + +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. 
Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles Apache Parquet. Copyright: 2014-2024 The Apache Software Foundation @@ -227,7 +315,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Thrift. +This product bundles Apache Thrift (bundled by Parquet). Copyright: 2006-2017 The Apache Software Foundation. Project URL: https://thrift.apache.org/ @@ -235,7 +323,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product includes code from Daniel Lemire's JavaFastPFOR project. +This product includes code from Daniel Lemire's JavaFastPFOR project (bundled by Parquet). 
Copyright: 2013 Daniel Lemire Project URL: https://github.com/lemire/JavaFastPFOR @@ -243,7 +331,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles fastutil. +This product bundles fastutil (bundled by Parquet). Copyright: 2002-2014 Sebastiano Vigna Project URL: http://fastutil.di.unimi.it/ @@ -251,6 +339,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles Zero-Allocation Hashing (bundled by Parquet). + +Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles Apache ORC. Copyright: 2013 and onwards The Apache Software Foundation. @@ -259,7 +354,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Hive. +This product bundles Apache Hive's Storage API (bundled by ORC). Copyright: 2008-2020 The Apache Software Foundation Project URL: https://hive.apache.org/ @@ -267,11 +362,12 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google protobuf. +This product bundles Google protobuf (bundled by ORC). Copyright: 2008 Google Inc. Project URL: https://developers.google.com/protocol-buffers License: BSD 3-Clause + | Copyright 2008 Google Inc. All rights reserved. 
| | Redistribution and use in source and binary forms, with or without @@ -339,6 +435,387 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles the Mozilla Public Suffix List, distributed by Apache HttpComponents. + +Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ + +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. "Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. "Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. 
"Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. "Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. "Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. 
Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. 
Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. 
Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. 
Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. 
However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. 
The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. 
Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. 
Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. + +-------------------------------------------------------------------------------- + This product bundles Google Error Prone Annotations. Copyright: Copyright 2011-2019 The Error Prone Authors @@ -352,6 +829,7 @@ This product bundles checkerframework checker-qual. Copyright: 2004-2019 the Checker Framework developers Project URL: https://github.com/typetools/checker-framework License: MIT license + | The annotations are licensed under the MIT License. (The text of this | license appears below.) More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -390,20 +868,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Perfmark. 
- -Project URL: https://github.com/perfmark/perfmark -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles org.json. - -Project URL: https://github.com/douglascrockford/JSON-java -License: Public Domain - https://github.com/stleary/JSON-java/blob/master/LICENSE - --------------------------------------------------------------------------------- - This product bundles Apache Arrow. Copyright: 2016-2019 The Apache Software Foundation. @@ -420,47 +884,18 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google FlatBuffers. +This product bundles JCTools (via Netty). -Copyright: 2013-2020 Google Inc. -Project URL: https://google.github.io/flatbuffers/ +Project URL: https://github.com/JCTools/JCTools License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles ThreeTen BP. +This product bundles Google FlatBuffers. -Project URL: https://www.threeten.org/threetenbp -License: BSD 3-Clause -| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| -| All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. 
-| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Copyright: 2013-2020 Google Inc. +Project URL: https://google.github.io/flatbuffers/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -469,6 +904,7 @@ This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. Project URL: https://www.threeten.org/threeten-extra/ License: BSD 3-Clause + | All rights reserved. | | * Redistribution and use in source and binary forms, with or without @@ -540,19 +976,11 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache HttpComponents (client and core). - -Copyright: 1999-2022 The Apache Software Foundation. 
-Project URL: https://hc.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Apache HttpComponents Client. +This product bundles and includes code from Apache HttpComponents (core/client). * retry and error handling logic in ExponentialHttpRequestRetryStrategy.java -Copyright: 1999-2022 The Apache Software Foundation. +Copyright: 1999-2022 The Apache Software Foundation Project URL: https://hc.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 @@ -573,16 +1001,46 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Datasketches. +This product bundles Eclipse Collections. -Project URL: https://datasketches.apache.org -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +Project URL: https://github.com/eclipse-collections/eclipse-collections +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php + +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| * Neither the name of the Eclipse Foundation, Inc. 
nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- -This product bundles Zero-Allocation Hashing. +This product bundles Apache Datasketches. -Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing +Project URL: https://datasketches.apache.org License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -590,70 +1048,37 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles JSpecify. 
- -Project URL: https://github.com/jspecify/jspecify -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Animal Sniffer Annotations. - -License: MIT -| The MIT License -| -| Copyright (c) 2009 codehaus.org. -| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This product bundles Android Annotations. - -Project URL: http://source.android.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Conscrypt (openjdk-uber). 
- -Project URL: https://conscrypt.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud BigQuery Client for Java. - -Project URL: https://github.com/googleapis/java-bigquery -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud Java Client Libraries. - -Project URL: https://github.com/googleapis/google-cloud-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| * Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- This product bundles RoaringBitmap. @@ -661,128 +1086,3 @@ This product bundles RoaringBitmap. Copyright: (c) 2013-... the RoaringBitmap authors Project URL: https://github.com/RoaringBitmap/RoaringBitmap License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Credentials for Java. - -Project URL: https://github.com/aliyun/credentials-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Tea for Java. - -Project URL: https://github.com/aliyun/tea-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google GAX. - -Project URL: https://github.com/googleapis/gax-java -License: BSD 3-Clause -| Copyright 2016, Google Inc. All rights reserved. 
-| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This product bundles Google Gson. - -Project URL: https://github.com/google/gson/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles gRPC. 
- -Project URL: https://github.com/grpc/grpc-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenCensus. - -Project URL: https://github.com/census-instrumentation/opencensus-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OkHttp. - -Project URL: https://github.com/square/okhttp -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Auto Value. - -Project URL: https://github.com/google/auto/tree/master/value -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JaCoCo. - -Project URL: https://github.com/jacoco/jacoco/ -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles OpenTelemetry. - -Project URL: https://opentelemetry.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Kotlin. - -Project URL: https://github.com/JetBrains/kotlin -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JAXB. 
- -Project URL: http://jaxb.java.net/ -License: CDDL 1.1 - https://glassfish.java.net/public/CDDL+GPL_1_1.html - --------------------------------------------------------------------------------- - -This product bundles EMMA runtime. - -Project URL: https://github.com/ehelms/Emma/ -License: Common Public License - v 1.0 - --------------------------------------------------------------------------------- - -This product bundles Google j2objc. - -Project URL: https://github.com/google/j2objc/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 diff --git a/spark/v4.1/spark-runtime/NOTICE b/spark/v4.1/spark-runtime/NOTICE index 68abd73906b1..551ef59f2010 100644 --- a/spark/v4.1/spark-runtime/NOTICE +++ b/spark/v4.1/spark-runtime/NOTICE @@ -66,42 +66,6 @@ This product bundles Airlift Aircompressor with the following in its NOTICE file -------------------------------------------------------------------------------- -This product bundles Google Protobuf with the following in its NOTICE file: -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. 
-| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. - --------------------------------------------------------------------------------- - This product bundles Netty with the following in its NOTICE file: | The Netty Project | ================= @@ -353,6 +317,42 @@ This product bundles Netty with the following in its NOTICE file: -------------------------------------------------------------------------------- +This product bundles Jackson JSON Processor with the following in its NOTICE file: +| # Jackson JSON processor +| +| Jackson is a high-performance, Free/Open Source JSON processing library. +| It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has +| been in development since 2007. +| It is currently developed by a community of developers. 
+| +| ## Copyright +| +| Copyright 2007-, Tatu Saloranta (tatu.saloranta@iki.fi) +| +| ## Licensing +| +| Jackson 2.x core and extension components are licensed under Apache License 2.0 +| To find the details that apply to this artifact see the accompanying LICENSE file. +| +| ## Credits +| +| A list of contributors may be found from CREDITS(-2.x) file, which is included +| in some artifacts (usually source distributions); but is always available +| from the source code management (SCM) system project uses. +| +| ## FastDoubleParser +| +| jackson-core bundles a shaded copy of FastDoubleParser . +| That code is available under an MIT license +| under the following copyright. +| +| Copyright © 2023 Werner Randelshofer, Switzerland. MIT License. +| +| See FastDoubleParser-NOTICE for details of other source code included in FastDoubleParser +| and the licenses and copyrights that apply to that code. + +-------------------------------------------------------------------------------- + This product bundles Project Nessie with the following in its NOTICE file: | Nessie | Copyright 2015-2025 Dremio Corporation @@ -391,69 +391,3 @@ This product bundles Eclipse MicroProfile OpenAPI with the following in its NOTI | PackageCopyrightText: | Arthur De Magalhaes arthurdm@ca.ibm.com | - --------------------------------------------------------------------------------- - -This product bundles gRPC with the following in its NOTICE file: -| Copyright 2014 The gRPC Authors -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. -| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-| See the License for the specific language governing permissions and -| limitations under the License. -| -| ----------------------------------------------------------------------- -| -| This product contains a modified portion of 'OkHttp', an open source -| HTTP & SPDY client for Android and Java applications, which can be obtained -| at: -| -| * LICENSE: -| * okhttp/third_party/okhttp/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/square/okhttp -| * LOCATION_IN_GRPC: -| * okhttp/third_party/okhttp -| -| This product contains a modified portion of 'Envoy', an open source -| cloud-native high-performance edge/middle/service proxy, which can be -| obtained at: -| -| * LICENSE: -| * xds/third_party/envoy/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/envoy/NOTICE -| * HOMEPAGE: -| * https://www.envoyproxy.io -| * LOCATION_IN_GRPC: -| * xds/third_party/envoy -| -| This product contains a modified portion of 'protoc-gen-validate (PGV)', -| an open source protoc plugin to generate polyglot message validators, -| which can be obtained at: -| -| * LICENSE: -| * xds/third_party/protoc-gen-validate/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/protoc-gen-validate/NOTICE -| * HOMEPAGE: -| * https://github.com/envoyproxy/protoc-gen-validate -| * LOCATION_IN_GRPC: -| * xds/third_party/protoc-gen-validate -| -| This product contains a modified portion of 'udpa', -| an open source universal data plane API, which can be obtained at: -| -| * LICENSE: -| * xds/third_party/udpa/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/cncf/udpa -| * LOCATION_IN_GRPC: -| * xds/third_party/udpa diff --git a/spark/v4.1/spark-runtime/baseline-class-uniqueness.lock b/spark/v4.1/spark-runtime/baseline-class-uniqueness.lock index 35cad90d888f..6197975f3900 100644 --- a/spark/v4.1/spark-runtime/baseline-class-uniqueness.lock +++ b/spark/v4.1/spark-runtime/baseline-class-uniqueness.lock @@ -77,21 +77,21 @@ - 
io.netty.buffer.AdaptivePoolingAllocator$SizeClassChunkController - io.netty.buffer.AdaptivePoolingAllocator$SizeClassedChunk - io.netty.buffer.ByteBufUtil - - io.netty.buffer.ByteBufUtil$2 - - io.netty.buffer.ByteBufUtil$HexUtil - - io.netty.buffer.ByteBufUtil$ThreadLocalDirectByteBuf - - io.netty.buffer.ByteBufUtil$ThreadLocalDirectByteBuf$1 - - io.netty.buffer.ByteBufUtil$ThreadLocalUnsafeDirectByteBuf - - io.netty.buffer.ByteBufUtil$ThreadLocalUnsafeDirectByteBuf$1 - io.netty.buffer.CompositeByteBuf - io.netty.buffer.EmptyByteBuf - io.netty.buffer.PoolArena + - io.netty.buffer.PoolArena$DirectArena - io.netty.buffer.PoolThreadCache$FreeOnFinalize - io.netty.buffer.PooledByteBufAllocator + - io.netty.buffer.PooledByteBufAllocator$PoolThreadLocalCache - io.netty.buffer.ReadOnlyAbstractByteBuf - io.netty.buffer.SimpleLeakAwareByteBuf - io.netty.buffer.Unpooled + - io.netty.buffer.UnpooledByteBufAllocator$DecrementingCleanableDirectBuffer + - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledDirectByteBuf + - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeDirectByteBuf - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeNoCleanerDirectByteBuf + - io.netty.buffer.UnpooledByteBufAllocator$UnpooledByteBufAllocatorMetric - io.netty.buffer.UnpooledDirectByteBuf - io.netty.buffer.UnpooledHeapByteBuf - io.netty.buffer.UnpooledUnsafeDirectByteBuf @@ -99,11 +99,19 @@ - io.netty.buffer.UnsafeByteBufUtil [dev.vortex:vortex-jni (classifier=all), io.netty:netty-common] - io.netty.util.AbstractReferenceCounted + - io.netty.util.DefaultAttributeMap - io.netty.util.HashedWheelTimer - io.netty.util.HashedWheelTimer$HashedWheelBucket - io.netty.util.LeakPresenceDetector - io.netty.util.LeakPresenceDetector$LeakCreation - io.netty.util.LeakPresenceDetector$ResourceScope + - io.netty.util.Recycler + - io.netty.util.Recycler$BlockingMessageQueue + - io.netty.util.Recycler$DefaultHandle + - io.netty.util.Recycler$EnhancedHandle + - 
io.netty.util.Recycler$GuardedLocalPool + - io.netty.util.Recycler$LocalPool + - io.netty.util.Recycler$UnguardedLocalPool - io.netty.util.concurrent.AbstractScheduledEventExecutor - io.netty.util.concurrent.GlobalEventExecutor - io.netty.util.concurrent.GlobalEventExecutor$2 @@ -118,11 +126,23 @@ - io.netty.util.concurrent.SingleThreadEventExecutor$4 - io.netty.util.concurrent.SingleThreadEventExecutor$5 - io.netty.util.concurrent.SingleThreadEventExecutor$DefaultThreadProperties + - io.netty.util.internal.Cleaner - io.netty.util.internal.CleanerJava24Linker - io.netty.util.internal.CleanerJava24Linker$CleanableDirectBufferImpl - io.netty.util.internal.CleanerJava25 - io.netty.util.internal.CleanerJava25$CleanableDirectBufferImpl + - io.netty.util.internal.CleanerJava6 + - io.netty.util.internal.CleanerJava6$2 + - io.netty.util.internal.CleanerJava6$CleanableDirectBufferImpl + - io.netty.util.internal.CleanerJava9 + - io.netty.util.internal.CleanerJava9$2 + - io.netty.util.internal.CleanerJava9$CleanableDirectBufferImpl + - io.netty.util.internal.DirectCleaner + - io.netty.util.internal.DirectCleaner$CleanableDirectBufferImpl + - io.netty.util.internal.EmptyArrays - io.netty.util.internal.PlatformDependent + - io.netty.util.internal.PlatformDependent$1 + - io.netty.util.internal.PlatformDependent$1$1 - io.netty.util.internal.PlatformDependent$Mpsc - io.netty.util.internal.PlatformDependent$Mpsc$1 - io.netty.util.internal.PlatformDependent0 diff --git a/spark/v4.1/spark-runtime/runtime-deps.txt b/spark/v4.1/spark-runtime/runtime-deps.txt new file mode 100644 index 000000000000..ec5a5a3785fc --- /dev/null +++ b/spark/v4.1/spark-runtime/runtime-deps.txt @@ -0,0 +1,48 @@ +com.fasterxml.jackson.core:jackson-annotations:2.21 +com.fasterxml.jackson.core:jackson-core:2.15.2 +com.fasterxml.jackson.core:jackson-databind:2.15.2 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.21.3 +com.github.ben-manes.caffeine:caffeine:2.9.3 
+com.google.errorprone:error_prone_annotations:2.41.0 +com.google.flatbuffers:flatbuffers-java:25.2.10 +com.google.guava:failureaccess:1.0.3 +com.google.guava:guava:33.5.0-jre +com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava +com.google.j2objc:j2objc-annotations:3.1 +com.google.protobuf:protobuf-java:4.33.5 +dev.failsafe:failsafe:3.3.2 +dev.vortex:vortex-jni:0.67.0 +dev.vortex:vortex-spark_2.13:0.67.0 +io.airlift:aircompressor:2.0.3 +io.netty:netty-buffer:4.2.12.Final +io.netty:netty-common:4.2.12.Final +org.apache.arrow:arrow-c-data:18.3.0 +org.apache.arrow:arrow-format:18.3.0 +org.apache.arrow:arrow-memory-core:18.3.0 +org.apache.arrow:arrow-memory-netty-buffer-patch:18.3.0 +org.apache.arrow:arrow-memory-netty:18.3.0 +org.apache.arrow:arrow-vector:18.3.0 +org.apache.avro:avro:1.12.1 +org.apache.datasketches:datasketches-java:6.2.0 +org.apache.datasketches:datasketches-memory:3.0.2 +org.apache.httpcomponents.client5:httpclient5:5.6.1 +org.apache.httpcomponents.core5:httpcore5-h2:5.4 +org.apache.httpcomponents.core5:httpcore5:5.4 +org.apache.orc:orc-core:1.9.8 +org.apache.orc:orc-shims:1.9.8 +org.apache.parquet:parquet-avro:1.17.0 +org.apache.parquet:parquet-column:1.17.0 +org.apache.parquet:parquet-common:1.17.0 +org.apache.parquet:parquet-encoding:1.17.0 +org.apache.parquet:parquet-format-structures:1.17.0 +org.apache.parquet:parquet-hadoop:1.17.0 +org.apache.parquet:parquet-jackson:1.17.0 +org.apache.parquet:parquet-variant:1.17.0 +org.checkerframework:checker-qual:3.19.0 +org.eclipse.microprofile.openapi:microprofile-openapi-api:4.1.1 +org.jspecify:jspecify:1.0.0 +org.locationtech.jts:jts-core:1.20.0 +org.projectnessie.nessie:nessie-client:0.107.5 +org.projectnessie.nessie:nessie-model:0.107.5 +org.roaringbitmap:RoaringBitmap:1.6.14 +org.threeten:threeten-extra:1.7.1 diff --git a/spark/v4.1/spark/baseline-class-uniqueness.lock b/spark/v4.1/spark/baseline-class-uniqueness.lock index 4a6e30c63973..72c0c24fb849 100644 --- 
a/spark/v4.1/spark/baseline-class-uniqueness.lock +++ b/spark/v4.1/spark/baseline-class-uniqueness.lock @@ -125,21 +125,21 @@ - io.netty.buffer.AdaptivePoolingAllocator$SizeClassChunkController - io.netty.buffer.AdaptivePoolingAllocator$SizeClassedChunk - io.netty.buffer.ByteBufUtil - - io.netty.buffer.ByteBufUtil$2 - - io.netty.buffer.ByteBufUtil$HexUtil - - io.netty.buffer.ByteBufUtil$ThreadLocalDirectByteBuf - - io.netty.buffer.ByteBufUtil$ThreadLocalDirectByteBuf$1 - - io.netty.buffer.ByteBufUtil$ThreadLocalUnsafeDirectByteBuf - - io.netty.buffer.ByteBufUtil$ThreadLocalUnsafeDirectByteBuf$1 - io.netty.buffer.CompositeByteBuf - io.netty.buffer.EmptyByteBuf - io.netty.buffer.PoolArena + - io.netty.buffer.PoolArena$DirectArena - io.netty.buffer.PoolThreadCache$FreeOnFinalize - io.netty.buffer.PooledByteBufAllocator + - io.netty.buffer.PooledByteBufAllocator$PoolThreadLocalCache - io.netty.buffer.ReadOnlyAbstractByteBuf - io.netty.buffer.SimpleLeakAwareByteBuf - io.netty.buffer.Unpooled + - io.netty.buffer.UnpooledByteBufAllocator$DecrementingCleanableDirectBuffer + - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledDirectByteBuf + - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeDirectByteBuf - io.netty.buffer.UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeNoCleanerDirectByteBuf + - io.netty.buffer.UnpooledByteBufAllocator$UnpooledByteBufAllocatorMetric - io.netty.buffer.UnpooledDirectByteBuf - io.netty.buffer.UnpooledHeapByteBuf - io.netty.buffer.UnpooledUnsafeDirectByteBuf @@ -147,11 +147,19 @@ - io.netty.buffer.UnsafeByteBufUtil [dev.vortex:vortex-jni (classifier=all), io.netty:netty-common] - io.netty.util.AbstractReferenceCounted + - io.netty.util.DefaultAttributeMap - io.netty.util.HashedWheelTimer - io.netty.util.HashedWheelTimer$HashedWheelBucket - io.netty.util.LeakPresenceDetector - io.netty.util.LeakPresenceDetector$LeakCreation - io.netty.util.LeakPresenceDetector$ResourceScope + - io.netty.util.Recycler + - 
io.netty.util.Recycler$BlockingMessageQueue + - io.netty.util.Recycler$DefaultHandle + - io.netty.util.Recycler$EnhancedHandle + - io.netty.util.Recycler$GuardedLocalPool + - io.netty.util.Recycler$LocalPool + - io.netty.util.Recycler$UnguardedLocalPool - io.netty.util.concurrent.AbstractScheduledEventExecutor - io.netty.util.concurrent.GlobalEventExecutor - io.netty.util.concurrent.GlobalEventExecutor$2 @@ -166,11 +174,23 @@ - io.netty.util.concurrent.SingleThreadEventExecutor$4 - io.netty.util.concurrent.SingleThreadEventExecutor$5 - io.netty.util.concurrent.SingleThreadEventExecutor$DefaultThreadProperties + - io.netty.util.internal.Cleaner - io.netty.util.internal.CleanerJava24Linker - io.netty.util.internal.CleanerJava24Linker$CleanableDirectBufferImpl - io.netty.util.internal.CleanerJava25 - io.netty.util.internal.CleanerJava25$CleanableDirectBufferImpl + - io.netty.util.internal.CleanerJava6 + - io.netty.util.internal.CleanerJava6$2 + - io.netty.util.internal.CleanerJava6$CleanableDirectBufferImpl + - io.netty.util.internal.CleanerJava9 + - io.netty.util.internal.CleanerJava9$2 + - io.netty.util.internal.CleanerJava9$CleanableDirectBufferImpl + - io.netty.util.internal.DirectCleaner + - io.netty.util.internal.DirectCleaner$CleanableDirectBufferImpl + - io.netty.util.internal.EmptyArrays - io.netty.util.internal.PlatformDependent + - io.netty.util.internal.PlatformDependent$1 + - io.netty.util.internal.PlatformDependent$1$1 - io.netty.util.internal.PlatformDependent$Mpsc - io.netty.util.internal.PlatformDependent$Mpsc$1 - io.netty.util.internal.PlatformDependent0 diff --git a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java index 231bb7c619f4..3fd84553f033 100644 --- a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java +++ 
b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java @@ -37,6 +37,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.actions.SparkActions; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -179,6 +180,7 @@ private void setupSpark() { .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", catalogWarehouse()) + .config(TestBase.DISABLE_UI) .master("local"); spark = builder.getOrCreate(); } diff --git a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java index 4978961be641..683f6bb46d05 100644 --- a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java +++ b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java @@ -41,6 +41,7 @@ import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.actions.SparkActions; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; @@ -394,6 +395,7 @@ protected void setupSpark() { "spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", getCatalogWarehouse()) + .config(TestBase.DISABLE_UI) .master("local[*]"); spark = builder.getOrCreate(); Configuration sparkHadoopConf = 
spark.sessionState().newHadoopConf(); diff --git a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java index c6794e43c636..3f242ce228ca 100644 --- a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java +++ b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java @@ -49,6 +49,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.util.ContentFileUtil; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.InternalRow; @@ -234,7 +235,7 @@ private String generateDataFilePath() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java index ac74fb5a109c..db5789724056 100644 --- a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java +++ b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java @@ -43,6 +43,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.InternalRow; import 
org.apache.spark.sql.catalyst.analysis.NoSuchTableException; @@ -218,7 +219,7 @@ private String generateDataFilePath() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java index 68c537e34a4a..debe37866ff7 100644 --- a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java +++ b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java @@ -30,6 +30,7 @@ import org.apache.iceberg.UpdateProperties; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; @@ -94,7 +95,7 @@ protected void cleanupFiles() throws IOException { } protected void setupSpark(boolean enableDictionaryEncoding) { - SparkSession.Builder builder = SparkSession.builder().config("spark.ui.enabled", false); + SparkSession.Builder builder = SparkSession.builder().config(TestBase.DISABLE_UI); if (!enableDictionaryEncoding) { builder .config("parquet.dictionary.page.size", "1") diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java index ea400a779235..cb9da3edc678 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java @@ -56,7 
+56,7 @@ public static R withCommitProperties( ExceptionUtil.castAndThrow(e, exClass); return null; } finally { - COMMIT_PROPERTIES.set(ImmutableMap.of()); + COMMIT_PROPERTIES.remove(); } } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java index bc8a966488ee..f1709277525a 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java @@ -24,17 +24,17 @@ import java.util.function.Function; import org.apache.spark.sql.connector.catalog.Identifier; import org.apache.spark.sql.connector.catalog.StagedTable; -import org.apache.spark.sql.connector.catalog.SupportsDelete; +import org.apache.spark.sql.connector.catalog.SupportsDeleteV2; import org.apache.spark.sql.connector.catalog.SupportsRead; import org.apache.spark.sql.connector.catalog.SupportsWrite; import org.apache.spark.sql.connector.catalog.Table; import org.apache.spark.sql.connector.catalog.TableCapability; import org.apache.spark.sql.connector.catalog.TableCatalog; import org.apache.spark.sql.connector.expressions.Transform; +import org.apache.spark.sql.connector.expressions.filter.Predicate; import org.apache.spark.sql.connector.read.ScanBuilder; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.WriteBuilder; -import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.util.CaseInsensitiveStringMap; @@ -58,7 +58,7 @@ * #capabilities()}. 
*/ public class RollbackStagedTable - implements StagedTable, SupportsRead, SupportsWrite, SupportsDelete { + implements StagedTable, SupportsRead, SupportsWrite, SupportsDeleteV2 { private final TableCatalog catalog; private final Identifier ident; private final Table table; @@ -106,8 +106,8 @@ public Set capabilities() { } @Override - public void deleteWhere(Filter[] filters) { - call(SupportsDelete.class, t -> t.deleteWhere(filters)); + public void deleteWhere(Predicate[] predicates) { + call(SupportsDeleteV2.class, t -> t.deleteWhere(predicates)); } @Override diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java index bad31d8d85f4..78d69eeaaf61 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java @@ -35,8 +35,6 @@ public abstract class SparkContentFile implements ContentFile { - private static final FileContent[] FILE_CONTENT_VALUES = FileContent.values(); - private final int fileContentPosition; private final int filePathPosition; private final int fileFormatPosition; @@ -139,7 +137,7 @@ public FileContent content() { if (wrapped.isNullAt(fileContentPosition)) { return null; } - return FILE_CONTENT_VALUES[wrapped.getInt(fileContentPosition)]; + return FileContent.fromId(wrapped.getInt(fileContentPosition)); } @Override diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java index 161f09d53e2c..af549dfd8e7a 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java @@ -114,4 +114,12 @@ private SparkSQLProperties() {} public static final String ASYNC_MICRO_BATCH_PLANNING_ENABLED = 
"spark.sql.iceberg.async-micro-batch-planning-enabled"; public static final boolean ASYNC_MICRO_BATCH_PLANNING_ENABLED_DEFAULT = false; + + // Controls whether to shred variant columns during write operations + public static final String SHRED_VARIANTS = "spark.sql.iceberg.shred-variants"; + + // Controls the buffer size for variant schema inference during writes + // This determines how many rows are buffered before inferring shredded schema + public static final String VARIANT_INFERENCE_BUFFER_SIZE = + "spark.sql.iceberg.variant-inference-buffer-size"; } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java index 04c47f49596d..96499184cab3 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java @@ -329,15 +329,15 @@ private static List listPartition( private static SparkPartition toSparkPartition( CatalogTablePartition partition, CatalogTable table) { Option locationUri = partition.storage().locationUri(); - Option serde = partition.storage().serde(); + Option partitionSerde = partition.storage().serde(); Preconditions.checkArgument(locationUri.nonEmpty(), "Partition URI should be defined"); Preconditions.checkArgument( - serde.nonEmpty() || table.provider().nonEmpty(), "Partition format should be defined"); + partitionSerde.nonEmpty() || table.provider().nonEmpty(), + "Partition format should be defined"); String uri = Util.uriToString(locationUri.get()); - String format = serde.nonEmpty() ? 
serde.get() : table.provider().get(); - + String format = resolveFileFormat(partitionSerde.getOrElse(() -> null), table); Map partitionSpec = JavaConverters.mapAsJavaMapConverter(partition.spec()).asJava(); return new SparkPartition(partitionSpec, uri, format); @@ -682,11 +682,7 @@ private static void importUnpartitionedSparkTable( ExecutorService service) { try { CatalogTable sourceTable = spark.sessionState().catalog().getTableMetadata(sourceTableIdent); - Option format = - sourceTable.storage().serde().nonEmpty() - ? sourceTable.storage().serde() - : sourceTable.provider(); - Preconditions.checkArgument(format.nonEmpty(), "Could not determine table format"); + String format = resolveFileFormat(null, sourceTable); Map partition = Collections.emptyMap(); PartitionSpec spec = PartitionSpec.unpartitioned(); @@ -700,7 +696,7 @@ private static void importUnpartitionedSparkTable( TableMigrationUtil.listPartition( partition, Util.uriToString(sourceTable.location()), - format.get(), + format, spec, conf, metricsConfig, @@ -1143,6 +1139,30 @@ private static boolean wapEnabled(Table table) { Boolean.parseBoolean(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED_DEFAULT)); } + private static String resolveFileFormat(String partitionSerde, CatalogTable table) { + if (partitionSerde != null && isKnownFileFormat(partitionSerde)) { + return partitionSerde; + } + + Option serde = table.storage().serde(); + if (serde.nonEmpty() && isKnownFileFormat(serde.get())) { + return serde.get(); + } + + Preconditions.checkArgument( + table.provider().nonEmpty(), + "Could not determine table format from serde %s and no provider set", + serde.getOrElse(() -> "unknown")); + return table.provider().get(); + } + + private static boolean isKnownFileFormat(String serde) { + String lowerSerde = serde.toLowerCase(Locale.ROOT); + return lowerSerde.contains("parquet") + || lowerSerde.contains("avro") + || lowerSerde.contains("orc"); + } + /** Class representing a table partition. 
*/ public static class SparkPartition implements Serializable { private final Map values; diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java index 2296c076f0c4..80f93427805a 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java @@ -33,6 +33,8 @@ import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL; +import static org.apache.iceberg.TableProperties.PARQUET_SHRED_VARIANTS; +import static org.apache.iceberg.TableProperties.PARQUET_VARIANT_BUFFER_SIZE; import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.DELETE; import java.util.Locale; @@ -504,6 +506,14 @@ private Map dataWriteProperties() { if (parquetCompressionLevel != null) { writeProperties.put(PARQUET_COMPRESSION_LEVEL, parquetCompressionLevel); } + boolean shouldShredVariants = shredVariants(); + writeProperties.put(PARQUET_SHRED_VARIANTS, String.valueOf(shouldShredVariants)); + + // Add variant shredding configuration properties + if (shouldShredVariants) { + writeProperties.put( + PARQUET_VARIANT_BUFFER_SIZE, String.valueOf(variantInferenceBufferSize())); + } break; case AVRO: @@ -724,4 +734,24 @@ public DeleteGranularity deleteGranularity() { .defaultValue(DeleteGranularity.FILE) .parse(); } + + public boolean shredVariants() { + return confParser + .booleanConf() + .option(SparkWriteOptions.SHRED_VARIANTS) + .sessionConf(SparkSQLProperties.SHRED_VARIANTS) + .tableProperty(TableProperties.PARQUET_SHRED_VARIANTS) + .defaultValue(TableProperties.PARQUET_SHRED_VARIANTS_DEFAULT) + .parse(); + } + + public int variantInferenceBufferSize() { + return confParser + .intConf() + 
.option(SparkWriteOptions.VARIANT_INFERENCE_BUFFER_SIZE) + .sessionConf(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE) + .tableProperty(TableProperties.PARQUET_VARIANT_BUFFER_SIZE) + .defaultValue(TableProperties.PARQUET_VARIANT_BUFFER_SIZE_DEFAULT) + .parse(); + } } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java index 2b88d2bb1e44..621db891d46c 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java @@ -86,4 +86,10 @@ private SparkWriteOptions() {} // Overrides the delete granularity public static final String DELETE_GRANULARITY = "delete-granularity"; + + // Controls whether to shred variant columns during write operations + public static final String SHRED_VARIANTS = "shred-variants"; + + // Controls the buffer size for variant schema inference during writes + public static final String VARIANT_INFERENCE_BUFFER_SIZE = "variant-inference-buffer-size"; } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java index d142e3fd1aee..cf9cc8fd511a 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java @@ -24,6 +24,8 @@ import java.nio.ByteBuffer; import java.nio.charset.CharsetEncoder; import java.nio.charset.StandardCharsets; +import java.time.LocalDateTime; +import org.apache.iceberg.util.DateTimeUtil; import org.apache.iceberg.util.ZOrderByteUtils; import org.apache.spark.sql.Column; import org.apache.spark.sql.expressions.UserDefinedFunction; @@ -40,6 +42,7 @@ import org.apache.spark.sql.types.LongType; import org.apache.spark.sql.types.ShortType; import 
org.apache.spark.sql.types.StringType; +import org.apache.spark.sql.types.TimestampNTZType; import org.apache.spark.sql.types.TimestampType; import scala.collection.JavaConverters; import scala.collection.Seq; @@ -180,6 +183,29 @@ value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) return udf; } + private UserDefinedFunction timestampNtzToOrderedBytesUDF() { + int position = inputCol; + UserDefinedFunction udf = + functions + .udf( + (LocalDateTime value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + long micros = DateTimeUtil.microsFromTimestamp(value); + return ZOrderByteUtils.longToOrderedBytes( + micros, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("TIMESTAMP_NTZ_ORDERED_BYTES"); + + this.inputCol++; + increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); + + return udf; + } + private UserDefinedFunction floatToOrderedBytesUDF() { int position = inputCol; UserDefinedFunction udf = @@ -309,6 +335,8 @@ Column sortedLexicographically(Column column, DataType type) { return booleanToOrderedBytesUDF().apply(column); } else if (type instanceof TimestampType) { return longToOrderedBytesUDF().apply(column.cast(DataTypes.LongType)); + } else if (type instanceof TimestampNTZType) { + return timestampNtzToOrderedBytesUDF().apply(column); } else if (type instanceof DateType) { return longToOrderedBytesUDF().apply(functions.unix_date(column).cast(DataTypes.LongType)); } else { diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java index c20be44f6735..c0d3d3efe026 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java @@ -77,7 +77,7 @@ public OrcValueReader record( TypeDescription record, List names, List> fields) { - return 
SparkOrcValueReaders.struct(fields, expected, idToConstant); + return SparkOrcValueReaders.struct(record, fields, expected, idToConstant); } @Override diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java index 670537fbf872..67664ac6c753 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java @@ -28,6 +28,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.UUIDUtil; +import org.apache.orc.TypeDescription; import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; import org.apache.orc.storage.ql.exec.vector.ColumnVector; import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; @@ -70,8 +71,11 @@ public static OrcValueReader decimals(int precision, int scale) { } static OrcValueReader struct( - List> readers, Types.StructType struct, Map idToConstant) { - return new StructReader(readers, struct, idToConstant); + TypeDescription record, + List> readers, + Types.StructType struct, + Map idToConstant) { + return new StructReader(record, readers, struct, idToConstant); } static OrcValueReader array(OrcValueReader elementReader) { @@ -143,8 +147,11 @@ static class StructReader extends OrcValueReaders.StructReader { private final int numFields; protected StructReader( - List> readers, Types.StructType struct, Map idToConstant) { - super(readers, struct, idToConstant); + TypeDescription record, + List> readers, + Types.StructType struct, + Map idToConstant) { + super(record, readers, struct, idToConstant); this.numFields = struct.fields().size(); } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java 
b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java index 527b41cdcff2..3e442f9917d4 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java @@ -23,7 +23,7 @@ import java.util.LinkedList; import java.util.List; import java.util.concurrent.Executors; -import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; @@ -31,6 +31,7 @@ import org.apache.iceberg.MicroBatches; import org.apache.iceberg.Snapshot; import org.apache.iceberg.Table; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadConf; @@ -52,7 +53,7 @@ class AsyncSparkMicroBatchPlanner extends BaseSparkMicroBatchPlanner implements private final Cache, List> planFilesCache; // Queue to buffer pre-fetched file scan tasks - private final LinkedBlockingQueue> queue; + private final LinkedBlockingDeque> queue; // Background executor for async operations private final ScheduledExecutorService executor; @@ -64,7 +65,6 @@ class AsyncSparkMicroBatchPlanner extends BaseSparkMicroBatchPlanner implements // Tracking queue state private final AtomicLong queuedFileCount = new AtomicLong(0); private final AtomicLong queuedRowCount = new AtomicLong(0); - private volatile Pair tail; private Snapshot lastQueuedSnapshot; private boolean stopped; @@ -90,10 +90,14 @@ class AsyncSparkMicroBatchPlanner extends BaseSparkMicroBatchPlanner implements this.minQueuedRows = readConf().maxRecordsPerMicroBatch(); this.lastOffsetForTriggerAvailableNow = 
lastOffsetForTriggerAvailableNow; this.planFilesCache = Caffeine.newBuilder().maximumSize(PLAN_FILES_CACHE_MAX_SIZE).build(); - this.queue = new LinkedBlockingQueue<>(); + this.queue = new LinkedBlockingDeque<>(); table().refresh(); - // Synchronously add data to the queue to meet our initial constraints + + // Synchronously add data to the queue to meet our initial constraints. + // For Trigger.AvailableNow, constructor-time preload is normally initialized from + // latestOffset(...) with no explicit end offset, so bounded preload must stop at + // Trigger.AvailableNow snapshot. fillQueue(initialOffset, maybeEndOffset); this.executor = @@ -172,17 +176,11 @@ public synchronized List planFiles( long rowsInPlan = 0; do { - // Synchronize here since we are polling, checking for empty and updating tail - synchronized (queue) { - try { - elem = queue.poll(QUEUE_POLL_TIMEOUT_MS, TimeUnit.MILLISECONDS); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new RuntimeException("Interrupted while polling queue", e); - } - if (queue.isEmpty()) { - tail = null; - } + try { + elem = queue.pollFirst(QUEUE_POLL_TIMEOUT_MS, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while polling queue", e); } if (elem != null) { @@ -197,7 +195,7 @@ public synchronized List planFiles( result.add(currentTask); // try to peek at the next entry of the queue and see if we should stop - Pair nextElem = queue.peek(); + Pair nextElem = queue.peekFirst(); boolean endOffsetPeek = false; if (nextElem != null) { endOffsetPeek = endOffset.equals(nextElem.first()); @@ -210,12 +208,18 @@ public synchronized List planFiles( } else { LOG.trace("planFiles hasn't reached {}, waiting", endOffset); } - } while (!shouldTerminate && refreshFailedThrowable == null); + } while (!shouldTerminate + && refreshFailedThrowable == null + && fillQueueFailedThrowable == null); if 
(refreshFailedThrowable != null) { throw new RuntimeException("Table refresh failed", refreshFailedThrowable); } + if (fillQueueFailedThrowable != null) { + throw new RuntimeException("Queue filling failed", fillQueueFailedThrowable); + } + LOG.info( "completed planFiles for {}, startOffset: {}, endOffset: {}, files: {}, rows: {}", table().name(), @@ -293,10 +297,12 @@ private StreamingOffset computeLimitedOffset(ReadLimit limit) { queuedFileCount.get(), queuedRowCount.get()); - // Convert to list for indexed access - List> queueList = Lists.newArrayList(queue); - for (int i = 0; i < queueList.size(); i++) { - Pair elem = queueList.get(i); + List> queueSnapshot = Lists.newArrayList(queue); + Pair queueTail = + queueSnapshot.isEmpty() ? null : queueSnapshot.get(queueSnapshot.size() - 1); + + for (int i = 0; i < queueSnapshot.size(); i++) { + Pair elem = queueSnapshot.get(i); long fileRows = elem.second().file().recordCount(); // Hard limit on files - stop BEFORE exceeding @@ -329,13 +335,13 @@ private StreamingOffset computeLimitedOffset(ReadLimit limit) { unpackedLimits.getMaxRows()); } // Return the offset of the NEXT element (or synthesize tail+1) - if (i + 1 < queueList.size()) { + if (i + 1 < queueSnapshot.size()) { LOG.debug( "latestOffset hit row limit at {}, rows: {}, files: {}", - queueList.get(i + 1).first(), + queueSnapshot.get(i + 1).first(), rowsSeen, filesSeen); - return queueList.get(i + 1).first(); + return queueSnapshot.get(i + 1).first(); } else { // This is the last element - return tail+1 StreamingOffset current = elem.first(); @@ -353,8 +359,8 @@ private StreamingOffset computeLimitedOffset(ReadLimit limit) { } // if we got here there aren't enough files to exceed our limits - if (tail != null) { - StreamingOffset tailOffset = tail.first(); + if (queueTail != null) { + StreamingOffset tailOffset = queueTail.first(); // we have to increment the position by 1 since we want to include the tail in the read and // position is non-inclusive 
StreamingOffset latestOffset = @@ -405,11 +411,7 @@ private void addMicroBatchToQueue( Pair.of(new StreamingOffset(microBatch.snapshotId(), position, shouldScanAllFile), task); queuedFileCount.incrementAndGet(); queuedRowCount.addAndGet(task.file().recordCount()); - // I have to synchronize here so queue and tail can never be out of sync - synchronized (queue) { - queue.add(elem); - tail = elem; - } + queue.addLast(elem); position += 1; } if (LOG.isDebugEnabled()) { @@ -461,8 +463,8 @@ private void fillQueueInitialBuffer(Snapshot startSnapshot) { long targetRows = readConf().asyncQueuePreloadRowLimit(); long targetFiles = readConf().asyncQueuePreloadFileLimit(); - Snapshot tableCurrentSnapshot = table().currentSnapshot(); - if (tableCurrentSnapshot == null) { + Snapshot preloadEndSnapshot = initialPreloadEndSnapshot(); + if (preloadEndSnapshot == null) { return; // Empty table } @@ -478,7 +480,7 @@ private void fillQueueInitialBuffer(Snapshot startSnapshot) { // Continue loading more snapshots within safety limits if (current != null) { while ((queuedRowCount.get() < targetRows || queuedFileCount.get() < targetFiles) - && current.snapshotId() != tableCurrentSnapshot.snapshotId()) { + && current.snapshotId() != preloadEndSnapshot.snapshotId()) { current = nextValidSnapshot(current); if (current != null) { addMicroBatchToQueue( @@ -490,12 +492,26 @@ private void fillQueueInitialBuffer(Snapshot startSnapshot) { } } + private Snapshot initialPreloadEndSnapshot() { + if (lastOffsetForTriggerAvailableNow != null) { + return table().snapshot(lastOffsetForTriggerAvailableNow.snapshotId()); + } + + return table().currentSnapshot(); + } + + @VisibleForTesting + static boolean reachedAvailableNowCap( + Snapshot readFrom, StreamingOffset lastOffsetForTriggerAvailableNow) { + return lastOffsetForTriggerAvailableNow != null + && readFrom != null + && readFrom.snapshotId() == lastOffsetForTriggerAvailableNow.snapshotId(); + } + /** Try to populate the queue with data from unread 
snapshots */ private void fillQueue(Snapshot readFrom) { // Don't add beyond cap for Trigger.AvailableNow - if (this.lastOffsetForTriggerAvailableNow != null - && readFrom != null - && readFrom.snapshotId() >= this.lastOffsetForTriggerAvailableNow.snapshotId()) { + if (reachedAvailableNowCap(readFrom, lastOffsetForTriggerAvailableNow)) { LOG.debug( "Reached cap snapshot {}, not adding more", this.lastOffsetForTriggerAvailableNow.snapshotId()); diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java index 3f2e0f1af08f..5c973ae711b9 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java @@ -55,7 +55,9 @@ public static void register() { StructType.class, SparkParquetWriters::buildWriter, (icebergSchema, fileSchema, engineSchema, idToConstant) -> - SparkParquetReaders.buildReader(icebergSchema, fileSchema, idToConstant))); + SparkParquetReaders.buildReader(icebergSchema, fileSchema, idToConstant), + new SparkVariantShreddingAnalyzer(), + InternalRow::copy)); FormatModelRegistry.register( ParquetFormatModel.create( diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java index ae3c0ce0c8bb..80a40d72c8d1 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java @@ -49,6 +49,7 @@ import org.apache.iceberg.spark.CommitMetadata; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.SparkTableUtil; import org.apache.iceberg.spark.SparkUtil; import org.apache.iceberg.spark.SparkV2Filters; import 
org.apache.iceberg.spark.TimeTravel; @@ -83,20 +84,14 @@ public class SparkTable extends BaseSparkTable private static final Logger LOG = LoggerFactory.getLogger(SparkTable.class); - private static final Set CAPABILITIES = + private static final Set BASE_CAPABILITIES = ImmutableSet.of( - TableCapability.AUTOMATIC_SCHEMA_EVOLUTION, TableCapability.BATCH_READ, TableCapability.BATCH_WRITE, TableCapability.MICRO_BATCH_READ, TableCapability.STREAMING_WRITE, TableCapability.OVERWRITE_BY_FILTER, TableCapability.OVERWRITE_DYNAMIC); - private static final Set CAPABILITIES_WITH_ACCEPT_ANY_SCHEMA = - ImmutableSet.builder() - .addAll(CAPABILITIES) - .add(TableCapability.ACCEPT_ANY_SCHEMA) - .build(); private final Schema schema; // effective schema (not necessarily current table schema) private final Snapshot snapshot; // always set unless table is empty @@ -133,7 +128,7 @@ private SparkTable( this.snapshot = snapshot; this.branch = branch; this.timeTravel = timeTravel; - this.capabilities = acceptAnySchema(table) ? 
CAPABILITIES_WITH_ACCEPT_ANY_SCHEMA : CAPABILITIES; + this.capabilities = computeCapabilities(table); } public SparkTable copyWithBranch(String newBranch) { @@ -214,11 +209,14 @@ public boolean canDeleteWhere(Predicate[] predicates) { } } - return canDeleteUsingMetadata(deleteExpr); + String scanBranch = + SparkTableUtil.determineReadBranch( + spark(), table(), branch, CaseInsensitiveStringMap.empty()); + return canDeleteUsingMetadata(deleteExpr, scanBranch); } // a metadata delete is possible iff matching files can be deleted entirely - private boolean canDeleteUsingMetadata(Expression deleteExpr) { + private boolean canDeleteUsingMetadata(Expression deleteExpr, String scanBranch) { boolean caseSensitive = SparkUtil.caseSensitive(spark()); if (ExpressionUtil.selectsPartitions(deleteExpr, table(), caseSensitive)) { @@ -233,7 +231,9 @@ private boolean canDeleteUsingMetadata(Expression deleteExpr) { .includeColumnStats() .ignoreResiduals(); - if (snapshot != null) { + if (scanBranch != null) { + scan = scan.useRef(scanBranch); + } else if (snapshot != null) { scan = scan.useSnapshot(snapshot.snapshotId()); } @@ -275,8 +275,12 @@ public void deleteWhere(Predicate[] predicates) { .set("spark.app.id", spark().sparkContext().applicationId()) .deleteFromRowFilter(deleteExpr); - if (branch != null) { - deleteFiles.toBranch(branch); + String writeBranch = + SparkTableUtil.determineWriteBranch( + spark(), table(), branch, CaseInsensitiveStringMap.empty()); + + if (writeBranch != null) { + deleteFiles.toBranch(writeBranch); } if (!CommitMetadata.commitProperties().isEmpty()) { @@ -353,6 +357,21 @@ private static SparkTable createWithTimestamp(Table table, AsOfTimestamp timeTra return new SparkTable(table, snapshotId, timeTravel); } + private static Set computeCapabilities(Table table) { + ImmutableSet.Builder tableCapabilities = ImmutableSet.builder(); + tableCapabilities.addAll(BASE_CAPABILITIES); + + if (autoSchemaEvolution(table)) { + 
tableCapabilities.add(TableCapability.AUTOMATIC_SCHEMA_EVOLUTION); + } + + if (acceptAnySchema(table)) { + tableCapabilities.add(TableCapability.ACCEPT_ANY_SCHEMA); + } + + return tableCapabilities.build(); + } + private static boolean acceptAnySchema(Table table) { return PropertyUtil.propertyAsBoolean( table.properties(), @@ -360,6 +379,13 @@ private static boolean acceptAnySchema(Table table) { TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA_DEFAULT); } + private static boolean autoSchemaEvolution(Table table) { + return PropertyUtil.propertyAsBoolean( + table.properties(), + TableProperties.SPARK_WRITE_AUTO_SCHEMA_EVOLUTION, + TableProperties.SPARK_WRITE_AUTO_SCHEMA_EVOLUTION_DEFAULT); + } + // returns latest snapshot for branch or current snapshot if branch is yet to be created private static Snapshot determineLatestSnapshot(Table table, String branch) { if (branch != null && table.refs().containsKey(branch)) { diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkVariantShreddingAnalyzer.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkVariantShreddingAnalyzer.java new file mode 100644 index 000000000000..2c08c662c9da --- /dev/null +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkVariantShreddingAnalyzer.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.List; +import org.apache.iceberg.parquet.VariantShreddingAnalyzer; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.variants.VariantMetadata; +import org.apache.iceberg.variants.VariantValue; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.VariantVal; + +/** + * Spark-specific implementation that extracts variant values from {@link InternalRow} instances. 
+ */ +class SparkVariantShreddingAnalyzer extends VariantShreddingAnalyzer { + + SparkVariantShreddingAnalyzer() {} + + @Override + protected int resolveColumnIndex(StructType sparkSchema, String columnName) { + try { + return sparkSchema.fieldIndex(columnName); + } catch (IllegalArgumentException e) { + return -1; + } + } + + @Override + protected List extractVariantValues( + List bufferedRows, int variantFieldIndex) { + List values = Lists.newArrayList(); + + for (InternalRow row : bufferedRows) { + if (!row.isNullAt(variantFieldIndex)) { + VariantVal variantVal = row.getVariant(variantFieldIndex); + if (variantVal != null) { + VariantValue variantValue = + VariantValue.from( + VariantMetadata.from( + ByteBuffer.wrap(variantVal.getMetadata()).order(ByteOrder.LITTLE_ENDIAN)), + ByteBuffer.wrap(variantVal.getValue()).order(ByteOrder.LITTLE_ENDIAN)); + values.add(variantValue); + } + } + } + + return values; + } +} diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java index 4858f7793c69..79730cd63d4b 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java @@ -27,10 +27,10 @@ import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.spark.SparkFilters; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkTableUtil; import org.apache.iceberg.spark.SparkUtil; +import org.apache.iceberg.spark.SparkV2Filters; import org.apache.iceberg.spark.SparkWriteConf; import org.apache.iceberg.spark.SparkWriteRequirements; import org.apache.iceberg.spark.source.SparkWriteBuilder.Mode.Append; @@ -39,20 +39,20 @@ import 
org.apache.iceberg.spark.source.SparkWriteBuilder.Mode.OverwriteByFilter; import org.apache.iceberg.types.TypeUtil; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.connector.expressions.filter.Predicate; import org.apache.spark.sql.connector.read.Scan; import org.apache.spark.sql.connector.write.BatchWrite; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.RowLevelOperation.Command; import org.apache.spark.sql.connector.write.SupportsDynamicOverwrite; -import org.apache.spark.sql.connector.write.SupportsOverwrite; +import org.apache.spark.sql.connector.write.SupportsOverwriteV2; import org.apache.spark.sql.connector.write.Write; import org.apache.spark.sql.connector.write.WriteBuilder; import org.apache.spark.sql.connector.write.streaming.StreamingWrite; -import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; -class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, SupportsOverwrite { +class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, SupportsOverwriteV2 { private final SparkSession spark; private final Table table; private final String branch; @@ -91,9 +91,9 @@ public WriteBuilder overwriteDynamicPartitions() { } @Override - public WriteBuilder overwrite(Filter[] filters) { + public WriteBuilder overwrite(Predicate[] predicates) { Preconditions.checkState(mode == null, "Cannot use overwrite by filter with other modes"); - Expression expr = SparkFilters.convert(filters); + Expression expr = SparkV2Filters.convert(predicates); this.mode = useDynamicOverwrite(expr) ? 
new DynamicOverwrite() : new OverwriteByFilter(expr); return this; } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java index aa4f3dc72416..d1c724425c9f 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java @@ -24,6 +24,7 @@ import java.util.Arrays; import java.util.List; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.BeforeEach; @@ -89,6 +90,7 @@ protected static SparkSession initSpark(String serializer) { .master("local[2]") .config("spark.serializer", serializer) .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java index 6ffaede5b069..a21c6a08ec3b 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java @@ -24,6 +24,7 @@ import java.util.Arrays; import java.util.List; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -72,6 +73,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git 
a/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java index 1e680ace292f..5edf4828229a 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java @@ -22,6 +22,7 @@ import static org.apache.iceberg.PlanningMode.LOCAL; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -61,6 +62,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java index 9b736004de57..e6f3c75475d8 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java @@ -24,6 +24,7 @@ import java.util.Arrays; import java.util.List; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -62,6 +63,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git 
a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java new file mode 100644 index 000000000000..ee1f29e56fb3 --- /dev/null +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark; + +import com.codahale.metrics.MetricRegistry; +import java.util.Properties; +import org.apache.spark.SparkConf; +import org.apache.spark.metrics.sink.MetricsServlet; +import org.sparkproject.jetty.servlet.ServletContextHandler; + +/** + * A dummy implementation of {@link MetricsServlet} that does not start a server or report metrics. + * This is used in tests to avoid conflicts with Spark's jetty dependencies. + */ +public class DummyMetricsServlet extends MetricsServlet { + + /** + * Constructor required by Spark's reflection-based instantiation. 
+ * + * @param properties Metrics properties + * @param registry Metric registry + */ + public DummyMetricsServlet(Properties properties, MetricRegistry registry) { + super(properties, registry); + } + + @Override + public ServletContextHandler[] getHandlers(SparkConf conf) { + return new ServletContextHandler[] {}; + } + + @Override + public void start() { + // No-op for tests + } + + @Override + public void stop() { + // No-op for tests + } + + @Override + public void report() { + // No-op for tests + } +} diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java index a9fbee2fc262..b20c87619ed8 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java @@ -65,7 +65,23 @@ public enum SparkCatalogConfig { SPARK_WITH_HIVE_VIEWS( "spark_hive_with_views", SparkCatalog.class.getName(), - ImmutableMap.of("type", "hive", "default-namespace", "default", "cache-enabled", "false")); + ImmutableMap.of("type", "hive", "default-namespace", "default", "cache-enabled", "false")), + SPARK_SESSION_WITH_UNIQUE_LOCATION( + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "parquet-enabled", "true", + "unique-table-location", "true", + "cache-enabled", "false")), + HIVE_WITH_UNIQUE_LOCATION( + "hive_with_unique_location", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "unique-table-location", "true")); private final String catalogName; private final String implementation; diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestBase.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestBase.java index daf4e29ac075..507d7b313b42 100644 --- 
a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestBase.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestBase.java @@ -65,6 +65,13 @@ public abstract class TestBase extends SparkTestHelperBase { protected static SparkSession spark = null; protected static JavaSparkContext sparkContext = null; protected static HiveCatalog catalog = null; + // disable Spark UI and use dummy servlet to avoid dependency conflicts with Spark's Jetty version + public static final Map DISABLE_UI = + ImmutableMap.of( + "spark.ui.enabled", + "false", + "spark.metrics.conf.*.sink.servlet.class", + "org.apache.iceberg.spark.DummyMetricsServlet"); @BeforeAll public static void startMetastoreAndSpark() { @@ -79,6 +86,8 @@ public static void startMetastoreAndSpark() { .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .config("spark.ui.enabled", "false") + .config(DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java index 383a21087d7f..336067c31235 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java @@ -34,6 +34,7 @@ import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL; +import static org.apache.iceberg.TableProperties.PARQUET_SHRED_VARIANTS; import static org.apache.iceberg.TableProperties.UPDATE_DISTRIBUTION_MODE; import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; import static 
org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH; @@ -345,6 +346,8 @@ public void testSparkConfOverride() { TableProperties.DELETE_PARQUET_COMPRESSION, "snappy"), ImmutableMap.of( + PARQUET_SHRED_VARIANTS, + "false", DELETE_PARQUET_COMPRESSION, "zstd", PARQUET_COMPRESSION, @@ -467,6 +470,8 @@ public void testDataPropsDefaultsAsDeleteProps() { PARQUET_COMPRESSION_LEVEL, "5"), ImmutableMap.of( + PARQUET_SHRED_VARIANTS, + "false", DELETE_PARQUET_COMPRESSION, "zstd", PARQUET_COMPRESSION, @@ -538,6 +543,8 @@ public void testDeleteFileWriteConf() { DELETE_PARQUET_COMPRESSION_LEVEL, "6"), ImmutableMap.of( + PARQUET_SHRED_VARIANTS, + "false", DELETE_PARQUET_COMPRESSION, "zstd", PARQUET_COMPRESSION, @@ -698,4 +705,81 @@ private void checkMode(DistributionMode expectedMode, SparkWriteConf writeConf) assertThat(writeConf.copyOnWriteDistributionMode(MERGE)).isEqualTo(expectedMode); assertThat(writeConf.positionDeltaDistributionMode(MERGE)).isEqualTo(expectedMode); } + + @TestTemplate + public void testShredVariantsDefault() { + Table table = validationCatalog.loadTable(tableIdent); + SparkWriteConf writeConf = new SparkWriteConf(spark, table); + assertThat(writeConf.shredVariants()).isFalse(); + } + + @TestTemplate + public void testVariantInferenceBufferSizeDefault() { + Table table = validationCatalog.loadTable(tableIdent); + SparkWriteConf writeConf = new SparkWriteConf(spark, table); + assertThat(writeConf.variantInferenceBufferSize()) + .isEqualTo(TableProperties.PARQUET_VARIANT_BUFFER_SIZE_DEFAULT); + } + + @TestTemplate + public void testVariantInferenceBufferSizeTableProperty() { + Table table = validationCatalog.loadTable(tableIdent); + + table.updateProperties().set(TableProperties.PARQUET_VARIANT_BUFFER_SIZE, "500").commit(); + + SparkWriteConf writeConf = new SparkWriteConf(spark, table); + assertThat(writeConf.variantInferenceBufferSize()).isEqualTo(500); + } + + @TestTemplate + public void testShredVariantsSessionOverridesTableProperty() { + Table 
table = validationCatalog.loadTable(tableIdent); + table.updateProperties().set(TableProperties.PARQUET_SHRED_VARIANTS, "false").commit(); + + withSQLConf( + ImmutableMap.of(SparkSQLProperties.SHRED_VARIANTS, "true"), + () -> { + SparkWriteConf writeConf = new SparkWriteConf(spark, table); + assertThat(writeConf.shredVariants()).isTrue(); + }); + } + + @TestTemplate + public void testShredVariantsWriteOptionOverridesSessionConf() { + withSQLConf( + ImmutableMap.of(SparkSQLProperties.SHRED_VARIANTS, "false"), + () -> { + Table table = validationCatalog.loadTable(tableIdent); + SparkWriteConf writeConf = + new SparkWriteConf( + spark, + table, + new CaseInsensitiveStringMap( + ImmutableMap.of(SparkWriteOptions.SHRED_VARIANTS, "true"))); + assertThat(writeConf.shredVariants()).isTrue(); + }); + } + + @TestTemplate + public void testVariantInferenceBufferSizeSessionConf() { + withSQLConf( + ImmutableMap.of(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "250"), + () -> { + Table table = validationCatalog.loadTable(tableIdent); + SparkWriteConf writeConf = new SparkWriteConf(spark, table); + assertThat(writeConf.variantInferenceBufferSize()).isEqualTo(250); + }); + } + + @TestTemplate + public void testWritePropertiesIncludeVariantShredding() { + Table table = validationCatalog.loadTable(tableIdent); + table.updateProperties().set(TableProperties.PARQUET_SHRED_VARIANTS, "true").commit(); + table.updateProperties().set(TableProperties.PARQUET_VARIANT_BUFFER_SIZE, "200").commit(); + + SparkWriteConf writeConf = new SparkWriteConf(spark, table); + Map writeProperties = writeConf.writeProperties(); + assertThat(writeProperties).containsEntry(PARQUET_SHRED_VARIANTS, "true"); + assertThat(writeProperties).containsEntry(TableProperties.PARQUET_VARIANT_BUFFER_SIZE, "200"); + } } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java index 
eb89b0a23274..50afb53e0539 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java @@ -662,7 +662,7 @@ public void schemaEvolutionTestWithSparkSQL() throws Exception { "CAST(id AS FLOAT) col1", "CAST(id AS STRING) col2", "CAST(id AS INT) col3") - .registerTempTable("tempdata"); + .createOrReplaceTempView("tempdata"); sql("INSERT INTO TABLE %s SELECT * FROM tempdata", tblName); List expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName); List expectedAfterAddColumn = diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index 9524b0e7167d..110e43ede1f9 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -2129,6 +2129,23 @@ public void testZOrderUDFWithDateType() { assertThat(zorderBytes).isNotNull().isNotEmpty(); } + @TestTemplate + public void testZOrderUDFWithTimestampNTZType() { + SparkZOrderUDF zorderUDF = new SparkZOrderUDF(1, 16, 1024); + Dataset result = + spark + .sql("SELECT timestamp_ntz '2025-01-01 12:00:00' as test_col") + .withColumn( + "zorder_result", + zorderUDF.sortedLexicographically(col("test_col"), DataTypes.TimestampNTZType)); + + assertThat(result.schema().apply("zorder_result").dataType()).isEqualTo(DataTypes.BinaryType); + List rows = result.collectAsList(); + Row row = rows.get(0); + byte[] zorderBytes = row.getAs("zorder_result"); + assertThat(zorderBytes).isNotNull().isNotEmpty(); + } + protected void shouldRewriteDataFilesWithPartitionSpec(Table table, int outputSpecId) { List rewrittenFiles = currentDataFiles(table); assertThat(rewrittenFiles).allMatch(file -> file.specId() == 
outputSpecId); diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java index c18e4c053f50..291bb2bca4f5 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java @@ -25,6 +25,8 @@ import org.apache.iceberg.data.BaseFormatModelTests; import org.apache.iceberg.data.Record; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.SparkUtil; +import org.apache.iceberg.types.Type; import org.apache.spark.sql.catalyst.InternalRow; public class TestSparkFormatModel extends BaseFormatModelTests { @@ -51,4 +53,9 @@ protected void assertEquals(Schema schema, List expected, List config = ImmutableMap.of( diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java index da6f8c73a1a6..95db27c27b32 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java @@ -76,6 +76,7 @@ import org.apache.iceberg.spark.ParquetBatchReadConf; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkStructLike; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.VortexBatchReadConf; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.SparkParquetWriters; @@ -139,6 +140,7 @@ public static void startMetastoreAndSpark() { .config("spark.ui.liveUpdate.period", 0) .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java index d22ecb02d483..cb2f866fab10 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java @@ -64,6 +64,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkValueConverter; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.PropertyUtil; import org.apache.spark.sql.Dataset; @@ -182,6 +183,7 @@ public static void startMetastoreAndSpark() { SparkSession.builder() .master("local[2]") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java index 8d191cf30b14..5e900ea0bad4 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java @@ -38,6 +38,7 @@ import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; @@ -72,6 +73,7 @@ public static void startSpark() { .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) .config("spark.sql.shuffle.partitions", 4) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java index 98e83bdd17cc..3957872be721 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java @@ -37,6 +37,7 @@ import org.apache.iceberg.DataOperations; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; import org.apache.iceberg.Files; import org.apache.iceberg.Parameter; import org.apache.iceberg.ParameterizedTestExtension; @@ -53,22 +54,28 @@ import org.apache.iceberg.data.GenericRecord; 
import org.apache.iceberg.data.Record; import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.CatalogTestBase; import org.apache.iceberg.spark.SparkCatalogConfig; +import org.apache.iceberg.spark.SparkReadConf; import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.VoidFunction2; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Row; +import org.apache.spark.sql.connector.read.InputPartition; +import org.apache.spark.sql.connector.read.streaming.Offset; import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.streaming.DataStreamWriter; import org.apache.spark.sql.streaming.OutputMode; import org.apache.spark.sql.streaming.StreamingQuery; import org.apache.spark.sql.streaming.Trigger; +import org.apache.spark.sql.util.CaseInsensitiveStringMap; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; @@ -465,6 +472,143 @@ public void testTriggerAvailableNowDoesNotProcessNewDataWhileRunning() throws Ex assertThat(actualResults).containsExactlyInAnyOrderElementsOf(Iterables.concat(expectedData)); } + @TestTemplate + public void testTriggerAvailableNowCapsAsyncPreloadAfterPrepare() { + List> initialData = + List.of(List.of(new SimpleRecord(1, "one")), List.of(new SimpleRecord(2, "two"))); + appendDataAsMultipleSnapshots(initialData); + + table.refresh(); + long expectedCapSnapshotId = table.currentSnapshot().snapshotId(); + + SparkMicroBatchStream stream = + new SparkMicroBatchStream( + 
JavaSparkContext.fromSparkContext(spark.sparkContext()), + table, + table::io, + new SparkReadConf( + spark, + table, + new CaseInsensitiveStringMap( + ImmutableMap.of( + SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, + async.toString(), + SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, + "1", + SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, + "1", + SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT, + "10", + SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT, + "10"))), + table.schema(), + temp.resolve("available-now-cap-checkpoint").toString()); + + try { + stream.prepareForTriggerAvailableNow(); + + appendData(List.of(new SimpleRecord(3, "three"))); + + Offset startOffset = stream.initialOffset(); + Offset firstEndOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + assertThat(firstEndOffset).isNotNull(); + stream.planInputPartitions(startOffset, firstEndOffset); + + Offset secondEndOffset = stream.latestOffset(firstEndOffset, stream.getDefaultReadLimit()); + assertThat(secondEndOffset).isNotNull(); + stream.planInputPartitions(firstEndOffset, secondEndOffset); + + assertThat(stream.latestOffset(secondEndOffset, stream.getDefaultReadLimit())).isNull(); + assertThat(((StreamingOffset) secondEndOffset).snapshotId()).isEqualTo(expectedCapSnapshotId); + } finally { + stream.stop(); + } + } + + @TestTemplate + public void testLatestOffsetReturnsNullAfterFinalBatchIsConsumed() throws Exception { + appendDataAsMultipleSnapshots(TEST_DATA_MULTIPLE_SNAPSHOTS); + + table.refresh(); + int expectedBatchCount; + try (CloseableIterable tasks = table.newScan().planFiles()) { + expectedBatchCount = Iterables.size(tasks); + } + + SparkMicroBatchStream stream = + newMicroBatchStream( + ImmutableMap.of(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1"), + "drain-to-null-checkpoint"); + + try { + int plannedBatchCount = 0; + Offset startOffset = stream.initialOffset(); + Offset endOffset = stream.latestOffset(startOffset, 
stream.getDefaultReadLimit()); + while (endOffset != null) { + InputPartition[] partitions = stream.planInputPartitions(startOffset, endOffset); + assertThat(partitions).isNotEmpty(); + plannedBatchCount += 1; + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + } + + assertThat(endOffset).isNull(); + assertThat(plannedBatchCount).isEqualTo(expectedBatchCount); + } finally { + stream.stop(); + } + } + + @TestTemplate + public void testPlanInputPartitionsIsIdempotentForSameOffsets() { + appendDataAsMultipleSnapshots(TEST_DATA_MULTIPLE_SNAPSHOTS); + + SparkMicroBatchStream stream = + newMicroBatchStream( + ImmutableMap.of(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1"), + "idempotent-plan-files-checkpoint"); + + try { + Offset startOffset = stream.initialOffset(); + Offset endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + + assertThat(endOffset).isNotNull(); + + InputPartition[] firstPartitions = stream.planInputPartitions(startOffset, endOffset); + InputPartition[] secondPartitions = stream.planInputPartitions(startOffset, endOffset); + + List firstFileLocations = Lists.newArrayList(); + for (InputPartition partition : firstPartitions) { + SparkInputPartition sparkInputPartition = (SparkInputPartition) partition; + for (FileScanTask task : sparkInputPartition.taskGroup().tasks()) { + firstFileLocations.add(task.file().location()); + } + } + + List secondFileLocations = Lists.newArrayList(); + for (InputPartition partition : secondPartitions) { + SparkInputPartition sparkInputPartition = (SparkInputPartition) partition; + for (FileScanTask task : sparkInputPartition.taskGroup().tasks()) { + secondFileLocations.add(task.file().location()); + } + } + + assertThat(firstFileLocations).containsExactlyInAnyOrderElementsOf(secondFileLocations); + + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + while (endOffset != null) { + 
assertThat(stream.planInputPartitions(startOffset, endOffset)).isNotEmpty(); + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + } + + assertThat(endOffset).isNull(); + } finally { + stream.stop(); + } + } + @TestTemplate public void testReadStreamOnIcebergThenAddData() throws Exception { List> expected = TEST_DATA_MULTIPLE_SNAPSHOTS; @@ -1053,4 +1197,21 @@ private List rowsAvailable(StreamingQuery query) { .as(Encoders.bean(SimpleRecord.class)) .collectAsList(); } + + private SparkMicroBatchStream newMicroBatchStream( + Map options, String checkpointDirName) { + Map allOptions = Maps.newHashMap(options); + allOptions.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + allOptions.putIfAbsent(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + + return new SparkMicroBatchStream( + JavaSparkContext.fromSparkContext(spark.sparkContext()), + table, + table::io, + new SparkReadConf(spark, table, new CaseInsensitiveStringMap(allOptions)), + table.schema(), + temp.resolve(checkpointDirName).toString()); + } } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java index e2b5d8920e9f..ab2479d61058 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java @@ -42,6 +42,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; @@ -85,6 +86,7 @@ public static void startSpark() { 
SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestWriteMetricsConfig.sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java index 4baaf2d1fbb5..6eac5474afde 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java @@ -63,6 +63,7 @@ public static void startMetastoreAndSpark() { SparkSession.builder() .master("local[2]") .config("spark.sql.iceberg.aggregate_pushdown", "true") + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java index 536d568003cf..0d010087cd8b 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java @@ -20,11 +20,20 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; import java.util.List; +import java.util.Map; +import org.apache.iceberg.Parameter; import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.RowLevelOperationMode; +import org.apache.iceberg.SnapshotSummary; import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TestHelpers; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import 
org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.CatalogTestBase; import org.apache.iceberg.spark.source.SimpleRecord; @@ -37,14 +46,41 @@ @ExtendWith(ParameterizedTestExtension.class) public class TestDeleteFrom extends CatalogTestBase { + @Parameter(index = 3) + private int formatVersion; + + @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}, formatVersion = {3}") + protected static Object[][] parameters() { + List parameters = Lists.newArrayList(); + for (Object[] catalogParams : CatalogTestBase.parameters()) { + for (int version : TestHelpers.V2_AND_ABOVE) { + parameters.add( + new Object[] {catalogParams[0], catalogParams[1], catalogParams[2], version}); + } + } + + return parameters.toArray(new Object[0][]); + } + @AfterEach public void removeTables() { sql("DROP TABLE IF EXISTS %s", tableName); } + private String tableProperties() { + return tableProperties(ImmutableMap.of()); + } + + private String tableProperties(Map additionalProperties) { + ImmutableMap.Builder builder = ImmutableMap.builder(); + builder.putAll(additionalProperties); + builder.put(TableProperties.FORMAT_VERSION, String.valueOf(formatVersion)); + return String.format("TBLPROPERTIES (%s)", tablePropsAsString(builder.buildKeepingLast())); + } + @TestTemplate public void testDeleteFromUnpartitionedTable() throws NoSuchTableException { - sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName); + sql("CREATE TABLE %s (id bigint, data string) USING iceberg %s", tableName, tableProperties()); List records = Lists.newArrayList( @@ -74,7 +110,7 @@ public void testDeleteFromUnpartitionedTable() throws NoSuchTableException { @TestTemplate public void testDeleteFromTableAtSnapshot() throws NoSuchTableException { - sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName); + sql("CREATE TABLE %s (id bigint, data string) USING 
iceberg %s", tableName, tableProperties()); List records = Lists.newArrayList( @@ -94,8 +130,9 @@ public void testDeleteFromPartitionedTable() throws NoSuchTableException { sql( "CREATE TABLE %s (id bigint, data string) " + "USING iceberg " - + "PARTITIONED BY (truncate(id, 2))", - tableName); + + "PARTITIONED BY (truncate(id, 2)) " + + "%s", + tableName, tableProperties()); List records = Lists.newArrayList( @@ -124,7 +161,9 @@ public void testDeleteFromPartitionedTable() throws NoSuchTableException { @TestTemplate public void testDeleteFromWhereFalse() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg %s", + tableName, tableProperties()); sql("INSERT INTO TABLE %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", tableName); assertEquals( @@ -144,7 +183,9 @@ public void testDeleteFromWhereFalse() { @TestTemplate public void testTruncate() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg %s", + tableName, tableProperties()); sql("INSERT INTO TABLE %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", tableName); assertEquals( @@ -166,8 +207,8 @@ public void testTruncate() { @TestTemplate public void testDeleteFromTablePartitionedByVarbinary() { sql( - "CREATE TABLE %s (id bigint NOT NULL, data binary) USING iceberg PARTITIONED BY (data)", - tableName); + "CREATE TABLE %s (id bigint NOT NULL, data binary) USING iceberg PARTITIONED BY (data) %s", + tableName, tableProperties()); sql("INSERT INTO TABLE %s VALUES(1, X'e3bcd1'), (2, X'bcd1')", tableName); assertEquals( @@ -185,4 +226,46 @@ public void testDeleteFromTablePartitionedByVarbinary() { ImmutableList.of(row(1L, new byte[] {-29, -68, -47})), sql("SELECT * FROM %s where data = X'e3bcd1'", tableName)); } + + @TestTemplate + public void truncateWithDVs() throws NoSuchTableException { + 
assumeThat(formatVersion).isGreaterThanOrEqualTo(3); + + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg %s", + tableName, + tableProperties( + ImmutableMap.of( + TableProperties.DELETE_MODE, RowLevelOperationMode.MERGE_ON_READ.modeName()))); + List records = + ImmutableList.of( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); + Dataset df = spark.createDataFrame(records, SimpleRecord.class); + df.coalesce(1).writeTo(tableName).append(); + + assertThat(sql("SELECT * FROM %s ORDER BY id", tableName)) + .containsExactly(row(1L, "a"), row(2L, "b"), row(3L, "c")); + + sql("DELETE FROM %s WHERE id = 1", tableName); + assertThat(validationCatalog.loadTable(tableIdent).currentSnapshot().summary()) + .containsEntry(SnapshotSummary.ADDED_DVS_PROP, "1") + .containsEntry(SnapshotSummary.ADDED_POS_DELETES_PROP, "1"); + + sql("DELETE FROM %s WHERE id = 2", tableName); + // DVs have been merged into single file + assertThat(validationCatalog.loadTable(tableIdent).currentSnapshot().summary()) + .containsEntry(SnapshotSummary.ADDED_DVS_PROP, "1") + .containsEntry(SnapshotSummary.REMOVED_DVS_PROP, "1") + .containsEntry(SnapshotSummary.ADDED_POS_DELETES_PROP, "2"); + + assertThat(sql("SELECT * FROM %s ORDER BY id", tableName)).containsExactly(row(3L, "c")); + + sql("TRUNCATE TABLE %s", tableName); + assertThat(validationCatalog.loadTable(tableIdent).currentSnapshot().summary()) + .containsEntry(SnapshotSummary.REMOVED_DVS_PROP, "1") + .containsEntry(SnapshotSummary.REMOVED_DELETE_FILES_PROP, "1") + .containsEntry(SnapshotSummary.REMOVED_POS_DELETES_PROP, "2"); + + assertThat(sql("SELECT * FROM %s ORDER BY id", tableName)).isEmpty(); + } } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkVariantRead.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkVariantRead.java index 599bf591e9a4..2d6e919a91ee 100644 --- 
a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkVariantRead.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkVariantRead.java @@ -302,6 +302,55 @@ public void testNestedMapVariant(boolean vectorized) { sql("DROP TABLE IF EXISTS %s", mapTable); } + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void testMergeIntoWithVariant(boolean vectorized) { + // Variant columns are not vectorized yet, but MERGE INTO should not crash regardless of the + // vectorization setting. The reader falls back to non-vectorized for variant columns. + String mergeTable = CATALOG + ".default.var_merge"; + sql("DROP TABLE IF EXISTS %s", mergeTable); + sql( + "CREATE TABLE %s (id BIGINT, data VARIANT) USING iceberg " + + "TBLPROPERTIES ('format-version'='3')", + mergeTable); + setVectorization(mergeTable, vectorized); + + sql( + "INSERT INTO %s VALUES " + + "(1, parse_json('{\"name\":\"alice\",\"age\":30}')), " + + "(2, parse_json('{\"name\":\"bob\",\"age\":25}'))", + mergeTable); + + sql( + "MERGE INTO %s AS target " + + "USING (SELECT 1 AS id, parse_json('{\"name\":\"alice\",\"age\":31}') AS data) AS source " + + "ON target.id = source.id " + + "WHEN MATCHED THEN UPDATE SET target.data = source.data " + + "WHEN NOT MATCHED THEN INSERT *", + mergeTable); + + List rows = spark.table(mergeTable).select("id", "data").orderBy("id").collectAsList(); + + assertThat(rows).hasSize(2); + assertThat(rows.get(0).getLong(0)).isEqualTo(1L); + Variant v1 = + new Variant( + ((VariantVal) rows.get(0).get(1)).getValue(), + ((VariantVal) rows.get(0).get(1)).getMetadata()); + assertThat(v1.getFieldByKey("name").getString()).describedAs("v1.name").isEqualTo("alice"); + assertThat(v1.getFieldByKey("age").getLong()).describedAs("v1.age").isEqualTo(31L); + + assertThat(rows.get(1).getLong(0)).isEqualTo(2L); + Variant v2 = + new Variant( + ((VariantVal) rows.get(1).get(1)).getValue(), + ((VariantVal) rows.get(1).get(1)).getMetadata()); + 
assertThat(v2.getFieldByKey("name").getString()).describedAs("v2.name").isEqualTo("bob"); + assertThat(v2.getFieldByKey("age").getLong()).describedAs("v2.age").isEqualTo(25L); + + sql("DROP TABLE IF EXISTS %s", mergeTable); + } + private void setVectorization(boolean on) { sql( "ALTER TABLE %s SET TBLPROPERTIES ('read.parquet.vectorization.enabled'='%s')", diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestTableEncryption.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestTableEncryption.java index a38506d621f9..3b36b7bb0a25 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestTableEncryption.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestTableEncryption.java @@ -56,6 +56,7 @@ import org.apache.iceberg.spark.SparkCatalogConfig; import org.apache.iceberg.types.Types; import org.apache.parquet.crypto.ParquetCryptoRuntimeException; +import org.apache.spark.SparkException; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.TestTemplate; @@ -248,14 +249,28 @@ public void testMetadataTamperproofing() throws IOException { public void testKeyDelete() { assertThatThrownBy( () -> sql("ALTER TABLE %s UNSET TBLPROPERTIES (`encryption.key-id`)", tableName)) - .hasMessageContaining("Cannot remove key in encrypted table"); + .isInstanceOf(SparkException.class) + .hasMessage("Unsupported table change: Cannot remove key ID from an encrypted table"); } @TestTemplate public void testKeyAlter() { assertThatThrownBy( () -> sql("ALTER TABLE %s SET TBLPROPERTIES ('encryption.key-id'='abcd')", tableName)) - .hasMessageContaining("Cannot modify key in encrypted table"); + .isInstanceOf(SparkException.class) + .hasMessage("Unsupported table change: Cannot modify key ID of an encrypted table"); + } + + @TestTemplate + public void testReplaceKeyChange() { + // Replacing a table with a different encryption key is disallowed + 
assertThatThrownBy( + () -> + sql( + "REPLACE TABLE %s (id bigint) USING iceberg TBLPROPERTIES ('encryption.key-id'='%s')", + tableName, UnitestKMS.MASTER_KEY_NAME2)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot modify key ID of an encrypted table"); } @TestTemplate diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestUniqueTableLocation.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestUniqueTableLocation.java new file mode 100644 index 000000000000..c61bb3b0008e --- /dev/null +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestUniqueTableLocation.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.sql; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.UUID; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Table; +import org.apache.iceberg.actions.DeleteOrphanFiles; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.exceptions.NotFoundException; +import org.apache.iceberg.spark.CatalogTestBase; +import org.apache.iceberg.spark.SparkCatalogConfig; +import org.apache.iceberg.spark.actions.SparkActions; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestUniqueTableLocation extends CatalogTestBase { + + private String renamedTableName; + private TableIdentifier renamedIdent; + + @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") + protected static Object[][] parameters() { + return new Object[][] { + { + SparkCatalogConfig.HIVE_WITH_UNIQUE_LOCATION.catalogName(), + SparkCatalogConfig.HIVE_WITH_UNIQUE_LOCATION.implementation(), + SparkCatalogConfig.HIVE_WITH_UNIQUE_LOCATION.properties() + }, + { + SparkCatalogConfig.SPARK_SESSION_WITH_UNIQUE_LOCATION.catalogName(), + SparkCatalogConfig.SPARK_SESSION_WITH_UNIQUE_LOCATION.implementation(), + SparkCatalogConfig.SPARK_SESSION_WITH_UNIQUE_LOCATION.properties() + }, + }; + } + + @BeforeEach + public void initTableName() { + renamedTableName = tableName("table_2"); + renamedIdent = TableIdentifier.of(Namespace.of("default"), "table_2"); + } + + @AfterEach + public void dropTestTable() { + try { + sql("DROP TABLE IF EXISTS %s", tableName); + sql("DROP TABLE IF EXISTS %s", renamedTableName); + } catch (NotFoundException ignore) { + // Swallow FNF exception in case of corrupted table so test 
failure reason is clearer + } + } + + @TestTemplate + public void noCollisionAfterRename() { + assertThat(validationCatalog.tableExists(tableIdent)) + .as("%s should not exist", tableIdent) + .isFalse(); + assertThat(validationCatalog.tableExists(renamedIdent)) + .as("%s should not exist", renamedIdent) + .isFalse(); + + sql("CREATE TABLE %s (id BIGINT NOT NULL, data STRING) USING iceberg", tableName); + + sql("ALTER TABLE %s RENAME TO %s", tableName, renamedTableName); + + sql("CREATE TABLE %s (id BIGINT NOT NULL, data STRING) USING iceberg", tableName); + + Table table = validationCatalog.loadTable(tableIdent); + Table renamedTable = validationCatalog.loadTable(renamedIdent); + + assertThat(table.location()) + .as( + "After rename+recreate, %s and %s must have different locations", + tableName, renamedTableName) + .isNotEqualTo(renamedTable.location()); + } + + @TestTemplate + public void orphanCleanupDoesntCorruptTable() { + SparkActions actions = SparkActions.get(); + + assertThat(validationCatalog.tableExists(tableIdent)) + .as("%s should not exist", tableIdent) + .isFalse(); + assertThat(validationCatalog.tableExists(renamedIdent)) + .as("%s should not exist", renamedIdent) + .isFalse(); + + sql("CREATE TABLE %s (id BIGINT NOT NULL, data STRING) USING iceberg", tableName); + sql("INSERT INTO %s VALUES(0, '%s')", tableName, UUID.randomUUID().toString()); + + sql("ALTER TABLE %s RENAME TO %s", tableName, renamedTableName); + + sql("CREATE TABLE %s (id BIGINT NOT NULL, data STRING) USING iceberg", tableName); + sql("INSERT INTO %s VALUES(1, '%s')", tableName, UUID.randomUUID().toString()); + + Table table = validationCatalog.loadTable(tableIdent); + assertThat(table).as("Should load %s", table).isNotNull(); + + long cutoff = System.currentTimeMillis() + 1; + DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table).olderThan(cutoff).execute(); + assertThat(result.orphanFileLocations()).as("Should not touch any files").isEmpty(); + + 
assertThat(scalarSql("SELECT count(*) FROM %s", renamedTableName)) + .as("Table %s should remain unaffected by %s table cleanup", renamedTableName, tableName) + .isEqualTo(1L); + } +} diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/variant/TestVariantShredding.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/variant/TestVariantShredding.java new file mode 100644 index 000000000000..8cdcf22e5817 --- /dev/null +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/variant/TestVariantShredding.java @@ -0,0 +1,1101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.variant; + +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.parquet.schema.Types.optional; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.math.BigDecimal; +import java.net.InetAddress; +import java.util.List; +import java.util.Map; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.spark.CatalogTestBase; +import org.apache.iceberg.spark.SparkCatalogConfig; +import org.apache.iceberg.spark.SparkSQLProperties; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.variants.Variant; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; +import org.apache.spark.sql.internal.SQLConf; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; + +public class TestVariantShredding extends CatalogTestBase { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", 
Types.IntegerType.get()), + Types.NestedField.optional(2, "address", Types.VariantType.get())); + + private static final Schema SCHEMA2 = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "address", Types.VariantType.get()), + Types.NestedField.optional(3, "metadata", Types.VariantType.get())); + + @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") + protected static Object[][] parameters() { + return new Object[][] { + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties() + }, + }; + } + + @BeforeAll + public static void startMetastoreAndSpark() { + // First call parent to initialize metastore and spark with local[2] + CatalogTestBase.startMetastoreAndSpark(); + + // Now stop and recreate spark with local[1] to write all rows to a single file + if (spark != null) { + spark.stop(); + } + + spark = + SparkSession.builder() + .master("local[1]") // Use one thread to write the rows to a single parquet file + .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .config(DISABLE_UI) + .enableHiveSupport() + .getOrCreate(); + + sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); + } + + @BeforeEach + public void before() { + super.before(); + validationCatalog.createTable( + tableIdent, SCHEMA, null, Map.of(TableProperties.FORMAT_VERSION, "3")); + } + + @AfterEach + public void after() { + spark.conf().unset(SparkSQLProperties.SHRED_VARIANTS); + spark.conf().unset(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE); + validationCatalog.dropTable(tableIdent, true); + } + + @TestTemplate + public void testVariantShreddingDisabled() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "false"); + + String values = "(1, parse_json('{\"city\": \"NYC\", \"zip\": 10001}')), (2, null)"; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType address = variant("address", 2, Type.Repetition.OPTIONAL); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testExcludingNullValue() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"name": "Alice", "age": 30, "dummy": null}')),\ + (2, parse_json('{"name": "Bob", "age": 25}')),\ + (3, parse_json('{"name": "Charlie", "age": 35}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType 
expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testInconsistentType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"age": "25"}')),\ + (2, parse_json('{"age": 30}')),\ + (3, parse_json('{"age": "35"}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + List rows = + sql("SELECT variant_get(address, '$.age', 'int') FROM %s WHERE id = 2", tableName); + assertThat(rows).hasSize(1); + assertThat(rows.get(0)[0]).isEqualTo(30); + } + + @TestTemplate + public void testPrimitiveType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = "(1, parse_json('123')), (2, parse_json('456')), (3, parse_json('789'))"; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType address = + variant( + "address", + 2, + Type.Repetition.REQUIRED, + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(16, true))); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testPrimitiveDecimalType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + "(1, parse_json('123.56')), (2, parse_json('\"abc\"')), (3, parse_json('12.56'))"; + sql("INSERT INTO %s VALUES %s", tableName, values); + + 
GroupType address = + variant( + "address", + 2, + Type.Repetition.REQUIRED, + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.decimalType(2, 5))); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testBooleanType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"active": true}')),\ + (2, parse_json('{"active": false}')),\ + (3, parse_json('{"active": true}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType active = field("active", shreddedPrimitive(PrimitiveType.PrimitiveTypeName.BOOLEAN)); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(active)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testDecimalTypeWithInconsistentScales() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"price": 123.456789}')),\ + (2, parse_json('{"price": 678.90}')),\ + (3, parse_json('{"price": 999.99}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType price = + field( + "price", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.decimalType(6, 9))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(price)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testDecimalTypeWithConsistentScales() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + 
(1, parse_json('{"price": 123.45}')),\ + (2, parse_json('{"price": 678.90}')),\ + (3, parse_json('{"price": 999.99}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType price = + field( + "price", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.decimalType(2, 5))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(price)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testArrayType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('["java", "scala", "python"]')),\ + (2, parse_json('["rust", "go"]')),\ + (3, parse_json('["javascript"]'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType arr = + list( + element( + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType()))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, arr); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testNestedArrayType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"tags": ["java", "scala", "python"]}')),\ + (2, parse_json('{"tags": ["rust", "go"]}')),\ + (3, parse_json('{"tags": ["javascript"]}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType tags = + field( + "tags", + list( + element( + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, + LogicalTypeAnnotation.stringType())))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(tags)); + MessageType expectedSchema = 
parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testNestedObjectType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"location": {"city": "Seattle", "zip": 98101}, "tags": ["java", "scala", "python"]}')),\ + (2, parse_json('{"location": {"city": "Portland", "zip": 97201}}')),\ + (3, parse_json('{"location": {"city": "NYC", "zip": 10001}}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType city = + field( + "city", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType zip = + field( + "zip", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(32, true))); + GroupType location = field("location", objectFields(city, zip)); + GroupType tags = + field( + "tags", + list( + element( + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, + LogicalTypeAnnotation.stringType())))); + + GroupType address = + variant("address", 2, Type.Repetition.REQUIRED, objectFields(location, tags)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testLazyInitializationWithBufferedRows() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "5"); + + String values = + """ + (1, parse_json('{"name": "Alice", "age": 30}')),\ + (2, parse_json('{"name": "Bob", "age": 25}')),\ + (3, parse_json('{"name": "Charlie", "age": 35}')),\ + (4, parse_json('{"name": "David", "age": 28}')),\ + (5, parse_json('{"name": "Eve", "age": 32}')),\ + (6, parse_json('{"name": "Frank", "age": 40}')),\ + (7, parse_json('{"name": "Grace", "age": 
27}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + long rowCount = spark.read().format("iceberg").load(tableName).count(); + assertThat(rowCount).isEqualTo(7); + } + + @TestTemplate + public void testMultipleRowGroups() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + int numRows = 1000; + StringBuilder valuesBuilder = new StringBuilder(); + for (int i = 1; i <= numRows; i++) { + if (i > 1) { + valuesBuilder.append(", "); + } + valuesBuilder.append( + String.format("(%d, parse_json('{\"name\": \"User%d\", \"age\": %d}'))", i, i, 20 + i)); + } + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 1024); + sql("INSERT INTO %s VALUES %s", tableName, valuesBuilder.toString()); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + long rowCount = 
spark.read().format("iceberg").load(tableName).count(); + assertThat(rowCount).isEqualTo(numRows); + } + + @TestTemplate + public void testColumnIndexTruncateLength() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + int customTruncateLength = 10; + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, "parquet.columnindex.truncate.length", customTruncateLength); + + StringBuilder valuesBuilder = new StringBuilder(); + for (int i = 1; i <= 10; i++) { + if (i > 1) { + valuesBuilder.append(", "); + } + String longValue = "A".repeat(20); + valuesBuilder.append( + String.format( + "(%d, parse_json('{\"description\": \"%s\", \"id\": %d}'))", i, longValue, i)); + } + sql("INSERT INTO %s VALUES %s", tableName, valuesBuilder.toString()); + + GroupType description = + field( + "description", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType id = + field( + "id", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = + variant("address", 2, Type.Repetition.REQUIRED, objectFields(description, id)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + long rowCount = spark.read().format("iceberg").load(tableName).count(); + assertThat(rowCount).isEqualTo(10); + } + + @TestTemplate + public void testIntegerFamilyPromotion() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + // Mix of INT8, INT16, INT32, INT64 - should promote to INT64 + String values = + """ + (1, parse_json('{"value": 10}')),\ + (2, parse_json('{"value": 1000}')),\ + (3, parse_json('{"value": 100000}')),\ + (4, parse_json('{"value": 10000000000}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); 
+ + GroupType value = + field( + "value", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT64, LogicalTypeAnnotation.intType(64, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(value)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testDecimalFamilyPromotion() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + // Test that they get promoted to the most capable decimal type observed + String values = + """ + (1, parse_json('{"value": 1.5}')),\ + (2, parse_json('{"value": 123.456789}')),\ + (3, parse_json('{"value": 123456789123456.789}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType value = + field( + "value", + optional(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) + .length(16) + .as(LogicalTypeAnnotation.decimalType(6, 21)) + .named("typed_value")); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(value)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testDataRoundTripWithShredding() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"name": "Alice", "age": 30}')),\ + (2, parse_json('{"name": "Bob", "age": 25}')),\ + (3, parse_json('{"name": "Charlie", "age": 35}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = 
variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + // Verify that we can read the data back correctly + List rows = + sql( + "SELECT id, variant_get(address, '$.name', 'string')," + + " variant_get(address, '$.age', 'int')" + + " FROM %s ORDER BY id", + tableName); + assertThat(rows).hasSize(3); + assertThat(rows.get(0)[0]).isEqualTo(1); + assertThat(rows.get(0)[1]).isEqualTo("Alice"); + assertThat(rows.get(0)[2]).isEqualTo(30); + assertThat(rows.get(1)[0]).isEqualTo(2); + assertThat(rows.get(1)[1]).isEqualTo("Bob"); + assertThat(rows.get(1)[2]).isEqualTo(25); + assertThat(rows.get(2)[0]).isEqualTo(3); + assertThat(rows.get(2)[1]).isEqualTo("Charlie"); + assertThat(rows.get(2)[2]).isEqualTo(35); + } + + @TestTemplate + public void testMultipleVariantsWithShredding() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + // Recreate table with SCHEMA2 (address + metadata variant columns) + validationCatalog.dropTable(tableIdent, true); + validationCatalog.createTable( + tableIdent, SCHEMA2, null, Map.of(TableProperties.FORMAT_VERSION, "3")); + + String values = + """ + (1, parse_json('{"city": "NYC"}'), parse_json('{"source": "web"}')),\ + (2, parse_json('{"city": "LA"}'), parse_json('{"source": "app"}')),\ + (3, parse_json('{"city": "SF"}'), parse_json('{"source": "api"}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType city = + field( + "city", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(city)); + + GroupType source = + field( + "source", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType metadata = variant("metadata", 3, 
Type.Repetition.REQUIRED, objectFields(source)); + MessageType expectedSchema = parquetSchema(address, metadata); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testVariantWithNullValues() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('null')),\ + (2, parse_json('null')),\ + (3, parse_json('null'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType address = variant("address", 2, Type.Repetition.REQUIRED); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testArrayOfNullElementsWithShredding() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + sql( + "INSERT INTO %s VALUES (1, parse_json('[null, null, null]')), " + + "(2, parse_json('[null]'))", + tableName); + + // Array elements are all null, element type is null, falls back to unshredded + GroupType address = variant("address", 2, Type.Repetition.REQUIRED); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testMixedNullAndNonNullVariantValues() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"name": "Alice", "age": 30}')),\ + (2, null),\ + (3, parse_json('{"name": "Charlie", "age": 35}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, 
LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.OPTIONAL, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + long rowCount = spark.read().format("iceberg").load(tableName).count(); + assertThat(rowCount).isEqualTo(3); + } + + @TestTemplate + public void testWriteOptionOverridesSessionConfig() throws IOException, NoSuchTableException { + // Disable shredding at session level + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "false"); + + // Enable shredding via per-write option + String query = + "SELECT 1 as id, parse_json('{\"name\": \"Alice\", \"age\": 30}') as address" + + " UNION ALL SELECT 2, parse_json('{\"name\": \"Bob\", \"age\": 25}')" + + " UNION ALL SELECT 3, parse_json('{\"name\": \"Charlie\", \"age\": 35}')"; + spark.sql(query).writeTo(tableName).option("shred-variants", "true").append(); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testInfrequentFieldPruning() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "11"); + + StringBuilder valuesBuilder = new StringBuilder(); + for (int i = 1; i <= 11; i++) { + if (i > 1) { + valuesBuilder.append(", "); + } + if (i == 1) { + // Only the first row has rare_field + valuesBuilder.append( + 
String.format( + "(%d, parse_json('{\"name\": \"User%d\", \"rare_field\": \"rare\"}'))", i, i)); + } else { + valuesBuilder.append(String.format("(%d, parse_json('{\"name\": \"User%d\"}'))", i, i)); + } + } + sql("INSERT INTO %s VALUES %s", tableName, valuesBuilder.toString()); + + // rare_field appears in 1/11 rows, should be pruned + // name appears in 11/11 rows and should be kept + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testMixedTypeTieBreaking() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "10"); + + StringBuilder valuesBuilder = new StringBuilder(); + for (int i = 1; i <= 10; i++) { + if (i > 1) { + valuesBuilder.append(", "); + } + if (i <= 5) { + valuesBuilder.append(String.format("(%d, parse_json('{\"val\": %d}'))", i, i)); + } else { + valuesBuilder.append(String.format("(%d, parse_json('{\"val\": \"text%d\"}'))", i, i)); + } + } + sql("INSERT INTO %s VALUES %s", tableName, valuesBuilder.toString()); + + // 5 ints + 5 strings is a tie so STRING wins (higher TIE_BREAK_PRIORITY) + GroupType val = + field( + "val", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(val)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + // Verify data round-trips correctly + List rows = + sql("SELECT id, variant_get(address, '$.val', 'string') FROM 
%s ORDER BY id", tableName); + assertThat(rows).hasSize(10); + assertThat(rows.get(0)[1]).isEqualTo("1"); + assertThat(rows.get(5)[1]).isEqualTo("text6"); + } + + @TestTemplate + public void testFieldOnlyAfterBuffer() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + String values = + """ + (1, parse_json('{"name": "Alice"}')),\ + (2, parse_json('{"name": "Bob"}')),\ + (3, parse_json('{"name": "Charlie"}')),\ + (4, parse_json('{"name": "David", "score": 95}')),\ + (5, parse_json('{"name": "Eve", "score": 88}')),\ + (6, parse_json('{"name": "Frank", "score": 72}')),\ + (7, parse_json('{"name": "Grace", "score": 91}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + // Schema is determined from buffer (rows 1-3) which only has "name". + // "score" is not shredded + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + // Verify all data round-trips despite "score" not being shredded + List rows = + sql( + "SELECT id, variant_get(address, '$.name', 'string')," + + " variant_get(address, '$.score', 'int')" + + " FROM %s ORDER BY id", + tableName); + assertThat(rows).hasSize(7); + assertThat(rows.get(0)[1]).isEqualTo("Alice"); + assertThat(rows.get(0)[2]).isNull(); + assertThat(rows.get(3)[1]).isEqualTo("David"); + assertThat(rows.get(3)[2]).isEqualTo(95); + assertThat(rows.get(6)[1]).isEqualTo("Grace"); + assertThat(rows.get(6)[2]).isEqualTo(91); + } + + @TestTemplate + public void testCrossFileDifferentShreddedType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + 
spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + // File 1: "score" is always integer → shredded as INT8 + String batch1 = + """ + (1, parse_json('{"score": 95}')),\ + (2, parse_json('{"score": 88}')),\ + (3, parse_json('{"score": 72}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, batch1); + + // Verify file 1 schema: score shredded as INT8 + Table table = validationCatalog.loadTable(tableIdent); + GroupType scoreInt = + field( + "score", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + MessageType expectedSchema1 = + parquetSchema(variant("address", 2, Type.Repetition.REQUIRED, objectFields(scoreInt))); + verifyParquetSchema(table, expectedSchema1); + + // File 2: "score" is always string → shredded as STRING + String batch2 = + """ + (4, parse_json('{"score": "high"}')),\ + (5, parse_json('{"score": "medium"}')),\ + (6, parse_json('{"score": "low"}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, batch2); + + // Query across both files, reader must handle different shredded types + List rows = + sql("SELECT id, variant_get(address, '$.score', 'string') FROM %s ORDER BY id", tableName); + assertThat(rows).hasSize(6); + assertThat(rows.get(0)[1]).isEqualTo("95"); + assertThat(rows.get(1)[1]).isEqualTo("88"); + assertThat(rows.get(3)[1]).isEqualTo("high"); + assertThat(rows.get(5)[1]).isEqualTo("low"); + } + + @TestTemplate + public void testAllNullVariantColumn() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + sql("INSERT INTO %s VALUES (1, null), (2, null), (3, null)", tableName); + + // All variant values are SQL NULL, so no shredding should occur + Table table = validationCatalog.loadTable(tableIdent); + MessageType expectedSchema = parquetSchema(variant("address", 2, Type.Repetition.OPTIONAL)); + verifyParquetSchema(table, expectedSchema); + + List rows = sql("SELECT id, address FROM %s ORDER BY id", tableName); + 
assertThat(rows).hasSize(3); + assertThat(rows.get(0)[1]).isNull(); + assertThat(rows.get(1)[1]).isNull(); + assertThat(rows.get(2)[1]).isNull(); + } + + @TestTemplate + public void testBufferSizeOne() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "1"); + + sql( + """ + INSERT INTO %s VALUES + (1, parse_json('{"name": "Alice", "age": 30}')), + (2, parse_json('{"name": "Bob", "age": 25}')), + (3, parse_json('{"name": "Charlie", "age": 35}')) + """, + tableName); + + // Schema inferred from first row only, should still shred name and age + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + List rows = + sql("SELECT id, variant_get(address, '$.name', 'string') FROM %s ORDER BY id", tableName); + assertThat(rows).hasSize(3); + assertThat(rows.get(0)[1]).isEqualTo("Alice"); + assertThat(rows.get(2)[1]).isEqualTo("Charlie"); + } + + @TestTemplate + public void testDecimalFallbackAfterBuffer() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + // Buffer: scale=2, 3 integer digits -> DECIMAL(5,2) + // Row 4: precision overflow -> fallback to value field + // Row 5: scale overflow -> fallback to value field + // Row 6: fits typed column, scale widened from 1 to 2 via setScale + String values = + """ + (1, parse_json('{"val": 123.45}')),\ + (2, parse_json('{"val": 678.90}')),\ + (3, 
parse_json('{"val": 999.99}')),\ + (4, parse_json('{"val": 123456.78}')),\ + (5, parse_json('{"val": 1.2345}')),\ + (6, parse_json('{"val": 12.3}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + List rows = + sql( + "SELECT id, variant_get(address, '$.val', 'decimal(10,4)') FROM %s ORDER BY id", + tableName); + assertThat(rows).hasSize(6); + assertThat(rows.get(0)[1]).isEqualTo(new BigDecimal("123.4500")); + assertThat(rows.get(3)[1]).isEqualTo(new BigDecimal("123456.7800")); + assertThat(rows.get(4)[1]).isEqualTo(new BigDecimal("1.2345")); + assertThat(rows.get(5)[1]).isEqualTo(new BigDecimal("12.3000")); + } + + private void verifyParquetSchema(Table table, MessageType expectedSchema) throws IOException { + try (CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks).isNotEmpty(); + + for (FileScanTask task : tasks) { + String path = task.file().location(); + + HadoopInputFile inputFile = HadoopInputFile.fromPath(new Path(path), new Configuration()); + + try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) { + MessageType actualSchema = reader.getFileMetaData().getSchema(); + assertThat(actualSchema).isEqualTo(expectedSchema); + } + } + } + } + + private static MessageType parquetSchema(Type... 
variantTypes) { + return org.apache.parquet.schema.Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.INT32) + .id(1) + .named("id") + .addFields(variantTypes) + .named("table"); + } + + private static GroupType variant(String name, int fieldId, Type.Repetition repetition) { + return org.apache.parquet.schema.Types.buildGroup(repetition) + .id(fieldId) + .as(LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION)) + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .named("metadata") + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .named("value") + .named(name); + } + + private static GroupType variant( + String name, int fieldId, Type.Repetition repetition, Type shreddedType) { + checkShreddedType(shreddedType); + return org.apache.parquet.schema.Types.buildGroup(repetition) + .id(fieldId) + .as(LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION)) + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .named("metadata") + .optional(PrimitiveType.PrimitiveTypeName.BINARY) + .named("value") + .addField(shreddedType) + .named(name); + } + + private static Type shreddedPrimitive(PrimitiveType.PrimitiveTypeName primitive) { + return optional(primitive).named("typed_value"); + } + + private static Type shreddedPrimitive( + PrimitiveType.PrimitiveTypeName primitive, LogicalTypeAnnotation annotation) { + return optional(primitive).as(annotation).named("typed_value"); + } + + private static GroupType objectFields(GroupType... 
fields) { + for (GroupType fieldType : fields) { + checkField(fieldType); + } + + return org.apache.parquet.schema.Types.buildGroup(Type.Repetition.OPTIONAL) + .addFields(fields) + .named("typed_value"); + } + + private static GroupType field(String name, Type shreddedType) { + checkShreddedType(shreddedType); + return org.apache.parquet.schema.Types.buildGroup(Type.Repetition.REQUIRED) + .optional(PrimitiveType.PrimitiveTypeName.BINARY) + .named("value") + .addField(shreddedType) + .named(name); + } + + private static GroupType element(Type shreddedType) { + return field("element", shreddedType); + } + + private static GroupType list(GroupType elementType) { + return org.apache.parquet.schema.Types.optionalList().element(elementType).named("typed_value"); + } + + private static void checkShreddedType(Type shreddedType) { + Preconditions.checkArgument( + shreddedType.getName().equals("typed_value"), + "Invalid shredded type name: %s should be typed_value", + shreddedType.getName()); + Preconditions.checkArgument( + shreddedType.isRepetition(Type.Repetition.OPTIONAL), + "Invalid shredded type repetition: %s should be OPTIONAL", + shreddedType.getRepetition()); + } + + private static void checkField(GroupType fieldType) { + Preconditions.checkArgument( + fieldType.isRepetition(Type.Repetition.REQUIRED), + "Invalid field type repetition: %s should be REQUIRED", + fieldType.getRepetition()); + } +} diff --git a/vortex/src/main/java/org/apache/iceberg/vortex/ConvertFilterToVortex.java b/vortex/src/main/java/org/apache/iceberg/vortex/ConvertFilterToVortex.java index 84833881900f..38933ebd0f04 100644 --- a/vortex/src/main/java/org/apache/iceberg/vortex/ConvertFilterToVortex.java +++ b/vortex/src/main/java/org/apache/iceberg/vortex/ConvertFilterToVortex.java @@ -119,13 +119,11 @@ public Expression or(Expression leftResult, Expression rightResult) { @Override public Expression predicate(BoundPredicate pred) { - if (!(pred.term() instanceof BoundReference)) { + if 
(!(pred.term() instanceof BoundReference term)) { throw new UnsupportedOperationException( "Cannot convert non-reference to Parquet filter: " + pred.term()); } - BoundReference term = (BoundReference) pred.term(); - if (pred.isLiteralPredicate()) { org.apache.iceberg.expressions.Literal icebergLit = pred.asLiteralPredicate().literal(); Literal vortexLit = toVortexLiteral(icebergLit, term.type()); @@ -167,54 +165,59 @@ private org.apache.iceberg.expressions.Expression bind(UnboundPredicate pred) Literal toVortexLiteral(org.apache.iceberg.expressions.Literal literal, Type termType) { switch (termType.typeId()) { - case BOOLEAN: + case BOOLEAN -> { return Literal.bool((Boolean) literal.value()); - case INTEGER: + } + case INTEGER -> { return Literal.int32((Integer) literal.value()); - case LONG: + } + case LONG -> { return Literal.int64((Long) literal.value()); - case FLOAT: + } + case FLOAT -> { return Literal.float32((Float) literal.value()); - case DOUBLE: + } + case DOUBLE -> { return Literal.float64((Double) literal.value()); - case DECIMAL: - Types.DecimalType decimalType = (Types.DecimalType) (termType); + } + case DECIMAL -> { + Types.DecimalType decimalType = (Types.DecimalType) termType; return Literal.decimal( (BigDecimal) literal.value(), decimalType.precision(), decimalType.scale()); - case STRING: - { - CharSequence charSequence = (CharSequence) literal.value(); - if (Objects.isNull(charSequence)) { - return Literal.string(null); - } else { - return Literal.string(charSequence.toString()); - } + } + case STRING -> { + CharSequence charSequence = (CharSequence) literal.value(); + if (Objects.isNull(charSequence)) { + return Literal.string(null); + } else { + return Literal.string(charSequence.toString()); } - case BINARY: - { - ByteBuffer byteBuffer = (ByteBuffer) literal.value(); - if (Objects.isNull(byteBuffer)) { - return Literal.bytes(null); - } else { - byte[] bytes = new byte[byteBuffer.remaining()]; - byteBuffer.get(bytes); - return 
Literal.bytes(bytes); - } + } + case BINARY -> { + ByteBuffer byteBuffer = (ByteBuffer) literal.value(); + if (Objects.isNull(byteBuffer)) { + return Literal.bytes(null); + } else { + byte[] bytes = new byte[byteBuffer.remaining()]; + byteBuffer.get(bytes); + return Literal.bytes(bytes); } - case UUID: - { - UUID uuid = (UUID) literal.value(); - if (Objects.isNull(uuid)) { - return Literal.string(null); - } else { - return Literal.string(uuid.toString()); - } + } + case UUID -> { + UUID uuid = (UUID) literal.value(); + if (Objects.isNull(uuid)) { + return Literal.string(null); + } else { + return Literal.string(uuid.toString()); } - case TIME: + } + case TIME -> { return Literal.timeMicros((Long) literal.value()); - case DATE: + } + case DATE -> { return Literal.dateDays((Integer) literal.value()); - case TIMESTAMP: + } + case TIMESTAMP -> { Types.TimestampType timestampType = (Types.TimestampType) termType; if (timestampType.shouldAdjustToUTC()) { throw new UnsupportedOperationException( @@ -225,8 +228,8 @@ Literal toVortexLiteral(org.apache.iceberg.expressions.Literal literal, Ty // precision. 
return Literal.timestampMicros((Long) literal.value(), Optional.empty()); } - default: - throw new UnsupportedOperationException("Unsupported Literal type: " + termType); + } + default -> throw new UnsupportedOperationException("Unsupported Literal type: " + termType); } } @@ -240,40 +243,31 @@ Expression fromBinaryPredicate( return right; } - switch (op) { - case TRUE: - return ALWAYS_TRUE; - case FALSE: - return ALWAYS_FALSE; - case LT: - return Binary.lt(left, right); - case LT_EQ: - return Binary.ltEq(left, right); - case GT: - return Binary.gt(left, right); - case GT_EQ: - return Binary.gtEq(left, right); - case EQ: - return Binary.eq(left, right); - case NOT_EQ: - return Binary.notEq(left, right); - case AND: - return Binary.and(left, right); - case OR: - return Binary.or(left, right); - default: - return UnconvertibleExpr.INSTANCE; - } + return switch (op) { + case TRUE -> ALWAYS_TRUE; + case FALSE -> ALWAYS_FALSE; + case LT -> Binary.lt(left, right); + case LT_EQ -> Binary.ltEq(left, right); + case GT -> Binary.gt(left, right); + case GT_EQ -> Binary.gtEq(left, right); + case EQ -> Binary.eq(left, right); + case NOT_EQ -> Binary.notEq(left, right); + case AND -> Binary.and(left, right); + case OR -> Binary.or(left, right); + default -> UnconvertibleExpr.INSTANCE; + }; } Expression fromUnaryPredicate( org.apache.iceberg.expressions.Expression.Operation op, Expression child) { switch (op) { - case TRUE: + case TRUE -> { return ALWAYS_TRUE; - case FALSE: + } + case FALSE -> { return ALWAYS_FALSE; - case NOT: + } + case NOT -> { if (child == ALWAYS_TRUE) { return ALWAYS_FALSE; } else if (child == ALWAYS_FALSE) { @@ -281,8 +275,10 @@ Expression fromUnaryPredicate( } else { return Not.of(child); } - default: + } + default -> { return ALWAYS_TRUE; + } } } @@ -296,50 +292,44 @@ Expression fromSetPredicate( .map(value -> (Expression) Binary.eq(term, toVortexValue(value, termType))) .toArray(Expression[]::new); - switch (op) { - case IN: - return 
Binary.or(eqExprs[0], java.util.Arrays.copyOfRange(eqExprs, 1, eqExprs.length)); - case NOT_IN: - return Not.of( - Binary.or(eqExprs[0], java.util.Arrays.copyOfRange(eqExprs, 1, eqExprs.length))); - default: - return UnconvertibleExpr.INSTANCE; - } + return switch (op) { + case IN -> Binary.or(eqExprs[0], java.util.Arrays.copyOfRange(eqExprs, 1, eqExprs.length)); + case NOT_IN -> + Not.of(Binary.or(eqExprs[0], java.util.Arrays.copyOfRange(eqExprs, 1, eqExprs.length))); + default -> UnconvertibleExpr.INSTANCE; + }; } @SuppressWarnings("unchecked") private Literal toVortexValue(T value, Type termType) { - switch (termType.typeId()) { - case BOOLEAN: - return Literal.bool((Boolean) value); - case INTEGER: - return Literal.int32((Integer) value); - case LONG: - return Literal.int64((Long) value); - case FLOAT: - return Literal.float32((Float) value); - case DOUBLE: - return Literal.float64((Double) value); - case DECIMAL: + return switch (termType.typeId()) { + case BOOLEAN -> Literal.bool((Boolean) value); + case INTEGER -> Literal.int32((Integer) value); + case LONG -> Literal.int64((Long) value); + case FLOAT -> Literal.float32((Float) value); + case DOUBLE -> Literal.float64((Double) value); + case DECIMAL -> { Types.DecimalType decimalType = (Types.DecimalType) termType; - return Literal.decimal((BigDecimal) value, decimalType.precision(), decimalType.scale()); - case STRING: + yield Literal.decimal((BigDecimal) value, decimalType.precision(), decimalType.scale()); + } + case STRING -> { CharSequence charSequence = (CharSequence) value; - return Literal.string(charSequence.toString()); - case DATE: - return Literal.dateDays((Integer) value); - case TIME: - return Literal.timeMicros((Long) value); - case TIMESTAMP: + yield Literal.string(charSequence.toString()); + } + case DATE -> Literal.dateDays((Integer) value); + case TIME -> Literal.timeMicros((Long) value); + case TIMESTAMP -> { Types.TimestampType timestampType = (Types.TimestampType) termType; if 
(timestampType.shouldAdjustToUTC()) { throw new UnsupportedOperationException( "Handling of timestamps with timezones not yet supported"); } - return Literal.timestampMicros((Long) value, Optional.empty()); - default: - throw new UnsupportedOperationException("Unsupported type for set predicate: " + termType); - } + yield Literal.timestampMicros((Long) value, Optional.empty()); + } + default -> + throw new UnsupportedOperationException( + "Unsupported type for set predicate: " + termType); + }; } enum UnconvertibleExpr implements Expression { diff --git a/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModel.java b/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModel.java index 3bfccd3a01ce..9835010b1d6f 100644 --- a/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModel.java +++ b/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModel.java @@ -200,7 +200,7 @@ public FileAppender build() throws IOException { return switch (content) { case DATA, EQUALITY_DELETES -> buildAppender(schema); - case POSITION_DELETES -> + case POSITION_DELETES, DATA_MANIFEST, DELETE_MANIFEST -> throw new UnsupportedOperationException( "Position deletes are not yet supported for Vortex format"); }; diff --git a/vortex/src/main/java/org/apache/iceberg/vortex/VortexMetrics.java b/vortex/src/main/java/org/apache/iceberg/vortex/VortexMetrics.java index 875439f8b940..e098e56377eb 100644 --- a/vortex/src/main/java/org/apache/iceberg/vortex/VortexMetrics.java +++ b/vortex/src/main/java/org/apache/iceberg/vortex/VortexMetrics.java @@ -102,8 +102,8 @@ static Metrics buildMetrics( } private static int truncateLength(MetricsModes.MetricsMode mode) { - if (mode instanceof MetricsModes.Truncate) { - return ((MetricsModes.Truncate) mode).length(); + if (mode instanceof MetricsModes.Truncate truncate) { + return truncate.length(); } return Integer.MAX_VALUE; } @@ -113,15 +113,11 @@ private static T truncateLowerBound(Type type, T value, int length) { if (value 
== null) { return null; } - switch (type.typeId()) { - case STRING: - return (T) UnicodeUtil.truncateStringMin((String) value, length); - case BINARY: - case FIXED: - return (T) BinaryUtil.truncateBinaryMin((ByteBuffer) value, length); - default: - return value; - } + return switch (type.typeId()) { + case STRING -> (T) UnicodeUtil.truncateStringMin((String) value, length); + case BINARY, FIXED -> (T) BinaryUtil.truncateBinaryMin((ByteBuffer) value, length); + default -> value; + }; } @SuppressWarnings("unchecked") @@ -129,14 +125,10 @@ private static T truncateUpperBound(Type type, T value, int length) { if (value == null) { return null; } - switch (type.typeId()) { - case STRING: - return (T) UnicodeUtil.truncateStringMax((String) value, length); - case BINARY: - case FIXED: - return (T) BinaryUtil.truncateBinaryMax((ByteBuffer) value, length); - default: - return value; - } + return switch (type.typeId()) { + case STRING -> (T) UnicodeUtil.truncateStringMax((String) value, length); + case BINARY, FIXED -> (T) BinaryUtil.truncateBinaryMax((ByteBuffer) value, length); + default -> value; + }; } } diff --git a/vortex/src/main/java/org/apache/iceberg/vortex/VortexSchemaWithTypeVisitor.java b/vortex/src/main/java/org/apache/iceberg/vortex/VortexSchemaWithTypeVisitor.java index 88bc9c488935..1ad4dfe4edb8 100644 --- a/vortex/src/main/java/org/apache/iceberg/vortex/VortexSchemaWithTypeVisitor.java +++ b/vortex/src/main/java/org/apache/iceberg/vortex/VortexSchemaWithTypeVisitor.java @@ -40,19 +40,17 @@ public static T visit( } public static T visit(Type iType, DType schema, VortexSchemaWithTypeVisitor visitor) { - switch (schema.getVariant()) { - case STRUCT: - return visitStruct(iType != null ? iType.asStructType() : null, schema, visitor); - - case LIST: + return switch (schema.getVariant()) { + case STRUCT -> visitStruct(iType != null ? iType.asStructType() : null, schema, visitor); + case LIST -> { Types.ListType list = iType != null ? 
iType.asListType() : null; - return visitor.list( + yield visitor.list( list, schema, visit(list != null ? list.elementType() : null, schema.getElementType(), visitor)); - default: - return visitor.primitive(iType != null ? iType.asPrimitiveType() : null, schema); - } + } + default -> visitor.primitive(iType != null ? iType.asPrimitiveType() : null, schema); + }; } private static T visitStruct( diff --git a/vortex/src/main/java/org/apache/iceberg/vortex/VortexSchemas.java b/vortex/src/main/java/org/apache/iceberg/vortex/VortexSchemas.java index 605dece4eaa1..d22a2e23855e 100644 --- a/vortex/src/main/java/org/apache/iceberg/vortex/VortexSchemas.java +++ b/vortex/src/main/java/org/apache/iceberg/vortex/VortexSchemas.java @@ -82,46 +82,40 @@ public static DType toDType(Schema icebergSchema) { } private static DType toVortexDType(Type type, boolean nullable) { - switch (type.typeId()) { - case BOOLEAN: - return DType.newBool(nullable); - case INTEGER: - return DType.newInt(nullable); - case LONG: - return DType.newLong(nullable); - case FLOAT: - return DType.newFloat(nullable); - case DOUBLE: - return DType.newDouble(nullable); - case STRING: - return DType.newUtf8(nullable); - case BINARY: - case FIXED: - return DType.newBinary(nullable); - case DECIMAL: + return switch (type.typeId()) { + case BOOLEAN -> DType.newBool(nullable); + case INTEGER -> DType.newInt(nullable); + case LONG -> DType.newLong(nullable); + case FLOAT -> DType.newFloat(nullable); + case DOUBLE -> DType.newDouble(nullable); + case STRING -> DType.newUtf8(nullable); + case BINARY, FIXED -> DType.newBinary(nullable); + case DECIMAL -> { Types.DecimalType decimal = (Types.DecimalType) type; - return DType.newDecimal(decimal.precision(), decimal.scale(), nullable); - case DATE: - return DType.newDate(DType.TimeUnit.DAYS, nullable); - case TIME: - return DType.newTime(DType.TimeUnit.MICROSECONDS, nullable); - case TIMESTAMP: + yield DType.newDecimal(decimal.precision(), decimal.scale(), nullable); + } 
+ case DATE -> DType.newDate(DType.TimeUnit.DAYS, nullable); + case TIME -> DType.newTime(DType.TimeUnit.MICROSECONDS, nullable); + case TIMESTAMP -> { Types.TimestampType ts = (Types.TimestampType) type; - return DType.newTimestamp( + yield DType.newTimestamp( DType.TimeUnit.MICROSECONDS, ts.shouldAdjustToUTC() ? Optional.of("UTC") : Optional.empty(), nullable); - case TIMESTAMP_NANO: + } + case TIMESTAMP_NANO -> { Types.TimestampNanoType tsNano = (Types.TimestampNanoType) type; - return DType.newTimestamp( + yield DType.newTimestamp( DType.TimeUnit.NANOSECONDS, tsNano.shouldAdjustToUTC() ? Optional.of("UTC") : Optional.empty(), nullable); - case LIST: + } + case LIST -> { Types.ListType listType = (Types.ListType) type; DType elementDType = toVortexDType(listType.elementType(), listType.isElementOptional()); - return DType.newList(elementDType, nullable); - case STRUCT: + yield DType.newList(elementDType, nullable); + } + case STRUCT -> { Types.StructType structType = (Types.StructType) type; List fields = structType.fields(); String[] fieldNames = new String[fields.size()]; @@ -131,11 +125,12 @@ private static DType toVortexDType(Type type, boolean nullable) { fieldTypes[i] = toVortexDType(fields.get(i).type(), fields.get(i).isOptional()); } - return DType.newStruct(fieldNames, fieldTypes, nullable); - default: - throw new UnsupportedOperationException( - "Unsupported Iceberg type for Vortex write: " + type); - } + yield DType.newStruct(fieldNames, fieldTypes, nullable); + } + default -> + throw new UnsupportedOperationException( + "Unsupported Iceberg type for Vortex write: " + type); + }; } /** Convert an Iceberg Schema to an Arrow Schema for writing via Arrow IPC. 
*/ @@ -149,57 +144,55 @@ public static org.apache.arrow.vector.types.pojo.Schema toArrowSchema(Schema ice } private static Field toArrowField(String name, Type type, boolean nullable) { - switch (type.typeId()) { - case BOOLEAN: - return new Field(name, new FieldType(nullable, ArrowType.Bool.INSTANCE, null), null); - case INTEGER: - return new Field( - name, new FieldType(nullable, new ArrowType.Int(Integer.SIZE, true), null), null); - case LONG: - return new Field( - name, new FieldType(nullable, new ArrowType.Int(Long.SIZE, true), null), null); - case FLOAT: - return new Field( - name, - new FieldType( - nullable, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null), - null); - case DOUBLE: - return new Field( - name, - new FieldType( - nullable, new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE), null), - null); - case STRING: - return new Field(name, new FieldType(nullable, ArrowType.Utf8.INSTANCE, null), null); - case BINARY: - return new Field(name, new FieldType(nullable, ArrowType.Binary.INSTANCE, null), null); - case FIXED: + return switch (type.typeId()) { + case BOOLEAN -> new Field(name, new FieldType(nullable, ArrowType.Bool.INSTANCE, null), null); + case INTEGER -> + new Field( + name, new FieldType(nullable, new ArrowType.Int(Integer.SIZE, true), null), null); + case LONG -> + new Field(name, new FieldType(nullable, new ArrowType.Int(Long.SIZE, true), null), null); + case FLOAT -> + new Field( + name, + new FieldType( + nullable, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null), + null); + case DOUBLE -> + new Field( + name, + new FieldType( + nullable, new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE), null), + null); + case STRING -> new Field(name, new FieldType(nullable, ArrowType.Utf8.INSTANCE, null), null); + case BINARY -> + new Field(name, new FieldType(nullable, ArrowType.Binary.INSTANCE, null), null); + case FIXED -> { Types.FixedType fixedType = (Types.FixedType) type; - return new Field( + 
yield new Field( name, new FieldType(nullable, new ArrowType.FixedSizeBinary(fixedType.length()), null), null); - case DECIMAL: + } + case DECIMAL -> { Types.DecimalType decimalType = (Types.DecimalType) type; - return new Field( + yield new Field( name, new FieldType( nullable, new ArrowType.Decimal(decimalType.precision(), decimalType.scale(), 128), null), null); - case DATE: - return new Field( - name, new FieldType(nullable, new ArrowType.Date(DateUnit.DAY), null), null); - case TIME: - return new Field( - name, - new FieldType(nullable, new ArrowType.Time(TimeUnit.MICROSECOND, Long.SIZE), null), - null); - case TIMESTAMP: + } + case DATE -> + new Field(name, new FieldType(nullable, new ArrowType.Date(DateUnit.DAY), null), null); + case TIME -> + new Field( + name, + new FieldType(nullable, new ArrowType.Time(TimeUnit.MICROSECOND, Long.SIZE), null), + null); + case TIMESTAMP -> { Types.TimestampType tsType = (Types.TimestampType) type; - return new Field( + yield new Field( name, new FieldType( nullable, @@ -207,9 +200,10 @@ private static Field toArrowField(String name, Type type, boolean nullable) { TimeUnit.MICROSECOND, tsType.shouldAdjustToUTC() ? "UTC" : null), null), null); - case TIMESTAMP_NANO: + } + case TIMESTAMP_NANO -> { Types.TimestampNanoType tsNanoType = (Types.TimestampNanoType) type; - return new Field( + yield new Field( name, new FieldType( nullable, @@ -217,27 +211,30 @@ private static Field toArrowField(String name, Type type, boolean nullable) { TimeUnit.NANOSECOND, tsNanoType.shouldAdjustToUTC() ? 
"UTC" : null), null), null); - case LIST: + } + case LIST -> { Types.ListType listType = (Types.ListType) type; Field elementField = toArrowField("element", listType.elementType(), listType.isElementOptional()); - return new Field( + yield new Field( name, new FieldType(nullable, ArrowType.List.INSTANCE, null), ImmutableList.of(elementField)); - case STRUCT: + } + case STRUCT -> { Types.StructType structType = (Types.StructType) type; ImmutableList.Builder children = ImmutableList.builder(); for (Types.NestedField field : structType.fields()) { children.add(toArrowField(field.name(), field.type(), field.isOptional())); } - return new Field( + yield new Field( name, new FieldType(nullable, ArrowType.Struct.INSTANCE, null), children.build()); - default: - throw new UnsupportedOperationException( - "Unsupported Iceberg type for Arrow conversion: " + type); - } + } + default -> + throw new UnsupportedOperationException( + "Unsupported Iceberg type for Arrow conversion: " + type); + }; } private static Type toIcebergType(DType dataType) {