vortex-data · robert3005 · May 18, 2026 · a10y · Jun 12, 2026 · robert3005
diff --git a/java/vortex-jni/src/main/java/dev/vortex/api/DataSource.java b/java/vortex-jni/src/main/java/dev/vortex/api/DataSource.java
@@ -128,6 +128,57 @@ public OptionalLong asOptional() {
         }
     }
 
+    /**
+     * Returns the total on-storage byte size of the files behind this data source, tagged with how precise the
+     * figure is. This is the Java view of the Rust {@code Option<Precision<u64>>} from {@code DataSource::byte_size}:
+     * {@link ByteSize.Exact} when every file size is known, {@link ByteSize.Estimate} when the total is only a hint
+     * (some files contribute extrapolated sizes), and {@link ByteSize.Unknown} when there is no estimate at all, for
+     * example because the filesystem listing did not return sizes.
+     */
+    public ByteSize byteSize() {
+        long[] result = new long[2];
+        NativeDataSource.byteSize(pointer, result);
+        int precision = (int) result[1];
+        if (precision == 1) {
+            return new ByteSize.Estimate(result[0]);
+        }
+        return precision == 2 ? new ByteSize.Exact(result[0]) : ByteSize.Unknown.INSTANCE;
+    }
+
+    /** Precision-aware byte size returned by {@link #byteSize()}: unknown, estimated, or exact. */
+    public sealed interface ByteSize {
+        /** Returns the size in bytes, or {@code OptionalLong.empty()} when unknown; the precision is not conveyed. */
+        OptionalLong asOptional();
+
+        /** No byte size is available. Singleton: use {@link Unknown#INSTANCE}. */
+        final class Unknown implements ByteSize {
+            public static final Unknown INSTANCE = new Unknown();
+
+            private Unknown() {}
+
+            @Override
+            public OptionalLong asOptional() {
+                return OptionalLong.empty();
+            }
+        }
+
+        /** Inexact size hint in bytes; the actual on-storage total may differ. */
+        record Estimate(long value) implements ByteSize {
+            @Override
+            public OptionalLong asOptional() {
+                return OptionalLong.of(value);
+            }
+        }
+
+        /** Size in bytes that is known exactly. */
+        record Exact(long value) implements ByteSize {
+            @Override
+            public OptionalLong asOptional() {
+                return OptionalLong.of(value);
+            }
+        }
+    }
+
     /** Submit a scan. */
     public Scan scan(ScanOptions options) {
         Objects.requireNonNull(options, "options");

diff --git a/java/vortex-jni/src/main/java/dev/vortex/jni/NativeDataSource.java b/java/vortex-jni/src/main/java/dev/vortex/jni/NativeDataSource.java
@@ -33,4 +33,10 @@ private NativeDataSource() {}
      * {@code 1=estimate}, {@code 2=exact}.
      */
     public static native void rowCount(long pointer, long[] out);
+
+    /**
+     * Writes {@code [bytes, precision]} into {@code out} (callers pass a two-element array): bytes is the summed
+     * on-storage file size of the data source, precision is {@code 0=unknown}, {@code 1=estimate} or {@code 2=exact}.
+     */
+    public static native void byteSize(long pointer, long[] out);
 }
diff --git a/java/vortex-jni/src/test/java/dev/vortex/api/TestMinimal.java b/java/vortex-jni/src/test/java/dev/vortex/api/TestMinimal.java
@@ -11,6 +11,7 @@
 import dev.vortex.jni.NativeLoader;
 import java.io.IOException;
 import java.math.BigDecimal;
+import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -132,6 +133,7 @@ public void testFullScan() throws Exception {
         DataSource ds = DataSource.open(session, writePath);
 
         assertEquals(new DataSource.RowCount.Exact(10L), ds.rowCount());
+        assertEquals(new DataSource.ByteSize.Exact(Files.size(tempDir.resolve("minimal.vortex"))), ds.byteSize());
 
         var schema = ds.arrowSchema(allocator);
         assertEquals(

diff --git a/java/vortex-spark/src/main/java/dev/vortex/spark/read/VortexBatchExec.java b/java/vortex-spark/src/main/java/dev/vortex/spark/read/VortexBatchExec.java
@@ -4,6 +4,7 @@
 package dev.vortex.spark.read;
 
 import com.google.common.collect.ImmutableMap;
+import dev.vortex.api.Session;
 import dev.vortex.jni.NativeFiles;
 import dev.vortex.spark.VortexFilePartition;
 import dev.vortex.spark.VortexSparkSession;
@@ -76,14 +77,19 @@ public PartitionReaderFactory createReaderFactory() {
     }
 
     private List<String> resolvePaths() {
-        var session = VortexSparkSession.get(formatOptions);
+        return resolveVortexPaths(VortexSparkSession.get(formatOptions), paths, formatOptions);
+    }
+
+    /**
+     * Resolves {@code paths} to a flat list of files: an entry ending in {@code .vortex} is kept as-is, any other
+     * entry is expanded through {@code NativeFiles.listFiles}. Package-private so {@link VortexScan} can reuse it when
+     * estimating statistics, so that planning and execution resolve paths identically.
+     */
+    static List<String> resolveVortexPaths(Session session, List<String> paths, Map<String, String> formatOptions) {
         return paths.stream()
-                .flatMap(path -> {
-                    if (path.endsWith(".vortex")) {
-                        return Stream.of(path);
-                    }
-                    return NativeFiles.listFiles(session, path, formatOptions).stream();
-                })
+                .flatMap(path -> path.endsWith(".vortex")
+                        ? Stream.of(path)
+                        : NativeFiles.listFiles(session, path, formatOptions).stream())
                 .collect(Collectors.toList());
     }
 

diff --git a/java/vortex-spark/src/main/java/dev/vortex/spark/read/VortexScan.java b/java/vortex-spark/src/main/java/dev/vortex/spark/read/VortexScan.java
@@ -3,38 +3,62 @@
 
 package dev.vortex.spark.read;
 
+import dev.vortex.api.DataSource;
+import dev.vortex.api.Session;
+import dev.vortex.spark.VortexSparkSession;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
+import java.util.OptionalLong;
 import org.apache.spark.sql.connector.catalog.CatalogV2Util;
 import org.apache.spark.sql.connector.catalog.Column;
+import org.apache.spark.sql.connector.expressions.NamedReference;
 import org.apache.spark.sql.connector.expressions.filter.Predicate;
 import org.apache.spark.sql.connector.read.Batch;
 import org.apache.spark.sql.connector.read.Scan;
+import org.apache.spark.sql.connector.read.Statistics;
+import org.apache.spark.sql.connector.read.SupportsReportStatistics;
+import org.apache.spark.sql.connector.read.colstats.ColumnStatistics;
+import org.apache.spark.sql.internal.SQLConf;
 import org.apache.spark.sql.types.StructType;
 
-/** Spark V2 {@link Scan} over a table of Vortex files. */
-public final class VortexScan implements Scan {
+/**
+ * Spark V2 {@link Scan} over a table of Vortex files.
+ *
+ * <p>Implements {@link SupportsReportStatistics} to surface both the row count Vortex records in each file footer and a
+ * Spark scan-size estimate. The byte estimate starts from the on-storage file sizes collected by
+ * {@code MultiFileDataSource}, then follows Spark's file scan convention by applying the SQL file-compression factor
+ * and scaling by the pushed read schema's default size relative to the full table schema's default size. When the
+ * listing did not return a size for one or more files the file-byte total is extrapolated before Spark scaling is
+ * applied.
+ */
+public final class VortexScan implements Scan, SupportsReportStatistics {
 
     private final List<String> paths;
+    private final List<Column> tableColumns;
     private final List<Column> readColumns;
     private final Map<String, String> formatOptions;
     private final Predicate[] pushedPredicates;
 
+    private volatile Statistics cachedStatistics;
+
     /**
      * Creates a new VortexScan for the specified file paths and columns. The caller is responsible for passing
      * immutable collections; the constructor does not copy.
      *
      * @param paths the list of Vortex file paths to scan
+     * @param tableColumns the full table columns before projection pushdown
      * @param readColumns the list of columns to read from the files
      * @param pushedPredicates predicates pushed down by Spark; {@code null} or empty means no pushdown
      */
     public VortexScan(
             List<String> paths,
+            List<Column> tableColumns,
             List<Column> readColumns,
-            Map<String, String> formatOptions,
-            Predicate[] pushedPredicates) {
+            Predicate[] pushedPredicates,
+            Map<String, String> formatOptions) {
         this.paths = paths;
+        this.tableColumns = tableColumns;
         this.readColumns = readColumns;
         this.formatOptions = formatOptions;
         this.pushedPredicates = pushedPredicates == null ? new Predicate[0] : pushedPredicates.clone();
@@ -83,4 +107,70 @@ public Batch toBatch() {
     public ColumnarSupportMode columnarSupportMode() {
         return ColumnarSupportMode.SUPPORTED;
     }
+
+    /**
+     * Reports row-count and size statistics for this scan to Spark.
+     *
+     * <p>Statistics are computed once, on the first call, by opening the Vortex {@link DataSource}, and are cached
+     * afterwards. The row count is the data source's: the sum of file-footer row counts, extrapolated from the first
+     * opened file when other files are deferred. {@link Statistics#sizeInBytes()} starts from the per-file sizes in
+     * the filesystem listing (extrapolated when some file has no size), scaled by Spark's compression factor and by
+     * the read/table schema size ratio. When no file size is known it stays empty so Spark uses its own default.
+     *
+     * @return statistics carrying the row-count and Spark scan-size estimates
+     */
+    @Override
+    public Statistics estimateStatistics() {
+        Statistics stats = cachedStatistics;
+        if (stats == null) {
+            synchronized (this) {
+                stats = cachedStatistics;
+                if (stats == null) {
+                    stats = computeStatistics();
+                    cachedStatistics = stats;
+                }
+            }
+        }
+        return stats;
+    }
+
+    /** Resolves the scan's paths, opens them as one {@link DataSource} and derives row-count and size statistics. */
+    private Statistics computeStatistics() {
+        Session vortexSession = VortexSparkSession.get(formatOptions);
+        List<String> files = VortexBatchExec.resolveVortexPaths(vortexSession, paths, formatOptions);
+        if (files.isEmpty()) {
+            return new VortexStatistics(OptionalLong.empty(), OptionalLong.empty());
+        }
+        DataSource dataSource = DataSource.open(vortexSession, files, formatOptions);
+        OptionalLong rowCount = dataSource.rowCount().asOptional();
+        OptionalLong sizeInBytes = scaleSizeInBytes(dataSource.byteSize().asOptional());
+        return new VortexStatistics(rowCount, sizeInBytes);
+    }
+
+    /** Scales file bytes by Spark's compression factor and the read/table schema size ratio; empty stays empty. */
+    private OptionalLong scaleSizeInBytes(OptionalLong fileBytes) {
+        if (!fileBytes.isPresent()) {
+            return OptionalLong.empty();
+        }
+        StructType fullSchema = CatalogV2Util.v2ColumnsToStructType(tableColumns.toArray(new Column[0]));
+        StructType projected = readSchema();
+        int fullRowSize = fullSchema.defaultSize();
+        if (fullRowSize <= 0) {
+            // No usable table schema size to scale against: report the raw file bytes.
+            return fileBytes;
+        }
+        double compressionFactor = SQLConf.get().fileCompressionFactor();
+        long onStorageBytes = fileBytes.getAsLong();
+        // Evaluated left to right: (factor * bytes / tableSize) * readSize.
+        double estimate = compressionFactor * onStorageBytes / fullRowSize * projected.defaultSize();
+        return OptionalLong.of((long) estimate);
+    }
+
+    /** Immutable {@link Statistics} value; the record accessors supply {@code numRows()} and {@code sizeInBytes()}. */
+    private record VortexStatistics(OptionalLong numRows, OptionalLong sizeInBytes) implements Statistics {
+        @Override
+        public Map<NamedReference, ColumnStatistics> columnStats() {
+            return Map.of(); // no per-column statistics are reported
+        }
+    }
 }
diff --git a/java/vortex-spark/src/main/java/dev/vortex/spark/read/VortexScanBuilder.java b/java/vortex-spark/src/main/java/dev/vortex/spark/read/VortexScanBuilder.java
@@ -31,7 +31,8 @@
 public final class VortexScanBuilder
         implements ScanBuilder, SupportsPushDownRequiredColumns, SupportsPushDownV2Filters {
     private final ImmutableList.Builder<String> paths;
-    private final List<Column> columns;
+    private final List<Column> tableColumns;
+    private final List<Column> readColumns;
     private final Map<String, String> formatOptions;
     private final Set<String> partitionColumnNames;
     private Predicate[] pushedPredicates = new Predicate[0];
@@ -48,10 +49,11 @@ public VortexScanBuilder(Map<String, String> formatOptions) {
      */
     public VortexScanBuilder(Map<String, String> formatOptions, Transform[] partitionTransforms) {
         this.paths = ImmutableList.builder();
-        this.columns = new ArrayList<>();
         Map<String, String> options = Maps.newHashMap();
         options.put("vortex.workerThreads", "4");
         options.putAll(formatOptions);
+        this.tableColumns = new ArrayList<>();
+        this.readColumns = new ArrayList<>();
         this.formatOptions = options;
         this.partitionColumnNames = collectPartitionColumnNames(partitionTransforms);
     }
@@ -74,7 +76,8 @@ public VortexScanBuilder addPath(String path) {
      * @return this builder for method chaining
      */
     public VortexScanBuilder addColumn(Column column) {
-        this.columns.add(column);
+        this.tableColumns.add(column);
+        this.readColumns.add(column);
         return this;
     }
 
@@ -97,7 +100,7 @@ public VortexScanBuilder addAllPaths(Iterable<String> paths) {
      */
     public VortexScanBuilder addAllColumns(Iterable<Column> columns) {
         for (Column column : columns) {
-            this.columns.add(column);
+            addColumn(column);
         }
         return this;
     }
@@ -116,7 +119,12 @@ public Scan build() {
         // Allow empty columns for operations like count() that don't need actual column data
         // If no columns are specified, we'll read the minimal schema needed
 
-        return new VortexScan(paths, List.copyOf(this.columns), this.formatOptions, pushedPredicates);
+        return new VortexScan(
+                paths,
+                List.copyOf(this.tableColumns),
+                List.copyOf(this.readColumns),
+                pushedPredicates,
+                this.formatOptions);
     }
 
     /**
@@ -129,8 +137,8 @@ public Scan build() {
      */
     @Override
     public void pruneColumns(StructType requiredSchema) {
-        columns.clear();
-        columns.addAll(Arrays.asList(CatalogV2Util.structTypeToV2Columns(requiredSchema)));
+        readColumns.clear();
+        readColumns.addAll(Arrays.asList(CatalogV2Util.structTypeToV2Columns(requiredSchema)));
     }
 
     /**
@@ -145,7 +153,7 @@ public void pruneColumns(StructType requiredSchema) {
     @Override
     public Predicate[] pushPredicates(Predicate[] predicates) {
         Map<String, DataType> dataColumnTypes = new HashMap<>();
-        for (Column column : columns) {
+        for (Column column : readColumns) {
             if (!partitionColumnNames.contains(column.name())) {
                 dataColumnTypes.put(column.name(), column.dataType());
             }