Commit 52b2070

feat: add dictionary_columns to scan API for memory-efficient string reads
1 parent 1a54e9c commit 52b2070

4 files changed

Lines changed: 325 additions & 53 deletions
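
The memory win in the commit title comes from Arrow's dictionary encoding: each distinct string is stored once and rows hold small integer indices into it. A quick standalone sketch with synthetic data (sizes are illustrative, not from this commit):

import pyarrow as pa

# 100_000 rows drawn from 100 distinct ~1 KB strings. Plain storage repeats
# every value; dictionary storage keeps each distinct value once plus int32 indices.
values = [f"blob-{i:03d}" * 128 for i in range(100)]
plain = pa.array([values[i % 100] for i in range(100_000)])
encoded = plain.dictionary_encode()
print(f"plain:      {plain.nbytes / 1e6:.1f} MB")   # ~103 MB
print(f"dictionary: {encoded.nbytes / 1e6:.1f} MB")  # ~0.5 MB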

pyiceberg/io/pyarrow.py

Lines changed: 20 additions & 1 deletion
@@ -1614,8 +1614,13 @@ def _task_to_record_batches(
     partition_spec: PartitionSpec | None = None,
     format_version: TableVersion = TableProperties.DEFAULT_FORMAT_VERSION,
     downcast_ns_timestamp_to_us: bool | None = None,
+    dictionary_columns: tuple[str, ...] | None = None,
 ) -> Iterator[pa.RecordBatch]:
-    arrow_format = _get_file_format(task.file.file_format, pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))
+    # Only pass dictionary_columns for Parquet — ORC does not support this kwarg.
+    format_kwargs: dict[str, Any] = {"pre_buffer": True, "buffer_size": ONE_MEGABYTE * 8}
+    if dictionary_columns and task.file.file_format == FileFormat.PARQUET:
+        format_kwargs["dictionary_columns"] = dictionary_columns
+    arrow_format = _get_file_format(task.file.file_format, **format_kwargs)
     with io.new_input(task.file.file_path).open() as fin:
         fragment = arrow_format.make_fragment(fin)
         physical_schema = fragment.physical_schema
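
The guard above relies on PyArrow routing ParquetFileFormat keyword arguments into its read options: dictionary_columns lands in ParquetReadOptions and makes the listed columns come back as DictionaryArray. A minimal standalone sketch of that behavior (the /tmp path and column name are illustrative):

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

# Write a tiny Parquet file with a repetitive string column.
pq.write_table(pa.table({"payload": ["a", "b", "a", "a"]}), "/tmp/demo.parquet")

# ParquetFileFormat forwards dictionary_columns to ParquetReadOptions, so the
# listed columns are materialized dictionary-encoded. ORC's format class has
# no such kwarg, hence the FileFormat.PARQUET check above.
fmt = ds.ParquetFileFormat(dictionary_columns=("payload",))
table = ds.dataset("/tmp/demo.parquet", format=fmt).to_table()
assert pa.types.is_dictionary(table.schema.field("payload").type)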
@@ -1718,6 +1723,7 @@ class ArrowScan:
     _case_sensitive: bool
     _limit: int | None
     _downcast_ns_timestamp_to_us: bool | None
+    _dictionary_columns: tuple[str, ...] | None
     """Scan the Iceberg Table and create an Arrow construct.

     Attributes:
@@ -1737,6 +1743,8 @@ def __init__(
         row_filter: BooleanExpression,
         case_sensitive: bool = True,
         limit: int | None = None,
+        *,
+        dictionary_columns: tuple[str, ...] | None = None,
     ) -> None:
         self._table_metadata = table_metadata
         self._io = io
@@ -1745,6 +1753,7 @@ def __init__(
         self._case_sensitive = case_sensitive
         self._limit = limit
         self._downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE)
+        self._dictionary_columns = dictionary_columns

     @property
     def _projected_field_ids(self) -> set[int]:
@@ -1773,6 +1782,15 @@ def to_table(self, tasks: Iterable[FileScanTask]) -> pa.Table:
             ValueError: When a field type in the file cannot be projected to the schema type
         """
         arrow_schema = schema_to_pyarrow(self._projected_schema, include_field_ids=False)
+        if self._dictionary_columns:
+            dict_cols_set = set(self._dictionary_columns)
+            arrow_schema = pa.schema(
+                [
+                    field.with_type(pa.dictionary(pa.int32(), field.type)) if field.name in dict_cols_set else field
+                    for field in arrow_schema
+                ],
+                metadata=arrow_schema.metadata,
+            )

         batches = self.to_record_batches(tasks)
         try:
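
The schema rewrite above, in isolation: fields named in dictionary_columns are wrapped in pa.dictionary(pa.int32(), ...) so the result schema agrees with what the Parquet reader now produces. A small sketch with assumed field names:

import pyarrow as pa

schema = pa.schema([pa.field("id", pa.int64()), pa.field("payload", pa.string())])
dict_cols_set = {"payload"}
rebuilt = pa.schema(
    [f.with_type(pa.dictionary(pa.int32(), f.type)) if f.name in dict_cols_set else f for f in schema],
    metadata=schema.metadata,
)
print(rebuilt.field("payload").type)  # dictionary<values=string, indices=int32, ordered=0>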
@@ -1855,6 +1873,7 @@ def _record_batches_from_scan_tasks_and_deletes(
                 self._table_metadata.specs().get(task.file.spec_id),
                 self._table_metadata.format_version,
                 self._downcast_ns_timestamp_to_us,
+                dictionary_columns=self._dictionary_columns,
             )
             for batch in batches:
                 if self._limit is not None:

pyiceberg/table/__init__.py

Lines changed: 79 additions & 7 deletions
@@ -1121,6 +1121,7 @@ def scan(
         snapshot_id: int | None = None,
         options: Properties = EMPTY_DICT,
         limit: int | None = None,
+        dictionary_columns: tuple[str, ...] | None = None,
     ) -> DataScan:
         """Fetch a DataScan based on the table's current metadata.

@@ -1147,6 +1148,13 @@ def scan(
                 An integer representing the number of rows to
                 return in the scan result. If None, fetches all
                 matching rows.
+            dictionary_columns:
+                A tuple of column names that PyArrow should read as
+                dictionary-encoded (DictionaryArray). Reduces memory
+                usage for columns with large or repeated string values
+                (e.g. large JSON blobs). Only applies to Parquet files;
+                silently ignored for ORC. Columns absent from the file
+                are silently skipped. Default is None (no dictionary encoding).

         Returns:
             A DataScan based on the table's current metadata.
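
Based on the signature above, usage would look like this (catalog, table, and column names are hypothetical):

from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")
tbl = catalog.load_table("logs.events")

# Read the large "payload" column dictionary-encoded to cut scan memory.
arrow_table = tbl.scan(
    selected_fields=("event_id", "payload"),
    dictionary_columns=("payload",),
).to_arrow()
print(arrow_table.schema.field("payload").type)  # a dictionary type, e.g. dictionary<values=string, indices=int32>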
@@ -1162,6 +1170,7 @@ def scan(
             limit=limit,
             catalog=self.catalog,
             table_identifier=self._identifier,
+            dictionary_columns=dictionary_columns,
         )

     @property
@@ -1664,6 +1673,7 @@ def scan(
         snapshot_id: int | None = None,
         options: Properties = EMPTY_DICT,
         limit: int | None = None,
+        dictionary_columns: tuple[str, ...] | None = None,
     ) -> DataScan:
         raise ValueError("Cannot scan a staged table")

@@ -1749,16 +1759,20 @@ def projection(self) -> Schema:
         return current_schema.select(*self.selected_fields, case_sensitive=self.case_sensitive)

     @abstractmethod
-    def plan_files(self) -> Iterable[ScanTask]: ...
+    def plan_files(self) -> Iterable[ScanTask]:
+        ...

     @abstractmethod
-    def to_arrow(self) -> pa.Table: ...
+    def to_arrow(self) -> pa.Table:
+        ...

     @abstractmethod
-    def to_pandas(self, **kwargs: Any) -> pd.DataFrame: ...
+    def to_pandas(self, **kwargs: Any) -> pd.DataFrame:
+        ...

     @abstractmethod
-    def to_polars(self) -> pl.DataFrame: ...
+    def to_polars(self) -> pl.DataFrame:
+        ...

     def update(self: S, **overrides: Any) -> S:
         """Create a copy of this table scan with updated fields."""
@@ -1791,7 +1805,8 @@ def with_case_sensitive(self: S, case_sensitive: bool = True) -> S:
         return self.update(case_sensitive=case_sensitive)

     @abstractmethod
-    def count(self) -> int: ...
+    def count(self) -> int:
+        ...


 class ScanTask:
@@ -1916,6 +1931,36 @@ def _min_sequence_number(manifests: list[ManifestFile]) -> int:


 class DataScan(TableScan):
+    dictionary_columns: tuple[str, ...] | None
+
+    def __init__(
+        self,
+        table_metadata: TableMetadata,
+        io: FileIO,
+        row_filter: str | BooleanExpression = ALWAYS_TRUE,
+        selected_fields: tuple[str, ...] = ("*",),
+        case_sensitive: bool = True,
+        snapshot_id: int | None = None,
+        options: Properties = EMPTY_DICT,
+        limit: int | None = None,
+        catalog: Catalog | None = None,
+        table_identifier: Identifier | None = None,
+        dictionary_columns: tuple[str, ...] | None = None,
+    ) -> None:
+        super().__init__(
+            table_metadata=table_metadata,
+            io=io,
+            row_filter=row_filter,
+            selected_fields=selected_fields,
+            case_sensitive=case_sensitive,
+            snapshot_id=snapshot_id,
+            options=options,
+            limit=limit,
+            catalog=catalog,
+            table_identifier=table_identifier,
+        )
+        self.dictionary_columns = dictionary_columns
+
     def _build_partition_projection(self, spec_id: int) -> BooleanExpression:
         project = inclusive_projection(self.table_metadata.schema(), self.table_metadata.specs()[spec_id], self.case_sensitive)
         return project(self.row_filter)
@@ -2113,7 +2158,13 @@ def to_arrow(self) -> pa.Table:
         from pyiceberg.io.pyarrow import ArrowScan

         return ArrowScan(
-            self.table_metadata, self.io, self.projection(), self.row_filter, self.case_sensitive, self.limit
+            self.table_metadata,
+            self.io,
+            self.projection(),
+            self.row_filter,
+            self.case_sensitive,
+            self.limit,
+            dictionary_columns=self.dictionary_columns,
         ).to_table(self.plan_files())

     def to_arrow_batch_reader(self) -> pa.RecordBatchReader:
@@ -2132,8 +2183,29 @@ def to_arrow_batch_reader(self) -> pa.RecordBatchReader:
         from pyiceberg.io.pyarrow import ArrowScan, schema_to_pyarrow

         target_schema = schema_to_pyarrow(self.projection())
+
+        # When dictionary_columns is set, PyArrow returns DictionaryArray for those columns.
+        # target_schema uses plain string types, so .cast(target_schema) would silently decode
+        # them back to plain strings. Rebuild target_schema with dictionary types for the listed
+        # columns so from_batches and cast both preserve the encoding.
+        if self.dictionary_columns:
+            dict_cols_set = set(self.dictionary_columns)
+            target_schema = pa.schema(
+                [
+                    field.with_type(pa.dictionary(pa.int32(), field.type)) if field.name in dict_cols_set else field
+                    for field in target_schema
+                ],
+                metadata=target_schema.metadata,
+            )
+
         batches = ArrowScan(
-            self.table_metadata, self.io, self.projection(), self.row_filter, self.case_sensitive, self.limit
+            self.table_metadata,
+            self.io,
+            self.projection(),
+            self.row_filter,
+            self.case_sensitive,
+            self.limit,
+            dictionary_columns=self.dictionary_columns,
         ).to_record_batches(self.plan_files())

         return pa.RecordBatchReader.from_batches(
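
The comment in the hunk above is easy to verify in isolation: casting to a schema with plain string types silently decodes dictionary columns, while casting to a matching dictionary type preserves them. A short sketch:

import pyarrow as pa

dict_col = pa.array(["a", "b", "a", "a"]).dictionary_encode()
table = pa.table({"payload": dict_col})

# Cast to a plain-string schema: the dictionary encoding is silently dropped.
plain_schema = pa.schema([pa.field("payload", pa.string())])
assert table.cast(plain_schema).schema.field("payload").type == pa.string()

# Cast to a dictionary-typed schema: the encoding survives.
dict_schema = pa.schema([pa.field("payload", pa.dictionary(pa.int32(), pa.string()))])
assert pa.types.is_dictionary(table.cast(dict_schema).schema.field("payload").type)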
