From b22fbc8f72408fc45f9648bad76d8ba47df8b664 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Apr 2026 15:55:00 +0200 Subject: [PATCH 1/8] Parquet IO: also use zoneinfo timezones by default even when pyarrow uses pytz --- pandas/core/frame.py | 12 +++++ pandas/core/generic.py | 2 +- pandas/core/internals/managers.py | 8 ++++ pandas/io/_util.py | 74 ++++++++++++++++++++++++++++++- 4 files changed, 94 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6133fba8cf8ec..9dd24df0e20fe 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5477,6 +5477,18 @@ def predicate(arr: ArrayLike) -> bool: mgr = self._mgr._get_data_subset(predicate).copy(deep=False) return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) + def _select_dtypes_indices(self, dtype_class) -> np.ndarray: + """ + Return the indices of the columns of a given dtype. + + Currently only works given a class, so mostly useful for ExtensionDtypes. + """ + + def predicate(arr: ArrayLike) -> bool: + return isinstance(arr.dtype, dtype_class) + + return self._mgr._get_data_subset_indices(predicate) + def insert( self, loc: int, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 00a337f075f05..a9c00b73faf97 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7127,7 +7127,7 @@ def fillna( if axis == 1: # Check that all columns in result have the same dtype # otherwise don't bother with fillna and losing accurate dtypes - unique_dtypes = algos.unique(self._mgr.get_dtypes()) + unique_dtypes = self._mgr.get_unique_dtypes() if len(unique_dtypes) > 1: raise ValueError( "All columns must have the same dtype, but got dtypes: " diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e82f2fb043d0d..771134f2081b4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -336,6 +336,9 @@ def references_same_values(self, mgr: BaseBlockManager, 
blkno: int) -> bool: blk = self.blocks[blkno] return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks) + def get_unique_dtypes(self) -> npt.NDArray[np.object_]: + return algos.unique([blk.dtype for blk in self.blocks]) + def get_dtypes(self) -> npt.NDArray[np.object_]: dtypes = np.array([blk.dtype for blk in self.blocks], dtype=object) return dtypes.take(self.blknos) @@ -656,6 +659,11 @@ def _get_data_subset(self, predicate: Callable) -> Self: blocks = [blk for blk in self.blocks if predicate(blk.values)] return self._combine(blocks) + def _get_data_subset_indices(self, predicate: Callable) -> np.ndarray: + blocks = [blk for blk in self.blocks if predicate(blk.values)] + indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) + return indexer + def get_bool_data(self) -> Self: """ Select blocks that are bool-dtype and columns from object-dtype blocks diff --git a/pandas/io/_util.py b/pandas/io/_util.py index da9ac3913cbbd..72a50b1e25ce7 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -1,5 +1,6 @@ from __future__ import annotations +import datetime as dt from typing import ( TYPE_CHECKING, Literal, @@ -10,6 +11,7 @@ from pandas._config import using_string_dtype from pandas._libs import lib +from pandas._libs.tslibs import timezones from pandas.compat import ( pa_version_under18p0, pa_version_under19p0, @@ -35,6 +37,9 @@ ) +pytz = import_optional_dependency("pytz", errors="ignore") + + def _arrow_dtype_mapping() -> dict: pa = import_optional_dependency("pyarrow") return { @@ -120,7 +125,9 @@ def arrow_table_to_pandas( raise NotImplementedError df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs) - return _post_convert_dtypes(df, dtype_backend, dtype, names) + df = _post_convert_dtypes(df, dtype_backend, dtype, names) + df = _normalize_timezone_dtypes(df) + return df def _post_convert_dtypes( @@ -189,3 +196,68 @@ def _post_convert_dtypes( df[col] = df[col].astype(cat_dtype) return df + + +def 
_normalize_pytz_timezone(tz: dt.tzinfo) -> dt.tzinfo: + """ + If the input tz is a pytz timezone, attempt to convert it to "default" + tzinfo object (zoneinfo or datetime.timezone). + """ + if not type(tz).__module__.startswith("pytz"): + # isinstance(col.dtype.tz, pytz.BaseTzInfo) does not included + # fixed offsets + return tz + + if timezones.is_utc(tz): + return timezones.maybe_get_tz("UTC") + + if timezones.is_fixed_offset(tz): + # Convert pytz fixed offset to datetime.timezone + try: + offset = tz.utcoffset(None) + if offset is not None: + return dt.timezone(offset) + except Exception: + pass + + zone = timezones.get_timezone(tz) + if isinstance(zone, str): + try: + return timezones.maybe_get_tz(zone) + except Exception: + # some pytz timezones might not be available for zoneinfo + pass + + return tz + + +def _normalize_timezone_index(index: pd.Index) -> pd.Index: + if isinstance(index, pd.MultiIndex): + levels = [_normalize_timezone_index(level) for level in index.levels] + return index.set_levels(levels) + + if isinstance(index.dtype, pd.DatetimeTZDtype): + normalized_tz = _normalize_pytz_timezone(index.dtype.tz) + if normalized_tz is not index.dtype.tz: + return index.tz_convert(normalized_tz) + + return index + + +def _normalize_timezone_dtypes(df: pd.DataFrame) -> pd.DataFrame: + if pytz is not None: + # Convert any pytz timezones to zoneinfo / fixed offset timezones + if any( + isinstance(dtype, pd.DatetimeTZDtype) + for dtype in df._mgr.get_unique_dtypes() + ): + col_indices = df._select_dtypes_indices(pd.DatetimeTZDtype) + for i in col_indices: + col = df.iloc[:, i] + normalized_tz = _normalize_pytz_timezone(col.dtype.tz) + if normalized_tz is not col.dtype.tz: + df.isetitem(i, col.dt.tz_convert(normalized_tz)) + + df.index = _normalize_timezone_index(df.index) + df.columns = _normalize_timezone_index(df.columns) + return df From 524ff5853542f6b4a98dfbab8632edbab4fddc6d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Apr 2026 
17:53:30 +0200 Subject: [PATCH 2/8] fixup + update test for tzaware index now no longer returning pytz --- pandas/core/internals/managers.py | 2 +- pandas/tests/io/test_parquet.py | 26 +------------------------- 2 files changed, 2 insertions(+), 26 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 771134f2081b4..94b4f581c44db 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -337,7 +337,7 @@ def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool: return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks) def get_unique_dtypes(self) -> npt.NDArray[np.object_]: - return algos.unique([blk.dtype for blk in self.blocks]) + return algos.unique(np.array([blk.dtype for blk in self.blocks], dtype=object)) def get_dtypes(self) -> npt.NDArray[np.object_]: dtypes = np.array([blk.dtype for blk in self.blocks], dtype=object) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index c5922b6b5a9a4..aff16c58f8c28 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1068,31 +1068,7 @@ def test_timestamp_nanoseconds(self, pa, temp_file): def test_timezone_aware_index(self, pa, timezone_aware_date_list, temp_file): idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) - - # see gh-36004 - # compare time(zone) values only, skip their class: - # pyarrow always creates fixed offset timezones using pytz.FixedOffset() - # even if it was datetime.timezone() originally - # - # technically they are the same: - # they both implement datetime.tzinfo - # they both wrap datetime.timedelta() - # this use-case sets the resolution to 1 minute - - expected = df[:] - if timezone_aware_date_list.tzinfo != datetime.UTC: - # pyarrow returns pytz.FixedOffset while pandas constructs datetime.timezone - # https://github.com/pandas-dev/pandas/issues/37286 - try: - import pytz - 
except ImportError: - pass - else: - offset = df.index.tz.utcoffset(timezone_aware_date_list) - tz = pytz.FixedOffset(offset.total_seconds() / 60) - expected.index = expected.index.tz_convert(tz) - expected["index_as_col"] = expected["index_as_col"].dt.tz_convert(tz) - check_round_trip(df, temp_file, pa, check_dtype=False, expected=expected) + check_round_trip(df, temp_file, pa, check_dtype=False) def test_filter_row_groups(self, pa, temp_file): # https://github.com/pandas-dev/pandas/issues/26551 From 6cab7d79dca9ea986edd5a82cf13d540a81b3375 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Apr 2026 21:51:22 +0200 Subject: [PATCH 3/8] fix normalize logic for static timezone --- pandas/io/_util.py | 16 +++++++--------- pandas/tests/tslibs/test_timezones.py | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 72a50b1e25ce7..95e84021ab9f7 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -211,21 +211,19 @@ def _normalize_pytz_timezone(tz: dt.tzinfo) -> dt.tzinfo: if timezones.is_utc(tz): return timezones.maybe_get_tz("UTC") - if timezones.is_fixed_offset(tz): - # Convert pytz fixed offset to datetime.timezone + if tz.zone is not None: try: - offset = tz.utcoffset(None) - if offset is not None: - return dt.timezone(offset) + return timezones.maybe_get_tz(tz.zone) except Exception: + # some pytz timezones might not be available for zoneinfo pass - zone = timezones.get_timezone(tz) - if isinstance(zone, str): + if timezones.is_fixed_offset(tz): + # Convert pytz fixed offset to datetime.timezone try: - return timezones.maybe_get_tz(zone) + offset = tz.utcoffset(None) + return dt.timezone(offset) except Exception: - # some pytz timezones might not be available for zoneinfo pass return tz diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py index c48986c597356..33b05bc34eccd 100644 --- a/pandas/tests/tslibs/test_timezones.py +++ 
b/pandas/tests/tslibs/test_timezones.py @@ -6,6 +6,7 @@ import subprocess import sys import textwrap +import zoneinfo import dateutil.tz import pytest @@ -191,3 +192,18 @@ def test_maybe_get_tz_offset_only(): tz = timezones.maybe_get_tz("UTC-02:45") assert tz == timezone(-timedelta(hours=2, minutes=45)) + + +def test_normalize_pytz_timezone(): + pytz = pytest.importorskip("pytz") + + from pandas.io._util import _normalize_pytz_timezone + + for tz, expected in [ + (pytz.UTC, timezone.utc), + (pytz.FixedOffset(90), timezone(timedelta(minutes=90))), + (pytz.timezone("America/New_York"), zoneinfo.ZoneInfo("America/New_York")), + (pytz.timezone("Etc/GMT+1"), zoneinfo.ZoneInfo("Etc/GMT+1")), + ]: + result = _normalize_pytz_timezone(tz) + assert result == expected From 6b1fcaf1017da433a00b38b0aeffcb4c1ff58a0c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Apr 2026 21:52:49 +0200 Subject: [PATCH 4/8] update parser test for pyarrow engine --- pandas/tests/io/parser/test_parse_dates.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 7aed7acb8e50d..41890396192a3 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -217,14 +217,8 @@ def test_parse_tz_aware(all_parsers): expected = DataFrame( {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) - if parser.engine == "pyarrow": - pytz = pytest.importorskip("pytz") - expected_tz = pytz.utc - expected.index = expected.index.as_unit("s") - else: - expected_tz = timezone.utc tm.assert_frame_equal(result, expected) - assert result.index.tz is expected_tz + assert result.index.tz is timezone.utc @pytest.mark.parametrize("kwargs", [{}, {"index_col": "C"}]) From 55d4e4b17d7b7c9fecebf3a9b78f3c4d07d812bc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Apr 2026 22:13:23 +0200 Subject: [PATCH 5/8] add 
docstring + link to pyarrow PR --- pandas/core/frame.py | 2 +- pandas/io/_util.py | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9dd24df0e20fe..d30e475841d90 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5450,7 +5450,7 @@ def predicate(arr: ArrayLike) -> bool: return True - blk_dtypes = [blk.dtype for blk in self._mgr.blocks] + blk_dtypes = self._mgr.get_unique_dtypes() if ( np.object_ in include and str not in include diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 95e84021ab9f7..c88ebbf645d21 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -231,8 +231,11 @@ def _normalize_pytz_timezone(tz: dt.tzinfo) -> dt.tzinfo: def _normalize_timezone_index(index: pd.Index) -> pd.Index: if isinstance(index, pd.MultiIndex): - levels = [_normalize_timezone_index(level) for level in index.levels] - return index.set_levels(levels) + if any(isinstance(level.dtype, pd.DatetimeTZDtype) for level in index.levels): + levels = [_normalize_timezone_index(level) for level in index.levels] + return index.set_levels(levels) + + return index if isinstance(index.dtype, pd.DatetimeTZDtype): normalized_tz = _normalize_pytz_timezone(index.dtype.tz) @@ -243,6 +246,13 @@ def _normalize_timezone_index(index: pd.Index) -> pd.Index: def _normalize_timezone_dtypes(df: pd.DataFrame) -> pd.DataFrame: + """ + PyArrow uses pytz by default for timezones, but pandas uses + zoneinfo / datetime.timezone since pandas 3.0. + + TODO: Starting with pyarrow 25, it will use zoneinfo by default, and then + this normalization can be skipped (https://github.com/apache/arrow/pull/49694). 
+ """ if pytz is not None: # Convert any pytz timezones to zoneinfo / fixed offset timezones if any( @@ -258,4 +268,5 @@ def _normalize_timezone_dtypes(df: pd.DataFrame) -> pd.DataFrame: df.index = _normalize_timezone_index(df.index) df.columns = _normalize_timezone_index(df.columns) + return df From 6a11335c95d26b9b68df474680a43d15e5e1c62c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 10 Apr 2026 10:43:20 +0200 Subject: [PATCH 6/8] fix expected unit in parser test --- pandas/tests/io/parser/test_parse_dates.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 41890396192a3..41effd4c2896e 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -217,6 +217,8 @@ def test_parse_tz_aware(all_parsers): expected = DataFrame( {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) + if parser.engine == "pyarrow": + expected.index = expected.index.as_unit("s") tm.assert_frame_equal(result, expected) assert result.index.tz is timezone.utc From bca10e8f23f554676ac66bd2e6d447fdd6ab1471 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 10 Apr 2026 10:45:15 +0200 Subject: [PATCH 7/8] fix/suppress typing failures --- pandas/io/_util.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index c88ebbf645d21..72a8b2e8f0ef6 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -5,6 +5,7 @@ TYPE_CHECKING, Literal, ) +import zoneinfo import numpy as np @@ -209,11 +210,11 @@ def _normalize_pytz_timezone(tz: dt.tzinfo) -> dt.tzinfo: return tz if timezones.is_utc(tz): - return timezones.maybe_get_tz("UTC") + return dt.timezone.utc - if tz.zone is not None: + if tz.zone is not None: # type: ignore[attr-defined] try: - return timezones.maybe_get_tz(tz.zone) + return zoneinfo.ZoneInfo(tz.zone) # type: 
ignore[attr-defined] except Exception: # some pytz timezones might not be available for zoneinfo pass @@ -222,7 +223,8 @@ def _normalize_pytz_timezone(tz: dt.tzinfo) -> dt.tzinfo: # Convert pytz fixed offset to datetime.timezone try: offset = tz.utcoffset(None) - return dt.timezone(offset) + if offset is not None: + return dt.timezone(offset) except Exception: pass @@ -240,7 +242,7 @@ def _normalize_timezone_index(index: pd.Index) -> pd.Index: if isinstance(index.dtype, pd.DatetimeTZDtype): normalized_tz = _normalize_pytz_timezone(index.dtype.tz) if normalized_tz is not index.dtype.tz: - return index.tz_convert(normalized_tz) + return index.tz_convert(normalized_tz) # type: ignore[attr-defined] return index From 0e5e05be79f262eef3ef6e2996c99f7c45eccf14 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 10 Apr 2026 14:21:47 +0200 Subject: [PATCH 8/8] add whatsnew --- doc/source/whatsnew/v3.0.3.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/doc/source/whatsnew/v3.0.3.rst b/doc/source/whatsnew/v3.0.3.rst index f8987257b4858..9b1b50a4ff114 100644 --- a/doc/source/whatsnew/v3.0.3.rst +++ b/doc/source/whatsnew/v3.0.3.rst @@ -8,6 +8,20 @@ including other versions of pandas. {{ header }} +.. --------------------------------------------------------------------------- +.. _whatsnew_303.enhancements: + +Enhancements +~~~~~~~~~~~~ +- Starting with pandas 3.0.0, time zones are represented by default using the + standard library's :mod:`zoneinfo` module (or ``datetime.timezone`` for fixed + offsets) instead of using ``pytz`` (:ref:`release note `). + + The IO methods using ``pyarrow`` under the hood such as :func:`read_parquet`, + :func:`read_feather` and :func:`read_orc` (or :func:`read_csv` when specifying + the engine) were still returning time zones using ``pytz``. Those have now been + updated to consistently use default ``zoneinfo`` time zones as well (:issue:`65134`). + .. 
--------------------------------------------------------------------------- .. _whatsnew_303.regressions: