Skip to content

Commit b22fbc8

Browse files
Parquet IO: also use zoneinfo timezones by default even when pyarrow uses pytz
1 parent 43c5e9a commit b22fbc8

File tree

4 files changed

+94
-2
lines changed

4 files changed

+94
-2
lines changed

pandas/core/frame.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5477,6 +5477,18 @@ def predicate(arr: ArrayLike) -> bool:
54775477
mgr = self._mgr._get_data_subset(predicate).copy(deep=False)
54785478
return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self)
54795479

5480+
def _select_dtypes_indices(self, dtype_class) -> np.ndarray:
5481+
"""
5482+
Return the indices of the columns of a given dtype.
5483+
5484+
Currently only works given a class, so mostly useful for ExtensionDtypes.
5485+
"""
5486+
5487+
def predicate(arr: ArrayLike) -> bool:
5488+
return isinstance(arr.dtype, dtype_class)
5489+
5490+
return self._mgr._get_data_subset_indices(predicate)
5491+
54805492
def insert(
54815493
self,
54825494
loc: int,

pandas/core/generic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7127,7 +7127,7 @@ def fillna(
71277127
if axis == 1:
71287128
# Check that all columns in result have the same dtype
71297129
# otherwise don't bother with fillna and losing accurate dtypes
7130-
unique_dtypes = algos.unique(self._mgr.get_dtypes())
7130+
unique_dtypes = self._mgr.get_unique_dtypes()
71317131
if len(unique_dtypes) > 1:
71327132
raise ValueError(
71337133
"All columns must have the same dtype, but got dtypes: "

pandas/core/internals/managers.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,9 @@ def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool:
336336
blk = self.blocks[blkno]
337337
return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks)
338338

339+
def get_unique_dtypes(self) -> npt.NDArray[np.object_]:
    """Return the distinct dtypes across blocks (one entry per unique dtype)."""
    block_dtypes = [blk.dtype for blk in self.blocks]
    return algos.unique(block_dtypes)
341+
339342
def get_dtypes(self) -> npt.NDArray[np.object_]:
    """Return one dtype per column, in column order."""
    # One entry per block; blknos maps each column position to its block.
    per_block = np.array([blk.dtype for blk in self.blocks], dtype=object)
    return per_block.take(self.blknos)
@@ -656,6 +659,11 @@ def _get_data_subset(self, predicate: Callable) -> Self:
656659
blocks = [blk for blk in self.blocks if predicate(blk.values)]
657660
return self._combine(blocks)
658661

662+
def _get_data_subset_indices(self, predicate: Callable) -> np.ndarray:
663+
blocks = [blk for blk in self.blocks if predicate(blk.values)]
664+
indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks]))
665+
return indexer
666+
659667
def get_bool_data(self) -> Self:
660668
"""
661669
Select blocks that are bool-dtype and columns from object-dtype blocks

pandas/io/_util.py

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import datetime as dt
34
from typing import (
45
TYPE_CHECKING,
56
Literal,
@@ -10,6 +11,7 @@
1011
from pandas._config import using_string_dtype
1112

1213
from pandas._libs import lib
14+
from pandas._libs.tslibs import timezones
1315
from pandas.compat import (
1416
pa_version_under18p0,
1517
pa_version_under19p0,
@@ -35,6 +37,9 @@
3537
)
3638

3739

40+
pytz = import_optional_dependency("pytz", errors="ignore")
41+
42+
3843
def _arrow_dtype_mapping() -> dict:
3944
pa = import_optional_dependency("pyarrow")
4045
return {
@@ -120,7 +125,9 @@ def arrow_table_to_pandas(
120125
raise NotImplementedError
121126

122127
df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs)
123-
return _post_convert_dtypes(df, dtype_backend, dtype, names)
128+
df = _post_convert_dtypes(df, dtype_backend, dtype, names)
129+
df = _normalize_timezone_dtypes(df)
130+
return df
124131

125132

126133
def _post_convert_dtypes(
@@ -189,3 +196,68 @@ def _post_convert_dtypes(
189196
df[col] = df[col].astype(cat_dtype)
190197

191198
return df
199+
200+
201+
def _normalize_pytz_timezone(tz: dt.tzinfo) -> dt.tzinfo:
202+
"""
203+
If the input tz is a pytz timezone, attempt to convert it to "default"
204+
tzinfo object (zoneinfo or datetime.timezone).
205+
"""
206+
if not type(tz).__module__.startswith("pytz"):
207+
# isinstance(col.dtype.tz, pytz.BaseTzInfo) does not included
208+
# fixed offsets
209+
return tz
210+
211+
if timezones.is_utc(tz):
212+
return timezones.maybe_get_tz("UTC")
213+
214+
if timezones.is_fixed_offset(tz):
215+
# Convert pytz fixed offset to datetime.timezone
216+
try:
217+
offset = tz.utcoffset(None)
218+
if offset is not None:
219+
return dt.timezone(offset)
220+
except Exception:
221+
pass
222+
223+
zone = timezones.get_timezone(tz)
224+
if isinstance(zone, str):
225+
try:
226+
return timezones.maybe_get_tz(zone)
227+
except Exception:
228+
# some pytz timezones might not be available for zoneinfo
229+
pass
230+
231+
return tz
232+
233+
234+
def _normalize_timezone_index(index: pd.Index) -> pd.Index:
235+
if isinstance(index, pd.MultiIndex):
236+
levels = [_normalize_timezone_index(level) for level in index.levels]
237+
return index.set_levels(levels)
238+
239+
if isinstance(index.dtype, pd.DatetimeTZDtype):
240+
normalized_tz = _normalize_pytz_timezone(index.dtype.tz)
241+
if normalized_tz is not index.dtype.tz:
242+
return index.tz_convert(normalized_tz)
243+
244+
return index
245+
246+
247+
def _normalize_timezone_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace pytz timezones in the frame's tz-aware columns, index and
    columns with their default (zoneinfo / datetime.timezone) equivalents.

    Modifies ``df`` in place via ``isetitem`` / attribute assignment and
    also returns it. No-op when pytz is not installed (module-level
    ``pytz`` is None).
    """
    if pytz is not None:
        # Convert any pytz timezones to zoneinfo / fixed offset timezones
        if any(
            isinstance(dtype, pd.DatetimeTZDtype)
            for dtype in df._mgr.get_unique_dtypes()
        ):
            # Only walk the tz-aware columns, found by position.
            col_indices = df._select_dtypes_indices(pd.DatetimeTZDtype)
            for i in col_indices:
                col = df.iloc[:, i]
                normalized_tz = _normalize_pytz_timezone(col.dtype.tz)
                # Identity check: the helper returns the same object when
                # no conversion is needed, so skip the no-op tz_convert.
                if normalized_tz is not col.dtype.tz:
                    df.isetitem(i, col.dt.tz_convert(normalized_tz))

        # NOTE(review): indentation reconstructed from a whitespace-stripped
        # diff — index/columns normalization assumed to sit inside the
        # ``pytz is not None`` guard; confirm against the upstream commit.
        df.index = _normalize_timezone_index(df.index)
        df.columns = _normalize_timezone_index(df.columns)
    return df

0 commit comments

Comments
 (0)