|
1 | 1 | from __future__ import annotations |
2 | 2 |
|
| 3 | +import datetime as dt |
3 | 4 | from typing import ( |
4 | 5 | TYPE_CHECKING, |
5 | 6 | Literal, |
|
10 | 11 | from pandas._config import using_string_dtype |
11 | 12 |
|
12 | 13 | from pandas._libs import lib |
| 14 | +from pandas._libs.tslibs import timezones |
13 | 15 | from pandas.compat import ( |
14 | 16 | pa_version_under18p0, |
15 | 17 | pa_version_under19p0, |
|
35 | 37 | ) |
36 | 38 |
|
37 | 39 |
|
| 40 | +pytz = import_optional_dependency("pytz", errors="ignore") |
| 41 | + |
| 42 | + |
38 | 43 | def _arrow_dtype_mapping() -> dict: |
39 | 44 | pa = import_optional_dependency("pyarrow") |
40 | 45 | return { |
@@ -120,7 +125,9 @@ def arrow_table_to_pandas( |
120 | 125 | raise NotImplementedError |
121 | 126 |
|
122 | 127 | df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs) |
123 | | - return _post_convert_dtypes(df, dtype_backend, dtype, names) |
| 128 | + df = _post_convert_dtypes(df, dtype_backend, dtype, names) |
| 129 | + df = _normalize_timezone_dtypes(df) |
| 130 | + return df |
124 | 131 |
|
125 | 132 |
|
126 | 133 | def _post_convert_dtypes( |
@@ -189,3 +196,68 @@ def _post_convert_dtypes( |
189 | 196 | df[col] = df[col].astype(cat_dtype) |
190 | 197 |
|
191 | 198 | return df |
| 199 | + |
| 200 | + |
| 201 | +def _normalize_pytz_timezone(tz: dt.tzinfo) -> dt.tzinfo: |
| 202 | + """ |
| 203 | + If the input tz is a pytz timezone, attempt to convert it to "default" |
| 204 | + tzinfo object (zoneinfo or datetime.timezone). |
| 205 | + """ |
| 206 | + if not type(tz).__module__.startswith("pytz"): |
| 207 | + # isinstance(col.dtype.tz, pytz.BaseTzInfo) does not included |
| 208 | + # fixed offsets |
| 209 | + return tz |
| 210 | + |
| 211 | + if timezones.is_utc(tz): |
| 212 | + return timezones.maybe_get_tz("UTC") |
| 213 | + |
| 214 | + if timezones.is_fixed_offset(tz): |
| 215 | + # Convert pytz fixed offset to datetime.timezone |
| 216 | + try: |
| 217 | + offset = tz.utcoffset(None) |
| 218 | + if offset is not None: |
| 219 | + return dt.timezone(offset) |
| 220 | + except Exception: |
| 221 | + pass |
| 222 | + |
| 223 | + zone = timezones.get_timezone(tz) |
| 224 | + if isinstance(zone, str): |
| 225 | + try: |
| 226 | + return timezones.maybe_get_tz(zone) |
| 227 | + except Exception: |
| 228 | + # some pytz timezones might not be available for zoneinfo |
| 229 | + pass |
| 230 | + |
| 231 | + return tz |
| 232 | + |
| 233 | + |
| 234 | +def _normalize_timezone_index(index: pd.Index) -> pd.Index: |
| 235 | + if isinstance(index, pd.MultiIndex): |
| 236 | + levels = [_normalize_timezone_index(level) for level in index.levels] |
| 237 | + return index.set_levels(levels) |
| 238 | + |
| 239 | + if isinstance(index.dtype, pd.DatetimeTZDtype): |
| 240 | + normalized_tz = _normalize_pytz_timezone(index.dtype.tz) |
| 241 | + if normalized_tz is not index.dtype.tz: |
| 242 | + return index.tz_convert(normalized_tz) |
| 243 | + |
| 244 | + return index |
| 245 | + |
| 246 | + |
def _normalize_timezone_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize pytz timezones found in a DataFrame's columns and axes.

    Timezone-aware columns whose timezone is replaced by
    ``_normalize_pytz_timezone`` are converted in place, and both
    ``df.index`` and ``df.columns`` are run through
    ``_normalize_timezone_index``. Nothing is done when the module-level
    ``pytz`` is ``None``.

    Parameters
    ----------
    df : pd.DataFrame
        The frame to normalize; it is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object that was passed in.
    """
    if pytz is None:
        return df

    # Convert any pytz timezones to zoneinfo / fixed offset timezones.
    # The unique-dtype scan decides whether the per-column pass runs at all.
    has_tz_aware_column = any(
        isinstance(dtype, pd.DatetimeTZDtype)
        for dtype in df._mgr.get_unique_dtypes()
    )
    if has_tz_aware_column:
        for position in df._select_dtypes_indices(pd.DatetimeTZDtype):
            column = df.iloc[:, position]
            current_tz = column.dtype.tz
            replacement_tz = _normalize_pytz_timezone(current_tz)
            if replacement_tz is not current_tz:
                converted = column.dt.tz_convert(replacement_tz)
                df.isetitem(position, converted)

    df.index = _normalize_timezone_index(df.index)
    df.columns = _normalize_timezone_index(df.columns)
    return df
0 commit comments