Skip to content
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Other enhancements
^^^^^^^^^^^^^^^^^^
- :class:`Period` now supports f-string formatting via ``__format__``, e.g. ``f"{period:%Y-%m}"`` (:issue:`48536`)
- :meth:`.DataFrameGroupBy.agg` now allows for the provided ``func`` to return a NumPy array (:issue:`63957`)
- :meth:`Series.to_json` now also uses the ``_values_for_json`` method of an ExtensionArray (:issue:`65047`)
- :meth:`ExtensionArray.map` now calls :meth:`ExtensionArray._cast_pointwise_result` to retain the dtype backend, e.g. Arrow-backed arrays now preserve their Arrow dtype through ``map`` (:issue:`57189`, :issue:`62164`)
- Added :meth:`ExtensionArray.count` (:issue:`64450`)
- Added :meth:`Index.replace` method to support value replacement functionality similar to :meth:`Series.replace` (:issue:`19495`)
Expand Down
6 changes: 6 additions & 0 deletions pandas/_libs/src/vendored/ujson/python/objToJSON.c
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,12 @@ static PyObject *get_values(PyObject *obj) {
if (values == NULL) {
// Clear so we can subsequently try another method
PyErr_Clear();
} else if (PyObject_HasAttrString(values, "_values_for_json")) {
// We have gotten an ExtensionArray
PyObject *array_values =
PyObject_CallMethod(values, "_values_for_json", NULL);
Py_DECREF(values);
values = array_values;
} else if (PyObject_HasAttrString(values, "__array__")) {
// We may have gotten a Categorical or Sparse array so call np.array
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose categorical or sparse will now be covered by the _values_for_json above?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jorisvandenbossche As _values_for_json is not overwritten by them they use the base implementation so fall back to np.asarray(self) which will call self.__array__() so the behaviour should be the same for them as before:

def _values_for_json(self) -> np.ndarray:
"""
Specify how to render our entries in to_json.
Notes
-----
The dtype on the returned ndarray is not restricted, but for non-native
types that are not specifically handled in objToJSON.c, to_json is
liable to raise. In these cases, it may be safer to return an ndarray
of strings.
"""
return np.asarray(self)

I am not sure if these are the only two arrays that lead to this code path in objToJSON.c, if so it could be removed as it is replaced by the path above?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if so it could be removed as it is replaced by the path above?

Yeah, potentially, but at least the comment is no longer correct.

I think there might be some more things that can be cleaned up, but fine to leave that for a follow-up PR as well

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then just remove the comment for now? For testing I just removed this whole block:

    } else if (PyObject_HasAttrString(values, "__array__")) {
      // We may have gotten a Categorical or Sparse array so call np.array
      PyObject *array_values = PyObject_CallMethod(values, "__array__", NULL);
      Py_DECREF(values);
      values = array_values;

And then let the whole test suite run locally (on current dev environment) with some errors failing, but on a first glance nothing that had to do with changes here (mostly SQL, timezone, clipboard). When running pytest -k json the tests all passed. But I am not sure if that guarantees that nothing got broken, until now JSON serialization of EAs was not tested, so not sure about the coverage of other arrays.

PyObject *array_values = PyObject_CallMethod(values, "__array__", NULL);
Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import inspect
from io import StringIO
import operator

import numpy as np
import pytest

from pandas._libs._ujson import ujson_dumps
from pandas._typing import Dtype

from pandas.core.dtypes.common import is_bool_dtype
Expand Down Expand Up @@ -783,3 +785,23 @@ def test_equals(self, data, na_value, as_series, box):
def test_equals_same_data_different_object(self, data):
# https://github.com/pandas-dev/pandas/issues/34660
assert pd.Series(data).equals(pd.Series(data))

def test_values_for_json(self, data):
# GH 65047
values: np.ndarray = data._values_for_json()

# Check that the result is a numpy array
assert type(values) == np.ndarray
Comment thread
Julian-Harbeck marked this conversation as resolved.
Outdated

# Check that the result is JSON-serializable
assert isinstance(ujson_dumps(values), str)

# Check that Series.to_json uses _values_for_json
ser = pd.Series(data)
result: str = ser.to_json()
expected: str = pd.Series(values).to_json()
assert result == expected

# Test roundtrip through JSON
ser_new = pd.read_json(StringIO(result), typ="series", dtype=data.dtype)
tm.assert_series_equal(ser_new, ser)
Comment on lines +807 to +811
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe can split this roundtrip part as a separate test, since I think this is the main reason for the xfails? Then the base test ensuring _values_for_json returns the correct thing would still run for those cases

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes had the same thought earlier (see comment above), I split the test into test_values_for_json for the serialization and test_json_roundtrip for the JSON roundtrip with deserialization. I still added the later one to BaseMethodsTests even though the deserialization is not really a method of an ExtensionArray but due to the close connection to the serialization method I think this makes sense?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is good to keep the tests together indeed

7 changes: 7 additions & 0 deletions pandas/tests/extension/decimal/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,13 @@ def test_series_repr(self, data):
def test_unary_ufunc_dunder_equivalence(self, data, ufunc):
super().test_unary_ufunc_dunder_equivalence(data, ufunc)

def test_values_for_json(self, data):
# GH 65127
# DecimalArray does not support roundtrip as Decimal cannot be created from
# dictionary created in JSON serialization
with pytest.raises(AssertionError, match="Attributes of Series are different"):
Comment thread
Julian-Harbeck marked this conversation as resolved.
Outdated
return super().test_values_for_json(data)


def test_take_na_value_other_decimal():
arr = DecimalArray([decimal.Decimal("1.0"), decimal.Decimal("2.0")])
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/extension/json/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,16 @@ def test_setitem_2d_values(self, data):
def test_EA_types(self, engine, data, request):
super().test_EA_types(engine, data, request)

def test_values_for_json(self, data):
# GH 65127
# JSONArray does not support roundtrip as during JSON serialization each element
# of the array is packed into another dictionary ``{"data": element}`` with
# element being a dictionary itself, and during deserialization these
# dictionaries are not unpacked again, so the JSONArray cannot be reconstructed
# with the simple deserialization in the test.
with pytest.raises(AssertionError, match="Series are different"):
return super().test_values_for_json(data)


def custom_assert_series_equal(left, right, *args, **kwargs):
# NumPy doesn't handle an array of equal-length UserDicts.
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -1061,6 +1061,36 @@ def test_loc_setitem_with_expansion_preserves_ea_index_dtype(self, data, request
request.applymarker(mark)
super().test_loc_setitem_with_expansion_preserves_ea_index_dtype(data)

@pytest.mark.filterwarnings(
"ignore:The default 'epoch' date format is deprecated:DeprecationWarning"
)
def test_values_for_json(self, data):
# GH 65127
# All datetime and duration ArrowDtypes with non default resolution of ms fail
# on roundtrip. The date32 and date64 dtypes fail already in serialization due
# to as_unit not implemented for them. Currently the json serialization relies
# on the default 'epoch' format for datetimes, leading to the filtered
# Pandas4Warning.
if ((data.dtype.kind in "Mm") and ("ms" not in str(data.dtype))) or (
data.dtype in [ArrowDtype(pa.date32()), ArrowDtype(pa.date64())]
):
msg = "|".join(
[
# date32/date64
"as_unit not implemented for date",
# timestamp with s unit and US/Pacific or US/Eastern tz
"year 51970 is out of range",
# all others
"Series are different",
]
)
with pytest.raises(
(NotImplementedError, ValueError, AssertionError), match=msg
Comment thread
Julian-Harbeck marked this conversation as resolved.
Outdated
):
super().test_values_for_json(data)
else:
super().test_values_for_json(data)


class TestLogicalOps:
"""Various Series and DataFrame logical ops methods."""
Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/extension/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,17 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
else:
return super().check_reduce(ser, op_name, skipna)

@pytest.mark.filterwarnings(
"ignore:The default 'epoch' date format is deprecated:DeprecationWarning"
)
def test_values_for_json(self, data):
# GH 65127
# DatetimeArray fails on roundtrip. Also currently the json serialization relies
# on the default 'epoch' format for datetimes, leading to the filtered
# Pandas4Warning.
with pytest.raises(AssertionError, match="Series are different"):
super().test_values_for_json(data)


class Test2DCompat(base.NDArrayBacked2DTests):
pass
7 changes: 7 additions & 0 deletions pandas/tests/extension/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,10 @@ def test_astype_str(self, data):
)
def test_loc_setitem_with_expansion_preserves_ea_index_dtype(self, data):
super().test_loc_setitem_with_expansion_preserves_ea_index_dtype(data)

def test_values_for_json(self, data):
# GH 65127
# IntervalArray does not support roundtrip as Interval cannot be created from
# dictionary created in JSON serialization
with pytest.raises(AssertionError, match="Attributes of Series are different"):
return super().test_values_for_json(data)
6 changes: 6 additions & 0 deletions pandas/tests/extension/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,12 @@ def test_map(self, data, na_action):
result = data.map(lambda x: x, na_action=na_action)
tm.assert_extension_array_equal(result, data)

def test_values_for_json(self, data):
# GH 65127
# PeriodArray currently cannot be serialized to JSON
with pytest.raises(OverflowError, match="Maximum recursion level reached"):
return super().test_values_for_json(data)


class Test2DCompat(base.NDArrayBacked2DTests):
pass
Loading