Fix boolean array for arrow-backed DF. (#10527)
This commit is contained in:
parent
d33043a348
commit
e537b0969f
@ -458,7 +458,7 @@ def pandas_pa_type(ser: Any) -> np.ndarray:
|
||||
# combine_chunks takes the most significant amount of time
|
||||
chunk: pa.Array = aa.combine_chunks()
|
||||
# When there are null values, we have to use a copy
|
||||
zero_copy = chunk.null_count == 0
|
||||
zero_copy = chunk.null_count == 0 and not pa.types.is_boolean(chunk.type)
|
||||
# Alternatively, we can use chunk.buffers(), which returns a list of buffers and
|
||||
# we need to concatenate them ourselves.
|
||||
# FIXME(jiamingy): Is there a better way to access the arrow buffer along with
|
||||
@ -825,37 +825,9 @@ def _arrow_transform(data: DataType) -> Any:
|
||||
|
||||
data = cast(pa.Table, data)
|
||||
|
||||
def type_mapper(dtype: pa.DataType) -> Optional[str]:
|
||||
"""Maps pyarrow type to pandas arrow extension type."""
|
||||
if pa.types.is_int8(dtype):
|
||||
return pd.ArrowDtype(pa.int8())
|
||||
if pa.types.is_int16(dtype):
|
||||
return pd.ArrowDtype(pa.int16())
|
||||
if pa.types.is_int32(dtype):
|
||||
return pd.ArrowDtype(pa.int32())
|
||||
if pa.types.is_int64(dtype):
|
||||
return pd.ArrowDtype(pa.int64())
|
||||
if pa.types.is_uint8(dtype):
|
||||
return pd.ArrowDtype(pa.uint8())
|
||||
if pa.types.is_uint16(dtype):
|
||||
return pd.ArrowDtype(pa.uint16())
|
||||
if pa.types.is_uint32(dtype):
|
||||
return pd.ArrowDtype(pa.uint32())
|
||||
if pa.types.is_uint64(dtype):
|
||||
return pd.ArrowDtype(pa.uint64())
|
||||
if pa.types.is_float16(dtype):
|
||||
return pd.ArrowDtype(pa.float16())
|
||||
if pa.types.is_float32(dtype):
|
||||
return pd.ArrowDtype(pa.float32())
|
||||
if pa.types.is_float64(dtype):
|
||||
return pd.ArrowDtype(pa.float64())
|
||||
if pa.types.is_boolean(dtype):
|
||||
return pd.ArrowDtype(pa.bool_())
|
||||
return None
|
||||
|
||||
# For common cases, this is zero-copy, can check with:
|
||||
# pa.total_allocated_bytes()
|
||||
df = data.to_pandas(types_mapper=type_mapper)
|
||||
df = data.to_pandas(types_mapper=pd.ArrowDtype)
|
||||
return df
|
||||
|
||||
|
||||
|
||||
@ -164,10 +164,6 @@ def pd_arrow_dtypes() -> Generator:
|
||||
|
||||
# Integer
|
||||
dtypes = pandas_pyarrow_mapper
|
||||
Null: Union[float, None, Any] = np.nan
|
||||
orig = pd.DataFrame(
|
||||
{"f0": [1, 2, Null, 3], "f1": [4, 3, Null, 1]}, dtype=np.float32
|
||||
)
|
||||
# Create a dictionary-backed dataframe; enable this once the roundtrip is
|
||||
# implemented in pandas/pyarrow
|
||||
#
|
||||
@ -190,24 +186,33 @@ def pd_arrow_dtypes() -> Generator:
|
||||
# pd_catcodes = pd_cat_df["f1"].cat.codes
|
||||
# assert pd_catcodes.equals(pa_catcodes)
|
||||
|
||||
for Null in (None, pd.NA):
|
||||
for Null in (None, pd.NA, 0):
|
||||
for dtype in dtypes:
|
||||
if dtype.startswith("float16") or dtype.startswith("bool"):
|
||||
continue
|
||||
# Use np.nan as a baseline
|
||||
orig_null = Null if not pd.isna(Null) and Null == 0 else np.nan
|
||||
orig = pd.DataFrame(
|
||||
{"f0": [1, 2, orig_null, 3], "f1": [4, 3, orig_null, 1]},
|
||||
dtype=np.float32,
|
||||
)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{"f0": [1, 2, Null, 3], "f1": [4, 3, Null, 1]}, dtype=dtype
|
||||
)
|
||||
yield orig, df
|
||||
|
||||
orig = pd.DataFrame(
|
||||
{"f0": [True, False, pd.NA, True], "f1": [False, True, pd.NA, True]},
|
||||
dtype=pd.BooleanDtype(),
|
||||
)
|
||||
df = pd.DataFrame(
|
||||
{"f0": [True, False, pd.NA, True], "f1": [False, True, pd.NA, True]},
|
||||
dtype=pd.ArrowDtype(pa.bool_()),
|
||||
)
|
||||
yield orig, df
|
||||
# If Null is `False`, then there's no missing value.
|
||||
for Null in (pd.NA, False):
|
||||
orig = pd.DataFrame(
|
||||
{"f0": [True, False, Null, True], "f1": [False, True, Null, True]},
|
||||
dtype=pd.BooleanDtype(),
|
||||
)
|
||||
df = pd.DataFrame(
|
||||
{"f0": [True, False, Null, True], "f1": [False, True, Null, True]},
|
||||
dtype=pd.ArrowDtype(pa.bool_()),
|
||||
)
|
||||
yield orig, df
|
||||
|
||||
|
||||
def check_inf(rng: RNG) -> None:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user