Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
rok committed Sep 11, 2024
1 parent 27acf8b commit 8f95269
Show file tree
Hide file tree
Showing 10 changed files with 170 additions and 6 deletions.
7 changes: 6 additions & 1 deletion cpp/src/arrow/extension/json.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,16 @@ std::shared_ptr<Array> JsonExtensionType::MakeArray(
return std::make_shared<ExtensionArray>(data);
}

std::shared_ptr<DataType> json(const std::shared_ptr<DataType> storage_type) {
/// \brief Create a JsonExtensionType wrapping the given storage type.
///
/// Only the string layouts (utf8, large_utf8, utf8_view) are accepted as
/// storage for JSON text.
///
/// \param[in] storage_type the underlying type that holds the JSON text
/// \return the new extension type, or Status::Invalid when storage_type is
/// null or is not one of the supported string types
Result<std::shared_ptr<DataType>> JsonExtensionType::Make(
    const std::shared_ptr<DataType>& storage_type) {
  if (storage_type == nullptr) {
    return Status::Invalid("JsonExtensionType requires a non-null storage type");
  }
  // The id must EQUAL one of the string ids. OR-ing `!=` comparisons is true
  // for every id and would never reject anything.
  const Type::type storage_id = storage_type->id();
  const bool is_string_storage = storage_id == Type::STRING ||
                                 storage_id == Type::STRING_VIEW ||
                                 storage_id == Type::LARGE_STRING;
  if (!is_string_storage) {
    // Report through the Result so callers (e.g. the Python bindings) can
    // surface an error instead of the process aborting.
    return Status::Invalid("Invalid storage type for JsonExtensionType: ",
                           storage_type->ToString());
  }
  return std::make_shared<JsonExtensionType>(storage_type);
}

/// Convenience factory for the JSON extension type.
///
/// Delegates to JsonExtensionType::Make and unwraps the result with
/// ValueOrDie, so a failed Make terminates instead of returning an error.
std::shared_ptr<DataType> json(const std::shared_ptr<DataType>& storage_type) {
  auto maybe_type = JsonExtensionType::Make(storage_type);
  return maybe_type.ValueOrDie();
}

} // namespace arrow::extension
10 changes: 9 additions & 1 deletion cpp/src/arrow/extension/json.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@

namespace arrow::extension {

/// \brief ExtensionArray subclass associated with the JSON extension type.
///
/// Declares no members of its own; it only inherits the constructors of
/// ExtensionArray.
class ARROW_EXPORT JsonArray : public ExtensionArray {
 public:
  using ExtensionArray::ExtensionArray;
};

/// \brief Concrete type class for variable-size JSON data, utf8-encoded.
class ARROW_EXPORT JsonExtensionType : public ExtensionType {
public:
Expand All @@ -45,12 +50,15 @@ class ARROW_EXPORT JsonExtensionType : public ExtensionType {

std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;

static Result<std::shared_ptr<DataType>> Make(
const std::shared_ptr<DataType>& storage_type);

private:
std::shared_ptr<DataType> storage_type_;
};

/// \brief Return a JsonExtensionType instance.
ARROW_EXPORT std::shared_ptr<DataType> json(
std::shared_ptr<DataType> storage_type = utf8());
const std::shared_ptr<DataType>& storage_type = utf8());

} // namespace arrow::extension
8 changes: 4 additions & 4 deletions python/pyarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def print_entry(label, value):
union, sparse_union, dense_union,
dictionary,
run_end_encoded,
bool8, fixed_shape_tensor, opaque, uuid,
bool8, fixed_shape_tensor, json, opaque, uuid,
field,
type_for_alias,
DataType, DictionaryType, StructType,
Expand All @@ -183,7 +183,7 @@ def print_entry(label, value):
FixedSizeBinaryType, Decimal128Type, Decimal256Type,
BaseExtensionType, ExtensionType,
RunEndEncodedType, Bool8Type, FixedShapeTensorType,
OpaqueType, UuidType,
JsonType, OpaqueType, UuidType,
PyExtensionType, UnknownExtensionType,
register_extension_type, unregister_extension_type,
DictionaryMemo,
Expand Down Expand Up @@ -218,7 +218,7 @@ def print_entry(label, value):
MonthDayNanoIntervalArray,
Decimal128Array, Decimal256Array, StructArray, ExtensionArray,
RunEndEncodedArray, Bool8Array, FixedShapeTensorArray,
OpaqueArray, UuidArray,
JsonArray, OpaqueArray, UuidArray,
scalar, NA, _NULL as NULL, Scalar,
NullScalar, BooleanScalar,
Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
Expand All @@ -236,7 +236,7 @@ def print_entry(label, value):
FixedSizeBinaryScalar, DictionaryScalar,
MapScalar, StructScalar, UnionScalar,
RunEndEncodedScalar, Bool8Scalar, ExtensionScalar,
FixedShapeTensorScalar, OpaqueScalar, UuidScalar)
FixedShapeTensorScalar, JsonScalar, OpaqueScalar, UuidScalar)

# Buffers, allocation
from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager,
Expand Down
24 changes: 24 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -4343,6 +4343,30 @@ cdef class ExtensionArray(Array):
return result


class JsonArray(ExtensionArray):
    """
    Concrete class for Arrow arrays of JSON data type.

    Examples
    --------
    Define the extension type for JSON array

    >>> import pyarrow as pa
    >>> json_type = pa.json(pa.large_utf8())

    Create an extension array

    >>> arr = [None, '{ "id":30, "values":["a", "b"] }']
    >>> storage = pa.array(arr, pa.large_utf8())
    >>> pa.ExtensionArray.from_storage(json_type, storage)
    <pyarrow.lib.JsonArray object at ...>
    [
      null,
      { "id":30, "values":["a", "b"] }
    ]
    """


class UuidArray(ExtensionArray):
"""
Concrete class for Arrow arrays of UUID data type.
Expand Down
10 changes: 10 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -2867,6 +2867,16 @@ cdef extern from "arrow/extension_type.h" namespace "arrow":
shared_ptr[CArray] storage()


# Cython declarations for the C++ classes in arrow/extension/json.h.
cdef extern from "arrow/extension/json.h" namespace "arrow::extension" nogil:
    # arrow::extension::JsonExtensionType, exposed to Cython as CJsonType.
    cdef cppclass CJsonType" arrow::extension::JsonExtensionType"(CExtensionType):

        # Static factory; returns a CResult holding the created data type.
        @staticmethod
        CResult[shared_ptr[CDataType]] Make(shared_ptr[CDataType]& storage_type)

    # arrow::extension::JsonArray, exposed to Cython as CJsonArray.
    cdef cppclass CJsonArray" arrow::extension::JsonArray"(CExtensionArray):
        pass


cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil:
cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType):

Expand Down
4 changes: 4 additions & 0 deletions python/pyarrow/lib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,10 @@ cdef class UuidType(BaseExtensionType):
cdef:
const CUuidType* uuid_ext_type

# Declaration of the Python-level wrapper class for the JSON extension type.
cdef class JsonType(BaseExtensionType):
    cdef:
        # Raw pointer to the underlying C++ CJsonType object.
        const CJsonType* json_ext_type

cdef class PyExtensionType(ExtensionType):
pass

Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/public-api.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,8 @@ cdef api object pyarrow_wrap_data_type(
out = OpaqueType.__new__(OpaqueType)
elif extension_name == b"arrow.uuid":
out = UuidType.__new__(UuidType)
elif extension_name == b"arrow.json":
out = JsonType.__new__(JsonType)
else:
out = BaseExtensionType.__new__(BaseExtensionType)
else:
Expand Down
9 changes: 9 additions & 0 deletions python/pyarrow/scalar.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1044,6 +1044,15 @@ cdef class ExtensionScalar(Scalar):
return pyarrow_wrap_scalar(<shared_ptr[CScalar]> sp_scalar)


class JsonScalar(ExtensionScalar):
    """
    Scalar class for values of the JSON extension type.
    """

    def as_py(self):
        """
        Return the storage value converted to a Python object, or None
        when the scalar holds no value.
        """
        if self.value is None:
            return None
        return self.value.as_py()


class UuidScalar(ExtensionScalar):
"""
Concrete class for Uuid extension scalar.
Expand Down
28 changes: 28 additions & 0 deletions python/pyarrow/tests/test_extension_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -1926,3 +1926,31 @@ def test_bool8_scalar():
assert pa.scalar(1, type=pa.bool8()).as_py() is True
assert pa.scalar(2, type=pa.bool8()).as_py() is True
assert pa.scalar(None, type=pa.bool8()).as_py() is None


@pytest.mark.parametrize("str_type", (pa.utf8, pa.large_utf8))
def test_json(str_type):
    """Check JSON extension type metadata, element access and IPC round trip."""
    storage_type = str_type()
    values = ['{"a": 1}', '{"b": 2}', None]
    storage = pa.array(values, type=storage_type)
    json_type = pa.json(storage_type)

    # Type-level properties
    assert json_type.extension_name == "arrow.json"
    assert json_type.storage_type == storage_type
    assert json_type.__class__ is pa.JsonType

    # Element access on an array wrapping the string storage
    array = pa.ExtensionArray.from_storage(json_type, storage)
    assert array.to_pylist() == values
    assert array[0].as_py() == values[0]
    assert array[2].as_py() is None

    # Write a single-column batch through IPC and read it back
    batch_in = pa.RecordBatch.from_arrays([array], ["json"])
    buf = ipc_write_batch(batch_in)
    batch_out = ipc_read_batch(buf)
    reconstructed_array = batch_out.column(0)
    assert reconstructed_array.type == json_type
    assert reconstructed_array == array

    # Scalar class wiring
    assert json_type.__arrow_ext_scalar_class__() == pa.JsonScalar
    assert isinstance(array[0], pa.JsonScalar)
74 changes: 74 additions & 0 deletions python/pyarrow/types.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1774,6 +1774,43 @@ cdef class ExtensionType(BaseExtensionType):
return ExtensionScalar


cdef class JsonType(BaseExtensionType):
    """
    Concrete class for JSON extension type.

    Examples
    --------
    Define the extension type for JSON array

    >>> import pyarrow as pa
    >>> json_type = pa.json(pa.large_utf8())

    Create an extension array

    >>> arr = [None, '{ "id":30, "values":["a", "b"] }']
    >>> storage = pa.array(arr, pa.large_utf8())
    >>> pa.ExtensionArray.from_storage(json_type, storage)
    <pyarrow.lib.JsonArray object at ...>
    [
      null,
      { "id":30, "values":["a", "b"] }
    ]
    """

    cdef void init(self, const shared_ptr[CDataType]& type) except *:
        # Let the base class store the type, then keep a typed pointer to it.
        BaseExtensionType.init(self, type)
        self.json_ext_type = <const CJsonType*> type.get()

    def __arrow_ext_class__(self):
        # Array class used for arrays of this type.
        return JsonArray

    def __reduce__(self):
        # Rebuild via json(storage_type). This class defines no `value_type`
        # attribute, so the storage type is what must be passed back to the
        # factory when pickling.
        return json, (self.storage_type,)

    def __arrow_ext_scalar_class__(self):
        # Scalar class used for elements of this type.
        return JsonScalar


cdef class UuidType(BaseExtensionType):
"""
Concrete class for UUID extension type.
Expand Down Expand Up @@ -5236,6 +5273,43 @@ def run_end_encoded(run_end_type, value_type):
return pyarrow_wrap_data_type(ree_type)


def json(DataType storage_type):
    """
    Create instance of JSON extension type.

    Parameters
    ----------
    storage_type : DataType
        The underlying data type.

    Returns
    -------
    type : JsonType

    Examples
    --------
    Create an instance of JSON extension type:

    >>> import pyarrow as pa
    >>> pa.json(pa.utf8())
    JsonType(arrow.json)

    Use the JSON type to create an array:

    >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json(pa.utf8()))
    <pyarrow.lib.JsonArray object at ...>
    [
      {"a": 1},
      {"b": 2}
    ]
    """
    cdef:
        JsonType json_type = JsonType.__new__(JsonType)
        shared_ptr[CDataType] c_type

    # Build the C++ type first; GetResultValue unwraps the CResult from Make.
    c_type = GetResultValue(CJsonType.Make(storage_type.sp_type))
    json_type.init(c_type)
    return json_type


def uuid():
"""
Create UuidType instance.
Expand Down

0 comments on commit 8f95269

Please sign in to comment.