Skip to content

Commit

Permalink
Expose get_json_object_options to Python (NVIDIA#11180)
Browse files Browse the repository at this point in the history
This PR exposes `get_json_object_options` to the Python API. Addresses rapidsai/cudf#10196

Authors:
  - Srikar Vanavasam (https://github.com/SrikarVanavasam)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Paul Taylor (https://github.com/trxcllnt)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: rapidsai/cudf#11180
  • Loading branch information
SrikarVanavasam authored Jul 11, 2022
1 parent 8c39130 commit 002cb1c
Show file tree
Hide file tree
Showing 5 changed files with 249 additions and 8 deletions.
15 changes: 14 additions & 1 deletion python/cudf/cudf/_lib/cpp/strings/json.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2022, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string

Expand All @@ -9,7 +10,19 @@ from cudf._lib.cpp.scalar.scalar cimport scalar, string_scalar


# Cython declarations for the JSONPath API that libcudf exposes in
# "cudf/strings/json.hpp" (C++ namespace cudf::strings). The whole block is
# declared `nogil`, so these symbols may be used inside `with nogil:`.
cdef extern from "cudf/strings/json.hpp" namespace "cudf::strings" nogil:
# Bundle of boolean flags that is handed to get_json_object() below.
# Each flag has one bool getter and one matching setter.
cdef cppclass get_json_object_options:
# Default constructor. `except +` asks Cython to translate a C++ exception
# into a Python exception; the same applies to every declaration here.
get_json_object_options() except +
# getters: return the current value of the corresponding flag
bool get_allow_single_quotes() except +
bool get_strip_quotes_from_single_strings() except +
bool get_missing_fields_as_nulls() except +
# setters: overwrite the corresponding flag with `val`
void set_allow_single_quotes(bool val) except +
void set_strip_quotes_from_single_strings(bool val) except +
void set_missing_fields_as_nulls(bool val) except +

# Applies the JSONPath held in `json_path` to the strings column `col`,
# using the flags in `options`, and returns a newly owned column.
cdef unique_ptr[column] get_json_object(
column_view col,
string_scalar json_path,
get_json_object_options options,
) except +
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
startswith_multiple,
)
from cudf._lib.strings.findall import findall, findall_record
from cudf._lib.strings.json import get_json_object
from cudf._lib.strings.json import get_json_object, GetJsonObjectOptions
from cudf._lib.strings.padding import PadSide, center, ljust, pad, rjust, zfill
from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence
from cudf._lib.strings.replace import (
Expand Down
54 changes: 51 additions & 3 deletions python/cudf/cudf/_lib/strings/json.pyx
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2022, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.scalar.scalar cimport string_scalar
from cudf._lib.cpp.strings.json cimport get_json_object as cpp_get_json_object
from cudf._lib.cpp.strings.json cimport (
get_json_object as cpp_get_json_object,
get_json_object_options,
)
from cudf._lib.cpp.types cimport size_type
from cudf._lib.scalar cimport DeviceScalar


def get_json_object(Column col, object py_json_path):
def get_json_object(
Column col, object py_json_path, GetJsonObjectOptions options):
"""
Apply a JSONPath string to all rows in an input column
of json strings.
Expand All @@ -25,10 +30,53 @@ def get_json_object(Column col, object py_json_path):
cdef const string_scalar* scalar_json_path = <const string_scalar*>(
json_path.get_raw_ptr()
)

with nogil:
c_result = move(cpp_get_json_object(
col_view,
scalar_json_path[0],
options.options,
))

return Column.from_unique_ptr(move(c_result))


cdef class GetJsonObjectOptions:
    """Python-visible wrapper around a C++ ``get_json_object_options``.

    The wrapped C++ object is stored by value in the ``options``
    attribute. Every flag is exposed as a read/write property that
    forwards to the matching C++ getter or setter.

    Parameters
    ----------
    allow_single_quotes : bool, default False
        Value passed to ``set_allow_single_quotes``.
    strip_quotes_from_single_strings : bool, default True
        Value passed to ``set_strip_quotes_from_single_strings``.
    missing_fields_as_nulls : bool, default False
        Value passed to ``set_missing_fields_as_nulls``.

    All parameters are keyword-only.
    """
    cdef get_json_object_options options

    def __init__(
        self,
        *,
        allow_single_quotes=False,
        strip_quotes_from_single_strings=True,
        missing_fields_as_nulls=False
    ):
        # Push each keyword straight into the wrapped C++ options object.
        self.options.set_allow_single_quotes(allow_single_quotes)
        self.options.set_strip_quotes_from_single_strings(
            strip_quotes_from_single_strings
        )
        self.options.set_missing_fields_as_nulls(missing_fields_as_nulls)

    # -- allow_single_quotes ------------------------------------------------
    @property
    def allow_single_quotes(self):
        """Current value of the ``allow_single_quotes`` flag."""
        return self.options.get_allow_single_quotes()

    @allow_single_quotes.setter
    def allow_single_quotes(self, value):
        self.options.set_allow_single_quotes(value)

    # -- strip_quotes_from_single_strings -----------------------------------
    @property
    def strip_quotes_from_single_strings(self):
        """Current value of the ``strip_quotes_from_single_strings`` flag."""
        return self.options.get_strip_quotes_from_single_strings()

    @strip_quotes_from_single_strings.setter
    def strip_quotes_from_single_strings(self, value):
        self.options.set_strip_quotes_from_single_strings(value)

    # -- missing_fields_as_nulls --------------------------------------------
    @property
    def missing_fields_as_nulls(self):
        """Current value of the ``missing_fields_as_nulls`` flag."""
        return self.options.get_missing_fields_as_nulls()

    @missing_fields_as_nulls.setter
    def missing_fields_as_nulls(self, value):
        self.options.set_missing_fields_as_nulls(value)
35 changes: 32 additions & 3 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -2236,16 +2236,38 @@ def get(self, i: int = 0) -> SeriesOrIndex:

return self._return_or_inplace(libstrings.get(self._column, i))

def get_json_object(self, json_path):
def get_json_object(
self,
json_path,
*,
allow_single_quotes=False,
strip_quotes_from_single_strings=True,
missing_fields_as_nulls=False,
):
r"""
Applies a JSONPath string to an input strings column
where each row in the column is a valid json string
Parameters
----------
json_path: str
json_path : str
The JSONPath string to be applied to each row
of the input column
allow_single_quotes : bool, default False
If True, representing strings with single
quotes is allowed.
If False, strings must only be represented
with double quotes.
strip_quotes_from_single_strings : bool, default True
If True, strip the quotes from the return value of
a given row if it is a string.
If False, values returned for a given row include
quotes if they are strings.
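missing_fields_as_nulls : bool, default False
If True, when an object is queried for a field
it does not contain, the JSON literal "null" is
emitted in the output for that field.
If False, a missing field produces no output: the
row is None when nothing matched, and the field is
omitted from the array returned by a wildcard query.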
Returns
-------
Expand Down Expand Up @@ -2286,9 +2308,16 @@ def get_json_object(self, json_path):
"""

try:
options = libstrings.GetJsonObjectOptions(
allow_single_quotes=allow_single_quotes,
strip_quotes_from_single_strings=(
strip_quotes_from_single_strings
),
missing_fields_as_nulls=missing_fields_as_nulls,
)
res = self._return_or_inplace(
libstrings.get_json_object(
self._column, cudf.Scalar(json_path, "str")
self._column, cudf.Scalar(json_path, "str"), options
)
)
except RuntimeError as e:
Expand Down
151 changes: 151 additions & 0 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -3129,6 +3129,157 @@ def test_string_get_json_object_invalid_JSONPath(json_path):
gs.str.get_json_object(json_path)


def test_string_get_json_object_allow_single_quotes():
    """Exercise ``allow_single_quotes`` on a document mixing quote styles.

    Some keys and values of the document use single quotes. The same
    two JSONPath queries are run with the flag on and off and compared
    against the expected one-row result.
    """
    json_doc = """
{
"store":{
"book":[
{
'author':"Nigel Rees",
"title":'Sayings of the Century',
"price":8.95
},
{
"category":"fiction",
"author":"Evelyn Waugh",
'title':"Sword of Honour",
"price":12.99
}
]
}
}
"""
    gs = cudf.Series([json_doc])

    # (JSONPath, allow_single_quotes, expected single-row result)
    cases = [
        ("$.store.book[0].author", True, ["Nigel Rees"]),
        (
            "$.store.book[*].title",
            True,
            ["['Sayings of the Century',\"Sword of Honour\"]"],
        ),
        ("$.store.book[0].author", False, [None]),
        ("$.store.book[*].title", False, [None]),
    ]

    for json_path, allow, expected_rows in cases:
        got = gs.str.get_json_object(json_path, allow_single_quotes=allow)
        assert_eq(got, cudf.Series(expected_rows))


def test_string_get_json_object_strip_quotes_from_single_strings():
    """Exercise ``strip_quotes_from_single_strings`` on scalar and array hits.

    A query that yields a single string is expected to lose or keep its
    surrounding quotes depending on the flag, while a wildcard query that
    yields an array is expected to be identical either way.
    """
    json_doc = """
{
"store":{
"book":[
{
"author":"Nigel Rees",
"title":"Sayings of the Century",
"price":8.95
},
{
"category":"fiction",
"author":"Evelyn Waugh",
"title":"Sword of Honour",
"price":12.99
}
]
}
}
"""
    gs = cudf.Series([json_doc])

    # (JSONPath, strip_quotes_from_single_strings, expected single-row result)
    cases = [
        ("$.store.book[0].author", True, ["Nigel Rees"]),
        (
            "$.store.book[*].title",
            True,
            ['["Sayings of the Century","Sword of Honour"]'],
        ),
        ("$.store.book[0].author", False, ['"Nigel Rees"']),
        (
            "$.store.book[*].title",
            False,
            ['["Sayings of the Century","Sword of Honour"]'],
        ),
    ]

    for json_path, strip, expected_rows in cases:
        got = gs.str.get_json_object(
            json_path, strip_quotes_from_single_strings=strip
        )
        assert_eq(got, cudf.Series(expected_rows))


def test_string_get_json_object_missing_fields_as_nulls():
    """Exercise ``missing_fields_as_nulls`` for a field only one book has.

    Only the second book carries a ``category``. The queries check what is
    produced for the first book (field absent) and for a wildcard over
    both books, with the flag on and off.
    """
    json_doc = """
{
"store":{
"book":[
{
"author":"Nigel Rees",
"title":"Sayings of the Century",
"price":8.95
},
{
"category":"fiction",
"author":"Evelyn Waugh",
"title":"Sword of Honour",
"price":12.99
}
]
}
}
"""
    gs = cudf.Series([json_doc])

    # (JSONPath, missing_fields_as_nulls, expected single-row result)
    cases = [
        ("$.store.book[0].category", True, ["null"]),
        ("$.store.book[*].category", True, ['[null,"fiction"]']),
        ("$.store.book[0].category", False, [None]),
        ("$.store.book[*].category", False, ['["fiction"]']),
    ]

    for json_path, as_nulls, expected_rows in cases:
        got = gs.str.get_json_object(
            json_path, missing_fields_as_nulls=as_nulls
        )
        assert_eq(got, cudf.Series(expected_rows))


def test_str_join_lists_error():
sr = cudf.Series([["a", "a"], ["b"], ["c"]])

Expand Down

0 comments on commit 002cb1c

Please sign in to comment.