-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[DRIVERS-2926] [PYTHON-4577] BSON Binary Vector Subtype Support #1813
base: master
Are you sure you want to change the base?
Changes from 1 commit
245c869
031cd8c
8d4e8a2
2df0d6b
315a115
d74314d
27f13c8
263f8c7
f8bcdef
5435785
5c4d152
f86d040
adcb945
7986cc5
26b8398
28de28a
68235b8
e2a1a3c
bf9758a
e1590aa
c4c7af7
de5a245
43bcce4
41ee0bb
9d52aeb
0d34464
1d49656
2af0ca4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
{ | ||
"description": "Basic Tests of Binary Vectors, subtype 9", | ||
"test_key": "vector", | ||
"tests": [ | ||
{ | ||
"description": "Simple Vector INT8", | ||
"valid": true, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since the invalid tests are separated in a different array, you can probably remove this key There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This one's out of date. |
||
"vector": [127, 7], | ||
"dtype_hex": "0x03", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not clear why expected hex value of dtype needs to be specified separately from the expected BSON encoding of the vector. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need the dtype in addition to the vector/numbers to encode the data. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need to distinguish, for example, between int8, packed_bit, or even float32. |
||
"dtype_alias": "INT8", | ||
"padding": 0, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not clear why padding needs to be specified as an expectation. Isn't that just part of the encoding? Maybe if you add some tests where padding is non-zero it will be clearer. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's not required. There are now tests that include non-zero padding (both invalid and valid ones). |
||
"canonical_bson": "1600000005766563746F7200040000000903007F0700", | ||
"canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"AwB/Bw==\", \"subType\": \"09\"}}}" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't need to test JSON encoding here since that is well-tested elsewhere in the bson corpus |
||
}, | ||
{ | ||
"description": "Simple Vector FLOAT32", | ||
"valid": true, | ||
"vector": [127.0, 7.0], | ||
"dtype_hex": "0x27", | ||
"dtype_alias": "FLOAT32", | ||
"padding": 0, | ||
"canonical_bson": "1C00000005766563746F72000A0000000927000000FE420000E04000", | ||
"canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwAAAP5CAADgQA==\", \"subType\": \"09\"}}}" | ||
}, | ||
{ | ||
"description": "Simple Vector PACKED_BIT", | ||
"valid": true, | ||
"vector": [127, 7], | ||
"dtype_hex": "0x10", | ||
"dtype_alias": "PACKED_BIT", | ||
"padding": 0, | ||
"canonical_bson": "1600000005766563746F7200040000000910007F0700", | ||
"canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAB/Bw==\", \"subType\": \"09\"}}}" | ||
}, | ||
{ | ||
"description": "Empty Vector INT8", | ||
"valid": true, | ||
"vector": [], | ||
"dtype_hex": "0x03", | ||
"dtype_alias": "INT8", | ||
"padding": 0, | ||
"canonical_bson": "1400000005766563746F72000200000009030000", | ||
"canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"AwA=\", \"subType\": \"09\"}}}" | ||
}, | ||
{ | ||
"description": "Empty Vector FLOAT32", | ||
"valid": true, | ||
"vector": [], | ||
"dtype_hex": "0x27", | ||
"dtype_alias": "FLOAT32", | ||
"padding": 0, | ||
"canonical_bson": "1400000005766563746F72000200000009270000", | ||
"canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwA=\", \"subType\": \"09\"}}}" | ||
}, | ||
{ | ||
"description": "Empty Vector PACKED_BIT", | ||
"valid": true, | ||
"vector": [], | ||
"dtype_hex": "0x10", | ||
"dtype_alias": "PACKED_BIT", | ||
"padding": 0, | ||
"canonical_bson": "1400000005766563746F72000200000009100000", | ||
"canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAA=\", \"subType\": \"09\"}}}" | ||
}, | ||
{ | ||
"description": "Infinity Vector FLOAT32", | ||
"valid": true, | ||
"vector": ["-inf", 0.0, "inf"], | ||
"dtype_hex": "0x27", | ||
"dtype_alias": "FLOAT32", | ||
"padding": 0, | ||
"canonical_bson": "2000000005766563746F72000E000000092700000080FF000000000000807F00", | ||
"canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwAAAID/AAAAAAAAgH8=\", \"subType\": \"09\"}}}" | ||
} | ||
], | ||
"invalid": [ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will there be different categories of invalid vectors? Note that in the BSON corpus we have both "decodeErrors" and "parseErrors". |
||
{ | ||
"description": "Overflow Vector INT8", | ||
"valid": false, | ||
"vector": [256], | ||
"dtype_hex": "0x03", | ||
"dtype_alias": "INT8", | ||
"padding": 0 | ||
} | ||
] | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
# Copyright 2024-present MongoDB, Inc. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from __future__ import annotations | ||
|
||
import binascii | ||
import codecs | ||
import functools | ||
import glob | ||
import json | ||
import os | ||
import struct | ||
import sys | ||
from decimal import DecimalException | ||
from pathlib import Path | ||
from test import unittest | ||
|
||
from bson import decode, encode, json_util | ||
from bson.binary import Binary, BinaryVectorDtype | ||
|
||
# Directory holding the json specification files; it sits next to this module.
_TEST_PATH = Path(__file__).parent.joinpath("bson_binary_vector")
|
||
|
||
class TestBSONBinaryVector(unittest.TestCase):
    """Host class for the Binary Vector subtype tests.

    Modeled on the BSON corpus specification tests. No test method is
    written out here: when this module is imported, `create_tests` reads
    the json files found in _TEST_PATH and attaches the generated tests
    to this class. The body of each generated test is the inner function
    `run_test` returned by the test generator `create_test`.
    """
|
||
|
||
def create_test(case_spec):
    """Create a test method from the parsed contents of one json file.

    Naming convention: values taken from the json are "expected" (suffix
    _exp); values produced by the API are "observed" (suffix _obs).

    :param case_spec: Parsed json document. "test_key" names the document
        field that holds the vector, "tests" lists the cases that must
        round-trip, and "invalid" lists the cases that must be rejected.
    :return: A function taking the TestCase instance as its only argument,
        suitable for attaching to a TestCase class as a test method.
    """
    test_key = case_spec.get("test_key")
    # Rejected vectors are kept in a separate top-level "invalid" array, so
    # both arrays have to be visited. Each case is paired with the validity
    # implied by its array, which is used when the case has no "valid" key.
    cases = [(case, True) for case in case_spec.get("tests", [])]
    cases += [(case, False) for case in case_spec.get("invalid", [])]

    def run_test(self):
        for test_case, valid_default in cases:
            description = test_case["description"]
            # One subTest per case: a failure names the case and does not
            # prevent the remaining cases from running.
            with self.subTest(description=description):
                vector_exp = test_case["vector"]
                dtype_alias_exp = test_case.get("dtype_alias")
                # Treat a missing "padding" as 0 rather than None.
                padding_exp = test_case.get("padding", 0)
                # Convert dtype hex string into bytes
                dtype_exp = BinaryVectorDtype(
                    int(test_case["dtype_hex"], 16).to_bytes(1, byteorder="little")
                )

                if not test_case.get("valid", valid_default):
                    # Invalid cases carry no canonical encodings; the only
                    # expectation is that building the Binary fails.
                    with self.assertRaises(struct.error):
                        Binary.from_vector(vector_exp, dtype_exp, padding_exp)
                    continue

                canonical_bson_exp = test_case["canonical_bson"]
                canonical_extjson_exp = test_case["canonical_extjson"]

                # Convert bson hex string to bytes
                cB_exp = bytes.fromhex(canonical_bson_exp)
                decoded_doc = decode(cB_exp)
                binary_obs = decoded_doc[test_key]
                # Handle special float cases like '-inf', which the json
                # stores as strings.
                if dtype_exp in [BinaryVectorDtype.FLOAT32]:
                    vector_exp = [float(x) for x in vector_exp]

                # Test round-tripping canonical bson.
                self.assertEqual(encode(decoded_doc), cB_exp, description)

                # Test BSON to Binary Vector
                vector_obs = binary_obs.as_vector()
                self.assertEqual(vector_obs.dtype, dtype_exp)
                if dtype_alias_exp:
                    self.assertEqual(vector_obs.dtype, BinaryVectorDtype[dtype_alias_exp])
                self.assertEqual(vector_obs.data, vector_exp)
                self.assertEqual(vector_obs.padding, padding_exp)

                # Test Binary Vector to BSON
                binary_from_exp = Binary.from_vector(vector_exp, dtype_exp, padding_exp)
                cB_obs = encode({test_key: binary_from_exp}).hex().upper()
                self.assertEqual(cB_obs, canonical_bson_exp)

                # Test JSON
                self.assertEqual(json_util.loads(canonical_extjson_exp), decoded_doc)
                self.assertEqual(json_util.dumps(decoded_doc), canonical_extjson_exp)

    return run_test
|
||
|
||
def create_tests():
    """Attach one generated test method per json file found in _TEST_PATH.

    The file `<stem>.json` becomes the attribute `test_<stem>` of
    TestBSONBinaryVector; its value is the function returned by
    `create_test` for the parsed file contents.
    """
    for filename in _TEST_PATH.glob("*.json"):
        # Path.open is used instead of codecs.open, which is deprecated as of
        # Python 3.14 in favour of the builtin open().
        with filename.open(encoding="utf-8") as test_file:
            test_method = create_test(json.load(test_file))
        setattr(TestBSONBinaryVector, "test_" + filename.stem, test_method)
|
||
|
||
# Generate the test methods as soon as the module is imported, so they are
# already attached to TestBSONBinaryVector when the tests are collected.
create_tests()


if __name__ == "__main__":
    unittest.main()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's follow the rest of the BSON corpus and name this field "valid".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The format now follows one of the standards I saw.