Skip to content

Commit bae6b68

Browse files
committed
Add json+binary codec
1 parent a2a3401 commit bae6b68

File tree

3 files changed

+174
-0
lines changed

3 files changed

+174
-0
lines changed

python/CHANGELOG.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@
5454
allowing greater flexibility in "disjoint union" situations.
5555
(:user:`hyanwong`, :user:`petrelharp`, :issue:`3181`)
5656

57+
- Add ``json+binary`` metadata codec that allows storing binary data
58+
alongside JSON metadata. (:user:`benjeffery`, :pr:`XXXX`)
59+
5760
**Bugfixes**
5861

5962
- In some tables with mutations out-of-order ``TableCollection.sort`` did not re-order

python/tests/test_metadata.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,111 @@ def test_zero_length(self):
597597
assert ms.decode_row(b"") == {}
598598

599599

600+
class TestJSONBinaryCodec:
    """Behavioural tests for the ``json+binary`` metadata codec."""

    def test_encode_requires_binary(self):
        # A row without the reserved "_binary" entry must be rejected.
        schema = tskit.MetadataSchema({"codec": "json+binary"})
        expected = "requires top-level '_binary' bytes-like value"
        with pytest.raises(exceptions.MetadataEncodingError, match=expected):
            schema.validate_and_encode_row({})

    def test_zero_length_blob(self):
        schema = tskit.MetadataSchema({"codec": "json+binary"})
        payload = schema.validate_and_encode_row({"_binary": b""})
        row = schema.decode_row(payload)
        blob = row["_binary"]
        assert isinstance(blob, memoryview)
        assert len(blob) == 0
        # JSON portion was empty
        assert set(row.keys()) == {"_binary"}

    def test_round_trip_with_blob_and_json(self):
        schema = tskit.MetadataSchema({"codec": "json+binary"})
        raw = b"\x00\x01\x02hello"
        payload = schema.validate_and_encode_row(
            {"label": "alpha", "count": 7, "_binary": raw}
        )
        decoded = schema.decode_row(payload)
        assert decoded["label"] == "alpha"
        assert decoded["count"] == 7
        assert isinstance(decoded["_binary"], memoryview)
        assert decoded["_binary"].tobytes() == raw

    def test_decode_without_magic_errors(self):
        schema = tskit.MetadataSchema({"codec": "json+binary"})
        # Plain JSON is not acceptable for this codec
        with pytest.raises(ValueError, match="missing magic header"):
            schema.decode_row(b"{}")

    def test_simple_default(self):
        schema = tskit.MetadataSchema(
            {
                "codec": "json+binary",
                "type": "object",
                "properties": {"number": {"type": "number", "default": 5}},
            }
        )
        # With json+binary, we need to provide _binary even for empty metadata
        defaulted = schema.decode_row(schema.validate_and_encode_row({"_binary": b""}))
        assert defaulted == {"number": 5, "_binary": memoryview(b"")}

        explicit = schema.decode_row(
            schema.validate_and_encode_row({"_binary": b"", "number": 42})
        )
        assert explicit == {"number": 42, "_binary": memoryview(b"")}

    def test_nested_default_error(self):
        # A default below the top level must be refused at schema construction.
        inner_properties = {
            "nested_obj_no_default": {
                "type": "object",
                "properties": {},
            },
            "nested_obj": {
                "type": "object",
                "properties": {},
                "default": {"foo": "bar"},
            },
        }
        schema = {
            "codec": "json+binary",
            "type": "object",
            "properties": {
                "obj": {
                    "type": "object",
                    "properties": inner_properties,
                }
            },
        }
        expected = "Defaults can only be specified at the top level for JSON codec"
        with pytest.raises(tskit.MetadataSchemaValidationError, match=expected):
            tskit.MetadataSchema(schema)

    def test_bad_type_error(self):
        schema = tskit.MetadataSchema({"codec": "json+binary"})
        # json+binary first checks for _binary key, so we need a dict with _binary
        # but other fields that can't be JSON encoded
        unencodable_row = {"_binary": b"", "bad_field": tskit.TableCollection(1)}
        expected = "Could not encode metadata of type TableCollection"
        with pytest.raises(exceptions.MetadataEncodingError, match=expected):
            schema.validate_and_encode_row(unencodable_row)

    def test_skip_validation(self):
        # A schema consisting only of the codec bypasses row validation.
        schema = tskit.MetadataSchema({"codec": "json+binary"})
        assert schema._bypass_validation
        with patch.object(schema, "_validate_row", return_value=True) as validate_mock:
            schema.validate_and_encode_row({"_binary": b""})
            assert validate_mock.call_count == 0

    def test_dont_skip_validation(self):
        # Declaring properties means each encoded row is validated once.
        schema = tskit.MetadataSchema(
            {"codec": "json+binary", "properties": {"foo": {}}}
        )
        assert not schema._bypass_validation
        with patch.object(schema, "_validate_row", return_value=True) as validate_mock:
            schema.validate_and_encode_row({"_binary": b""})
            assert validate_mock.call_count == 1
703+
704+
600705
class TestStructCodec:
601706
def encode_decode(self, method_name, sub_schema, obj, buffer):
602707
assert (

python/tskit/metadata.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,72 @@ def decode(self, data: bytes) -> bytes:
193193
return data
194194

195195

196+
class JSONBinaryCodec(JSONCodec):
    """
    A JSON codec that packs a single binary blob alongside the canonical
    JSON bytes.

    The encoded form is a fixed-size little-endian header (magic, version,
    JSON length, blob length) followed by the JSON payload and then the raw
    blob bytes.

    On encode, callers MUST supply a top-level "_binary" bytes-like value,
    even if zero length. On decode, the returned object will include a
    "_binary" key whose value is a memoryview over the decoded bytes.

    Validation is inherited unchanged from JSONCodec; the "_binary" key gets
    no special treatment in this class.
    """

    MAGIC = b"JBLB"
    VERSION = 1
    _HDR = struct.Struct("<4sBQQ")  # magic, version, json_len, blob_len

    # Use the same validator behavior as JSONCodec; we do not special-case
    # validation for the reserved _binary key. If users set additionalProperties
    # to False, providing _binary will fail validation unless declared.

    def encode(self, obj: Any) -> bytes:
        """
        Encode ``obj`` as header + canonical JSON + blob.

        :param obj: A dict holding a top-level "_binary" bytes-like value
            (zero length allowed); every other key goes into the JSON part.
        :return: The packed bytes.
        :raises MetadataEncodingError: If ``obj`` is not a dict, has no
            "_binary" key, "_binary" is not bytes-like, or the remaining
            entries cannot be JSON encoded.
        """
        # Require a top-level _binary bytes-like entry; zero-length allowed
        if not isinstance(obj, dict) or "_binary" not in obj:
            raise exceptions.MetadataEncodingError(
                "json+binary requires top-level '_binary' bytes-like value"
            )
        try:
            blob_bytes = memoryview(obj["_binary"]).tobytes()
        except TypeError as e:
            raise exceptions.MetadataEncodingError(
                "_binary must be bytes-like (buffer protocol)"
            ) from e

        try:
            json_bytes = tskit.canonical_json(
                {k: v for k, v in obj.items() if k != "_binary"}
            ).encode()
        except TypeError as e:
            detail = str(e)
            # Only the standard "Object of type <name> is not JSON
            # serializable" message carries a type name as its fourth word;
            # for any other TypeError report the message itself rather than
            # an arbitrary word (or an IndexError on a short message).
            if detail.startswith("Object of type "):
                type_name = detail.split()[3]
                message = f"Could not encode metadata of type {type_name}"
            else:
                message = f"Could not encode metadata: {detail}"
            raise exceptions.MetadataEncodingError(message) from e

        header = self._HDR.pack(
            self.MAGIC, self.VERSION, len(json_bytes), len(blob_bytes)
        )
        return header + json_bytes + blob_bytes

    def decode(self, encoded: bytes) -> Any:
        """
        Decode bytes produced by :meth:`encode`.

        :param encoded: The packed header + JSON + blob bytes.
        :return: The decoded JSON object with an added "_binary" key holding
            a memoryview over a copy of the blob bytes.
        :raises ValueError: If the magic header is missing, the version is
            unsupported, the payload is shorter than the header declares, or
            the JSON portion is not an object.
        """
        header_size = self._HDR.size
        if len(encoded) < header_size or encoded[:4] != self.MAGIC:
            raise ValueError("Invalid json+binary payload: missing magic header")

        _, version, jlen, blen = self._HDR.unpack_from(encoded)
        if version != self.VERSION:
            raise ValueError("Unsupported json+binary version")

        json_end = header_size + jlen
        blob_end = json_end + blen
        # Slicing never fails on short input, so without this check a
        # truncated payload would silently produce a shortened blob.
        # Bytes beyond blob_end are ignored.
        if len(encoded) < blob_end:
            raise ValueError(
                "Invalid json+binary payload: truncated "
                f"(header declares {blob_end} bytes, got {len(encoded)})"
            )

        result = super().decode(encoded[header_size:json_end])
        # encode() always writes a JSON object; anything else cannot take the
        # "_binary" key below, so report it as a malformed payload.
        if not isinstance(result, dict):
            raise ValueError(
                "Invalid json+binary payload: JSON portion is not an object"
            )
        result["_binary"] = memoryview(encoded[json_end:blob_end])
        return result
257+
258+
259+
# Register the codec under the name "json+binary", the value used for the
# "codec" entry of a metadata schema.
register_metadata_codec(JSONBinaryCodec, "json+binary")
260+
261+
196262
def binary_format_validator(validator, types, instance, schema):
197263
# We're hooking into jsonschemas validation code here, which works by creating
198264
# generators of exceptions, hence the yielding

0 commit comments

Comments
 (0)