diff --git a/c/tests/test_core.c b/c/tests/test_core.c index 1b97e4485f..817d958f4a 100644 --- a/c/tests/test_core.c +++ b/c/tests/test_core.c @@ -25,6 +25,7 @@ #include "testlib.h" #include #include +#include #include @@ -82,6 +83,141 @@ test_generate_uuid(void) CU_ASSERT_STRING_NOT_EQUAL(uuid, other_uuid); } +static void +set_u64_le(uint8_t *dest, uint64_t value) +{ + dest[0] = (uint8_t)(value & 0xFF); + dest[1] = (uint8_t)((value >> 8) & 0xFF); + dest[2] = (uint8_t)((value >> 16) & 0xFF); + dest[3] = (uint8_t)((value >> 24) & 0xFF); + dest[4] = (uint8_t)((value >> 32) & 0xFF); + dest[5] = (uint8_t)((value >> 40) & 0xFF); + dest[6] = (uint8_t)((value >> 48) & 0xFF); + dest[7] = (uint8_t)((value >> 56) & 0xFF); +} + +static void +test_json_binary_metadata_get_blob(void) +{ + int ret; + char metadata[128]; + const char *json; + tsk_size_t json_buffer_length; + const uint8_t *blob; + tsk_size_t blob_length; + uint8_t *bytes; + tsk_size_t metadata_length; + size_t header_length; + size_t json_length; + size_t payload_length; + size_t total_length; + const char json_payload[] = "{\"a\":1}"; + const uint8_t binary_payload[] = { 0x01, 0x02, 0x03, 0x04 }; + const uint8_t empty_payload[] = { 0 }; + + bytes = (uint8_t *) metadata; + header_length = 4 + 1 + 8 + 8; + json_length = strlen(json_payload); + payload_length = sizeof(binary_payload); + total_length = header_length + json_length + payload_length; + CU_ASSERT_FATAL(total_length <= sizeof(metadata)); + memset(metadata, 0, sizeof(metadata)); + bytes[0] = 'J'; + bytes[1] = 'B'; + bytes[2] = 'L'; + bytes[3] = 'B'; + bytes[4] = 1; + set_u64_le(bytes + 5, (uint64_t) json_length); + set_u64_le(bytes + 13, (uint64_t) payload_length); + memcpy(bytes + header_length, json_payload, json_length); + memcpy(bytes + header_length + json_length, binary_payload, payload_length); + metadata_length = (tsk_size_t) total_length; + ret = tsk_json_binary_metadata_get_blob( + metadata, metadata_length, &json, &json_buffer_length, &blob, 
&blob_length); + CU_ASSERT_EQUAL(ret, 0); + CU_ASSERT_PTR_EQUAL(json, (const char *) bytes + header_length); + CU_ASSERT_EQUAL(json_buffer_length, (tsk_size_t) json_length); + if (json_length > 0) { + CU_ASSERT_EQUAL(memcmp(json, json_payload, json_length), 0); + } + CU_ASSERT_PTR_EQUAL(blob, bytes + header_length + json_length); + CU_ASSERT_EQUAL(blob_length, (tsk_size_t) payload_length); + CU_ASSERT_EQUAL(memcmp(blob, binary_payload, payload_length), 0); + + payload_length = 0; + total_length = header_length + json_length + payload_length; + CU_ASSERT_FATAL(total_length <= sizeof(metadata)); + set_u64_le(bytes + 13, (uint64_t) payload_length); + metadata_length = (tsk_size_t) total_length; + ret = tsk_json_binary_metadata_get_blob( + metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); + CU_ASSERT_EQUAL(ret, 0); + CU_ASSERT_PTR_EQUAL(json, (const char *) bytes + header_length); + CU_ASSERT_EQUAL(json_buffer_length, (tsk_size_t) json_length); + CU_ASSERT_EQUAL(blob_length, (tsk_size_t) payload_length); + CU_ASSERT_PTR_EQUAL(blob, bytes + header_length + json_length); + + json_length = 0; + payload_length = sizeof(empty_payload); + total_length = header_length + json_length + payload_length; + CU_ASSERT_FATAL(total_length <= sizeof(metadata)); + set_u64_le(bytes + 5, (uint64_t) json_length); + set_u64_le(bytes + 13, (uint64_t) payload_length); + memcpy(bytes + header_length + json_length, empty_payload, payload_length); + metadata_length = (tsk_size_t) total_length; + ret = tsk_json_binary_metadata_get_blob( + metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); + CU_ASSERT_EQUAL(ret, 0); + CU_ASSERT_PTR_EQUAL(json, (const char *) bytes + header_length); + CU_ASSERT_EQUAL(json_buffer_length, (tsk_size_t) json_length); + CU_ASSERT_EQUAL(blob_length, (tsk_size_t) payload_length); + CU_ASSERT_PTR_EQUAL(blob, bytes + header_length + json_length); + CU_ASSERT_EQUAL(memcmp(blob, empty_payload, payload_length), 0); + + blob 
= NULL; + blob_length = 0; + json = NULL; + json_buffer_length = 0; + metadata_length = header_length - 1; + ret = tsk_json_binary_metadata_get_blob( + metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); + CU_ASSERT_EQUAL(ret, TSK_ERR_FILE_FORMAT); + + metadata_length = (tsk_size_t) total_length; + bytes[0] = 'X'; + ret = tsk_json_binary_metadata_get_blob( + metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); + CU_ASSERT_EQUAL(ret, TSK_ERR_FILE_FORMAT); + bytes[0] = 'J'; + + bytes[4] = 2; + ret = tsk_json_binary_metadata_get_blob( + metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); + CU_ASSERT_EQUAL(ret, TSK_ERR_FILE_VERSION_TOO_NEW); + bytes[4] = 1; + + metadata_length = (tsk_size_t)(total_length - 1); + ret = tsk_json_binary_metadata_get_blob( + metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); + CU_ASSERT_EQUAL(ret, TSK_ERR_FILE_FORMAT); + + ret = tsk_json_binary_metadata_get_blob( + NULL, metadata_length, &json, &json_buffer_length, &blob, &blob_length); + CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); + ret = tsk_json_binary_metadata_get_blob( + metadata, metadata_length, NULL, &json_buffer_length, &blob, &blob_length); + CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); + ret = tsk_json_binary_metadata_get_blob( + metadata, metadata_length, &json, NULL, &blob, &blob_length); + CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); + ret = tsk_json_binary_metadata_get_blob( + metadata, metadata_length, &json, &json_buffer_length, NULL, &blob_length); + CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); + ret = tsk_json_binary_metadata_get_blob( + metadata, metadata_length, &json, &json_buffer_length, &blob, NULL); + CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); +} + static void test_double_round(void) { @@ -652,6 +788,7 @@ main(int argc, char **argv) { "test_strerror", test_strerror }, { "test_strerror_kastore", test_strerror_kastore }, { "test_generate_uuid", 
test_generate_uuid }, + { "test_json_binary_metadata_get_blob", test_json_binary_metadata_get_blob }, { "test_double_round", test_double_round }, { "test_blkalloc", test_blkalloc }, { "test_unknown_time", test_unknown_time }, diff --git a/c/tskit/core.c b/c/tskit/core.c index 5e5f828943..dcce6fb703 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -33,6 +33,9 @@ #include #define UUID_NUM_BYTES 16 +#define TSK_JSON_BINARY_HEADER_SIZE 21 + +static const uint8_t TSK_JSON_BINARY_MAGIC[4] = { 'J', 'B', 'L', 'B' }; #if defined(_WIN32) @@ -95,6 +98,22 @@ get_random_bytes(uint8_t *buf) #endif +static uint64_t +tsk_load_u64_le(const uint8_t *p) +{ + uint64_t value; + + value = (uint64_t) p[0]; + value |= (uint64_t) p[1] << 8; + value |= (uint64_t) p[2] << 16; + value |= (uint64_t) p[3] << 24; + value |= (uint64_t) p[4] << 32; + value |= (uint64_t) p[5] << 40; + value |= (uint64_t) p[6] << 48; + value |= (uint64_t) p[7] << 56; + return value; +} + /* Generate a new UUID4 using a system-generated source of randomness. * Note that this function writes a NULL terminator to the end of this * string, so that the total length of the buffer must be 37 bytes. 
@@ -121,6 +140,57 @@ tsk_generate_uuid(char *dest, int TSK_UNUSED(flags)) out: return ret; } + +int +tsk_json_binary_metadata_get_blob(const char *metadata, tsk_size_t metadata_length, + const char **json, tsk_size_t *json_length, const uint8_t **blob, + tsk_size_t *blob_length) +{ + int ret; + uint8_t version; + uint64_t json_length_u64; + uint64_t binary_length_u64; + const uint8_t *bytes; + const uint8_t *blob_start; + const char *json_start; + + if (metadata == NULL || json == NULL || json_length == NULL || blob == NULL + || blob_length == NULL) { + ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); + goto out; + } + bytes = (const uint8_t *) metadata; + if (metadata_length < TSK_JSON_BINARY_HEADER_SIZE) { + ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); + goto out; + } + if (memcmp(bytes, TSK_JSON_BINARY_MAGIC, sizeof(TSK_JSON_BINARY_MAGIC)) != 0) { + ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); + goto out; + } + version = bytes[4]; + if (version != 1) { + ret = tsk_trace_error(TSK_ERR_FILE_VERSION_TOO_NEW); + goto out; + } + json_length_u64 = tsk_load_u64_le(bytes + 5); + binary_length_u64 = tsk_load_u64_le(bytes + 13); + /* Validate the untrusted lengths via subtraction so the sum cannot wrap; + the header-size check above guarantees the first subtraction is safe. */ + if (json_length_u64 > (uint64_t) metadata_length - TSK_JSON_BINARY_HEADER_SIZE + || binary_length_u64 > ((uint64_t) metadata_length - TSK_JSON_BINARY_HEADER_SIZE) - json_length_u64) { + ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); + goto out; + } + json_start = (const char *) bytes + TSK_JSON_BINARY_HEADER_SIZE; + blob_start = bytes + TSK_JSON_BINARY_HEADER_SIZE + json_length_u64; + *json = json_start; + *json_length = (tsk_size_t) json_length_u64; + *blob = blob_start; + *blob_length = (tsk_size_t) binary_length_u64; + ret = 0; +out: + return ret; +} static const char * tsk_strerror_internal(int err) { diff --git a/c/tskit/core.h b/c/tskit/core.h index 76ac086957..f54a126130 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -1088,6 +1088,31 @@ bool tsk_isfinite(double val); #define TSK_UUID_SIZE 36 int tsk_generate_uuid(char *dest, int flags); +/** +@brief Extract the binary payload from ``json+binary`` encoded 
metadata. + +@rst +Metadata produced by :py:class:`tskit.metadata.JSONBinaryCodec` consists of a fixed-size +header followed by canonical JSON bytes and an optional binary payload. This helper +validates the ``json+binary`` framing, returning pointers to the embedded JSON and binary +sections without copying. + +The output pointers reference memory owned by the caller and remain valid only while +the original metadata buffer is alive. +@endrst + +@param[in] metadata Pointer to the encoded metadata bytes. +@param[in] metadata_length Number of bytes available at ``metadata``. +@param[out] json On success, set to the start of the JSON bytes. +@param[out] json_length On success, set to the JSON length in bytes. +@param[out] blob On success, set to the start of the binary payload. +@param[out] blob_length On success, set to the payload length in bytes. +@return 0 on success, or a :ref:`TSK_ERR ` code on failure. +*/ +int tsk_json_binary_metadata_get_blob(const char *metadata, tsk_size_t metadata_length, + const char **json, tsk_size_t *json_length, const uint8_t **blob, + tsk_size_t *blob_length); + /* TODO most of these can probably be macros so they compile out as no-ops. * Lets do the 64 bit tsk_size_t switch first though. */ void *tsk_malloc(tsk_size_t size); diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 28cf1496b6..fe15bd5f70 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -54,6 +54,9 @@ allowing greater flexibility in "disjoint union" situations. (:user:`hyanwong`, :user:`petrelharp`, :issue:`3181`) +- Add ``json+binary`` metadata codec that allows storing binary data + alongside JSON metadata. 
(:user:`benjeffery`, :pr:`3306`) + **Bugfixes** - In some tables with mutations out-of-order ``TableCollection.sort`` did not re-order diff --git a/python/tests/test_metadata.py b/python/tests/test_metadata.py index edecb44bde..f0b421f6a5 100644 --- a/python/tests/test_metadata.py +++ b/python/tests/test_metadata.py @@ -597,6 +597,133 @@ def test_zero_length(self): assert ms.decode_row(b"") == {} +class TestJSONBinaryCodec: + def test_encode_requires_binary(self): + ms = tskit.MetadataSchema({"codec": "json+binary"}) + with pytest.raises( + exceptions.MetadataEncodingError, + match="requires top-level '_binary' bytes-like value", + ): + ms.validate_and_encode_row({}) + + def test_zero_length_blob(self): + ms = tskit.MetadataSchema({"codec": "json+binary"}) + encoded = ms.validate_and_encode_row({"_binary": b""}) + decoded = ms.decode_row(encoded) + assert isinstance(decoded["_binary"], memoryview) + assert len(decoded["_binary"]) == 0 + # JSON portion was empty + assert set(decoded.keys()) == {"_binary"} + + def test_round_trip_with_blob_and_json(self): + ms = tskit.MetadataSchema({"codec": "json+binary"}) + blob = b"\x00\x01\x02hello" + row = {"label": "alpha", "count": 7, "_binary": blob} + encoded = ms.validate_and_encode_row(row) + out = ms.decode_row(encoded) + assert out["label"] == "alpha" + assert out["count"] == 7 + assert isinstance(out["_binary"], memoryview) + assert out["_binary"].tobytes() == blob + + def test_decode_without_magic_errors(self): + ms = tskit.MetadataSchema({"codec": "json+binary"}) + # Plain JSON is not acceptable for this codec + with pytest.raises(ValueError, match="missing magic header"): + ms.decode_row(b"{}") + + def test_simple_default(self): + schema = { + "codec": "json+binary", + "type": "object", + "properties": {"number": {"type": "number", "default": 5}}, + } + ms = tskit.MetadataSchema(schema) + # With json+binary, we need to provide _binary even for empty metadata + assert 
ms.decode_row(ms.validate_and_encode_row({"_binary": b""})) == { + "number": 5, + "_binary": memoryview(b""), + } + assert ms.decode_row( + ms.validate_and_encode_row({"_binary": b"", "number": 42}) + ) == {"number": 42, "_binary": memoryview(b"")} + + def test_nested_default_error(self): + schema = { + "codec": "json+binary", + "type": "object", + "properties": { + "obj": { + "type": "object", + "properties": { + "nested_obj_no_default": { + "type": "object", + "properties": {}, + }, + "nested_obj": { + "type": "object", + "properties": {}, + "default": {"foo": "bar"}, + }, + }, + } + }, + } + with pytest.raises( + tskit.MetadataSchemaValidationError, + match="Defaults can only be specified at the top level for JSON codec", + ): + tskit.MetadataSchema(schema) + + def test_bad_type_error(self): + ms = tskit.MetadataSchema({"codec": "json+binary"}) + # json+binary first checks for _binary key, so we need a dict with _binary + # but other fields that can't be JSON encoded + with pytest.raises( + exceptions.MetadataEncodingError, + match="Could not encode metadata of type TableCollection", + ): + ms.validate_and_encode_row( + {"_binary": b"", "bad_field": tskit.TableCollection(1)} + ) + + def test_skip_validation(self): + ms = tskit.MetadataSchema({"codec": "json+binary"}) + assert ms._bypass_validation + with patch.object(ms, "_validate_row", return_value=True) as mocked_validate: + ms.validate_and_encode_row({"_binary": b""}) + assert mocked_validate.call_count == 0 + + def test_dont_skip_validation(self): + ms = tskit.MetadataSchema({"codec": "json+binary", "properties": {"foo": {}}}) + assert not ms._bypass_validation + with patch.object(ms, "_validate_row", return_value=True) as mocked_validate: + ms.validate_and_encode_row({"_binary": b""}) + assert mocked_validate.call_count == 1 + + def test_binary_requires_buffer_protocol(self): + ms = tskit.MetadataSchema({"codec": "json+binary"}) + with pytest.raises( + exceptions.MetadataEncodingError, + match="_binary 
must be bytes-like \\(buffer protocol\\)", + ): + ms.validate_and_encode_row({"_binary": "not bytes"}) + + def test_decode_version_mismatch(self): + ms = tskit.MetadataSchema({"codec": "json+binary"}) + header = metadata.JSONBinaryCodec._HDR.pack( + metadata.JSONBinaryCodec.MAGIC, + metadata.JSONBinaryCodec.VERSION + 1, + len(b"{}"), + 0, + ) + with pytest.raises( + ValueError, + match="Unsupported json\\+binary version", + ): + ms.decode_row(header + b"{}") + + class TestStructCodec: def encode_decode(self, method_name, sub_schema, obj, buffer): assert ( diff --git a/python/tskit/metadata.py b/python/tskit/metadata.py index c447debab2..c0c11249d6 100644 --- a/python/tskit/metadata.py +++ b/python/tskit/metadata.py @@ -193,6 +193,72 @@ def decode(self, data: bytes) -> bytes: return data +class JSONBinaryCodec(JSONCodec): + """ + A JSON codec that optionally packs a single binary blob alongside the + canonical JSON bytes. The JSON portion is validated using the normal JSON + schema rules; a reserved top-level key "_binary" is ignored for validation + purposes and, if present at encode time, is stored as raw bytes appended + after a small header and the JSON payload. + + On encode, callers MUST supply a top-level "_binary" bytes-like value, + even if zero length. On decode, the returned object will include a + "_binary" key whose value is a memoryview over the decoded bytes. + """ + + MAGIC = b"JBLB" + VERSION = 1 + _HDR = struct.Struct("<4sBQQ") # magic, version, json_len, blob_len + + # Use the same validator behavior as JSONCodec; we do not special-case + # validation for the reserved _binary key. If users set additionalProperties + # to False, providing _binary will fail validation unless declared. 
+ + def encode(self, obj: Any) -> bytes: + # Require a top-level _binary bytes-like entry; zero-length allowed + if not isinstance(obj, dict) or "_binary" not in obj: + raise exceptions.MetadataEncodingError( + "json+binary requires top-level '_binary' bytes-like value" + ) + try: + blob_bytes = memoryview(obj["_binary"]).tobytes() + except TypeError as e: + raise exceptions.MetadataEncodingError( + "_binary must be bytes-like (buffer protocol)" + ) from e + + try: + json_bytes = tskit.canonical_json( + {k: v for k, v in obj.items() if k != "_binary"} + ).encode() + except TypeError as e: + raise exceptions.MetadataEncodingError( + f"Could not encode metadata of type {str(e).split()[3]}" + ) + + header = self._HDR.pack( + self.MAGIC, self.VERSION, len(json_bytes), len(blob_bytes) + ) + return header + json_bytes + blob_bytes + + def decode(self, encoded: bytes) -> Any: + if len(encoded) >= self._HDR.size and encoded[:4] == self.MAGIC: + _, version, jlen, blen = self._HDR.unpack_from(encoded) + if version != self.VERSION: + raise ValueError("Unsupported json+binary version") + start = self._HDR.size + json_bytes = encoded[start : start + jlen] + blob_bytes = encoded[start + jlen : start + jlen + blen] + + result = super().decode(json_bytes) + result["_binary"] = memoryview(blob_bytes) + return result + raise ValueError("Invalid json+binary payload: missing magic header") + + +register_metadata_codec(JSONBinaryCodec, "json+binary") + + def binary_format_validator(validator, types, instance, schema): # We're hooking into jsonschemas validation code here, which works by creating # generators of exceptions, hence the yielding