From 5192de018f9f88bc579e46b5589f47c56a64058e Mon Sep 17 00:00:00 2001 From: Ben Bellick Date: Thu, 9 Oct 2025 10:24:17 -0400 Subject: [PATCH 1/3] chore: remove unused substrait-cpp submodule Also drop json test which depends on contents of substrait-cpp library --- .gitmodules | 3 -- tests/test_json.py | 71 --------------------------------------- third_party/substrait-cpp | 1 - update_cpp.sh | 5 --- 4 files changed, 80 deletions(-) delete mode 100644 tests/test_json.py delete mode 160000 third_party/substrait-cpp delete mode 100644 update_cpp.sh diff --git a/.gitmodules b/.gitmodules index a72a4f8..d9705e1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ [submodule "third_party/substrait"] path = third_party/substrait url = https://github.com/substrait-io/substrait -[submodule "third_party/substrait-cpp"] - path = third_party/substrait-cpp - url = https://github.com/substrait-io/substrait-cpp diff --git a/tests/test_json.py b/tests/test_json.py deleted file mode 100644 index 741bd87..0000000 --- a/tests/test_json.py +++ /dev/null @@ -1,71 +0,0 @@ -import os -import pathlib -import tempfile - -from substrait.proto import Plan -from substrait.json import load_json, parse_json, dump_json, write_json - -import pytest - - -JSON_FIXTURES = ( - pathlib.Path(os.path.dirname(__file__)) - / ".." - / "third_party" - / "substrait-cpp" - / "src" - / "substrait" - / "textplan" - / "data" -) -JSON_TEST_FILE = sorted(JSON_FIXTURES.glob("*.json")) -JSON_TEST_FILENAMES = [path.name for path in JSON_TEST_FILE] - - -@pytest.mark.parametrize("jsonfile", JSON_TEST_FILE, ids=JSON_TEST_FILENAMES) -def test_json_load(jsonfile): - with open(jsonfile) as f: - jsondata = _strip_json_comments(f) - parsed_plan = parse_json(jsondata) - - # Save to a temporary file so we can test load_json - # on content stripped of comments. - with tempfile.TemporaryDirectory() as tmpdir: - # We use a TemporaryDirectory as on Windows NamedTemporaryFile - # doesn't allow for easy reopening of the file. - with open(pathlib.Path(tmpdir) / "jsonfile.json", "w+") as stripped_file: - stripped_file.write(jsondata) - loaded_plan = load_json(stripped_file.name) - - # The Plan constructor itself will throw an exception - # in case there is anything wrong in parsing the JSON - # so we can take for granted that if the plan was created - # it is a valid plan in terms of protobuf definition. - assert type(loaded_plan) is Plan - - # Ensure that when loading from file or from string - # the outcome is the same - assert parsed_plan == loaded_plan - - -@pytest.mark.parametrize("jsonfile", JSON_TEST_FILE, ids=JSON_TEST_FILENAMES) -def test_json_roundtrip(jsonfile): - with open(jsonfile) as f: - jsondata = _strip_json_comments(f) - - parsed_plan = parse_json(jsondata) - assert parse_json(dump_json(parsed_plan)) == parsed_plan - - # Test with write/load - with tempfile.TemporaryDirectory() as tmpdir: - filename = pathlib.Path(tmpdir) / "jsonfile.json" - write_json(parsed_plan, filename) - assert load_json(filename) == parsed_plan - - -def _strip_json_comments(jsonfile): - # The JSON files in the cpp testsuite are prefixed with - # a comment containing the SQL that matches the json plan. - # As Python JSON parser doesn't support comments, - # we have to strip them to make the content readable - return "\n".join(line for line in jsonfile.readlines() if line[0] != "#") diff --git a/third_party/substrait-cpp b/third_party/substrait-cpp deleted file mode 160000 index e26585f..0000000 --- a/third_party/substrait-cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e26585f45cdfd0ed3bf03f700c354f04685398c6 diff --git a/update_cpp.sh b/update_cpp.sh deleted file mode 100644 index 804546a..0000000 --- a/update_cpp.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -echo "Updating substrait-cpp submodule..." -git submodule update --remote third_party/substrait-cpp - From 94a582d8fef5f114a921061d27398725b16e2028 Mon Sep 17 00:00:00 2001 From: Ben Bellick Date: Thu, 9 Oct 2025 10:47:19 -0400 Subject: [PATCH 2/3] feat: drop public API for unsupported JSON representation There is no official substrait JSON API, though users are more than welcome to use the official JSON representation of protobufs in general. Considering that these functions were just thin wrappers around the protobuf JSON api and that the tests are really just testing another libraries code, it made sense to remove them. BREAKING CHANGE: drop functions handling JSON versions of plans. --- src/substrait/json.py | 25 ------------------------- tests/sql/test_sql_to_substrait.py | 4 ++-- 2 files changed, 2 insertions(+), 27 deletions(-) delete mode 100644 src/substrait/json.py diff --git a/src/substrait/json.py b/src/substrait/json.py deleted file mode 100644 index ed27d44..0000000 --- a/src/substrait/json.py +++ /dev/null @@ -1,25 +0,0 @@ -from google.protobuf import json_format - -from substrait.proto import Plan - - -def load_json(filename): - """Load a Substrait Plan from a json file""" - with open(filename, encoding="utf-8") as f: - return parse_json(f.read()) - - -def parse_json(text): - """Generate a Substrait Plan from its JSON definition""" - return json_format.Parse(text=text, message=Plan()) - - -def write_json(plan, filename): - """Write a Substrait Plan to a json file""" - with open(filename, "w+") as f: - f.write(dump_json(plan)) - - -def dump_json(plan): - """Dump a Substrait Plan to a string in JSON format""" - return json_format.MessageToJson(plan) diff --git a/tests/sql/test_sql_to_substrait.py b/tests/sql/test_sql_to_substrait.py index ec33ee8..4b36431 100644 --- a/tests/sql/test_sql_to_substrait.py +++ b/tests/sql/test_sql_to_substrait.py @@ -1,6 +1,6 @@ from substrait.sql.sql_to_substrait import convert import pyarrow -import substrait.json +from google.protobuf import json_format import tempfile import pyarrow.substrait as pa_substrait import pytest @@ -91,7 +91,7 @@ def duckdb_schema_resolver(name: str): conn.install_extension("substrait", repository="community") conn.load_extension("substrait") - plan_json = substrait.json.dump_json(plan) + plan_json = json_format.MessageToJson(plan) sql = f"CALL from_substrait_json('{plan_json}')" substrait_out = conn.sql(sql) From a5285e4e984375c8bf964d1023c3593555e8c3bb Mon Sep 17 00:00:00 2001 From: Ben Bellick Date: Thu, 9 Oct 2025 16:04:09 -0400 Subject: [PATCH 3/3] docs: update README to reflect usage of protobuf lib for JSON parsing --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4deeba2..9587aa0 100644 --- a/README.md +++ b/README.md @@ -136,12 +136,13 @@ relations { ``` ## Load a Substrait Plan from JSON -A substrait plan can be loaded from its JSON representation -using the ``substrait.json.load_json`` and ``substrait.json.parse_json`` +A substrait plan can be loaded [from the JSON representation +of the protobuf message](https://protobuf.dev/programming-guides/json/) using the [`protobuf` python library](https://pypi.org/project/protobuf/): functions: ``` ->>> import substrait.json +>>> from substrait.proto import Plan +>>> from google.protobuf import json_format >>> jsontext = """{ ... "relations":[ ... { @@ -182,7 +183,7 @@ functions: ... } ... ] ... }""" ->>> substrait.json.parse_json(jsontext) +>>> json_format.Parse(text=jsontext, mesage=Plan()) relations { root { input {