Skip to content

Commit 0020c50

Browse files
authored
Fix: Refactor parser config handling and add GraphRAG defaults (#8778)
### What problem does this PR solve?

- Update `get_parser_config` to merge a provided config with the defaults for its chunk method
- Add GraphRAG configuration defaults for every chunk method that defines a default parser config
- Make the `raptor` and `graphrag` fields non-nullable in the `ParserConfig` schema
- Update related test cases to reflect the config changes
- Ensure backward compatibility while adding new GraphRAG support
- #8396

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
1 parent c3b8d8b commit 0020c50

File tree

8 files changed

+179
-28
lines changed

8 files changed

+179
-28
lines changed

api/utils/api_utils.py

Lines changed: 39 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -351,28 +351,47 @@ def generate_confirmation_token(tenant_id):
351351

352352

353353
def get_parser_config(chunk_method, parser_config):
354-
if parser_config:
355-
return parser_config
356354
if not chunk_method:
357355
chunk_method = "naive"
356+
357+
# Define default configurations for each chunk method
358358
key_mapping = {
359-
"naive": {"chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}},
360-
"qa": {"raptor": {"use_raptor": False}},
359+
"naive": {"chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
360+
"qa": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
361361
"tag": None,
362362
"resume": None,
363-
"manual": {"raptor": {"use_raptor": False}},
363+
"manual": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
364364
"table": None,
365-
"paper": {"raptor": {"use_raptor": False}},
366-
"book": {"raptor": {"use_raptor": False}},
367-
"laws": {"raptor": {"use_raptor": False}},
368-
"presentation": {"raptor": {"use_raptor": False}},
365+
"paper": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
366+
"book": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
367+
"laws": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
368+
"presentation": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
369369
"one": None,
370-
"knowledge_graph": {"chunk_token_num": 8192, "delimiter": r"\n", "entity_types": ["organization", "person", "location", "event", "time"]},
370+
"knowledge_graph": {
371+
"chunk_token_num": 8192,
372+
"delimiter": r"\n",
373+
"entity_types": ["organization", "person", "location", "event", "time"],
374+
"raptor": {"use_raptor": False},
375+
"graphrag": {"use_graphrag": False},
376+
},
371377
"email": None,
372378
"picture": None,
373379
}
374-
parser_config = key_mapping[chunk_method]
375-
return parser_config
380+
381+
default_config = key_mapping[chunk_method]
382+
383+
# If no parser_config provided, return default
384+
if not parser_config:
385+
return default_config
386+
387+
# If parser_config is provided, merge with defaults to ensure required fields exist
388+
if default_config is None:
389+
return parser_config
390+
391+
# Ensure raptor and graphrag fields have default values if not provided
392+
merged_config = deep_merge(default_config, parser_config)
393+
394+
return merged_config
376395

377396

378397
def get_data_openai(
@@ -602,17 +621,14 @@ def get_mcp_tools(mcp_servers: list, timeout: float | int = 10) -> tuple[dict, s
602621

603622
TimeoutException = Union[Type[BaseException], BaseException]
604623
OnTimeoutCallback = Union[Callable[..., Any], Coroutine[Any, Any, Any]]
605-
def timeout(
606-
seconds: float |int = None,
607-
attempts: int = 2,
608-
*,
609-
exception: Optional[TimeoutException] = None,
610-
on_timeout: Optional[OnTimeoutCallback] = None
611-
):
624+
625+
626+
def timeout(seconds: float | int = None, attempts: int = 2, *, exception: Optional[TimeoutException] = None, on_timeout: Optional[OnTimeoutCallback] = None):
612627
def decorator(func):
613628
@wraps(func)
614629
def wrapper(*args, **kwargs):
615630
result_queue = queue.Queue(maxsize=1)
631+
616632
def target():
617633
try:
618634
result = func(*args, **kwargs)
@@ -644,7 +660,7 @@ async def async_wrapper(*args, **kwargs) -> Any:
644660
with trio.fail_after(seconds):
645661
return await func(*args, **kwargs)
646662
except trio.TooSlowError:
647-
if a < attempts -1:
663+
if a < attempts - 1:
648664
continue
649665
if on_timeout is not None:
650666
if callable(on_timeout):
@@ -668,11 +684,11 @@ async def async_wrapper(*args, **kwargs) -> Any:
668684
if asyncio.iscoroutinefunction(func):
669685
return async_wrapper
670686
return wrapper
687+
671688
return decorator
672689

673690

674691
async def is_strong_enough(chat_model, embedding_model):
675-
676692
@timeout(30, 2)
677693
async def _is_strong_enough():
678694
nonlocal chat_model, embedding_model
@@ -681,11 +697,11 @@ async def _is_strong_enough():
681697
_ = await trio.to_thread.run_sync(lambda: embedding_model.encode(["Are you strong enough!?"]))
682698
if chat_model:
683699
with trio.fail_after(30):
684-
res = await trio.to_thread.run_sync(lambda: chat_model.chat("Nothing special.", [{"role":"user", "content": "Are you strong enough!?"}], {}))
700+
res = await trio.to_thread.run_sync(lambda: chat_model.chat("Nothing special.", [{"role": "user", "content": "Are you strong enough!?"}], {}))
685701
if res.find("**ERROR**") >= 0:
686702
raise Exception(res)
687703

688704
# Pressure test for GraphRAG task
689705
async with trio.open_nursery() as nursery:
690706
for _ in range(32):
691-
nursery.start_soon(_is_strong_enough)
707+
nursery.start_soon(_is_strong_enough)

api/utils/validation_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -365,10 +365,10 @@ class ParserConfig(Base):
365365
auto_questions: int = Field(default=0, ge=0, le=10)
366366
chunk_token_num: int = Field(default=512, ge=1, le=2048)
367367
delimiter: str = Field(default=r"\n", min_length=1)
368-
graphrag: GraphragConfig | None = None
368+
graphrag: GraphragConfig = Field(default_factory=lambda: GraphragConfig(use_graphrag=False))
369369
html4excel: bool = False
370370
layout_recognize: str = "DeepDOC"
371-
raptor: RaptorConfig | None = None
371+
raptor: RaptorConfig = Field(default_factory=lambda: RaptorConfig(use_raptor=False))
372372
tag_kb_ids: list[str] = Field(default_factory=list)
373373
topn_tags: int = Field(default=1, ge=1, le=10)
374374
filename_embd_weight: float | None = Field(default=0.1, ge=0.0, le=1.0)

test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -644,6 +644,7 @@ def test_parser_config_empty(self, HttpApiAuth):
644644
"html4excel": False,
645645
"layout_recognize": "DeepDOC",
646646
"raptor": {"use_raptor": False},
647+
"graphrag": {"use_graphrag": False},
647648
}, res
648649

649650
@pytest.mark.p2
@@ -657,6 +658,7 @@ def test_parser_config_unset(self, HttpApiAuth):
657658
"html4excel": False,
658659
"layout_recognize": "DeepDOC",
659660
"raptor": {"use_raptor": False},
661+
"graphrag": {"use_graphrag": False},
660662
}, res
661663

662664
@pytest.mark.p3
@@ -670,6 +672,7 @@ def test_parser_config_none(self, HttpApiAuth):
670672
"html4excel": False,
671673
"layout_recognize": "DeepDOC",
672674
"raptor": {"use_raptor": False},
675+
"graphrag": {"use_graphrag": False},
673676
}, res
674677

675678
@pytest.mark.p2
@@ -695,3 +698,64 @@ def test_unsupported_field(self, HttpApiAuth, payload):
695698
res = create_dataset(HttpApiAuth, payload)
696699
assert res["code"] == 101, res
697700
assert "Extra inputs are not permitted" in res["message"], res
701+
702+
703+
@pytest.mark.usefixtures("clear_datasets")
704+
class TestParserConfigBugFix:
705+
@pytest.mark.p1
706+
def test_parser_config_missing_raptor_and_graphrag(self, HttpApiAuth):
707+
payload = {"name": "test_parser_config_missing_fields", "parser_config": {"chunk_token_num": 1024}}
708+
res = create_dataset(HttpApiAuth, payload)
709+
assert res["code"] == 0, res
710+
711+
parser_config = res["data"]["parser_config"]
712+
assert "raptor" in parser_config, "raptor field should be present"
713+
assert "graphrag" in parser_config, "graphrag field should be present"
714+
assert parser_config["raptor"]["use_raptor"] is False, "raptor.use_raptor should default to False"
715+
assert parser_config["graphrag"]["use_graphrag"] is False, "graphrag.use_graphrag should default to False"
716+
assert parser_config["chunk_token_num"] == 1024, "User-provided chunk_token_num should be preserved"
717+
718+
@pytest.mark.p1
719+
def test_parser_config_with_only_raptor(self, HttpApiAuth):
720+
payload = {"name": "test_parser_config_only_raptor", "parser_config": {"chunk_token_num": 1024, "raptor": {"use_raptor": True}}}
721+
res = create_dataset(HttpApiAuth, payload)
722+
assert res["code"] == 0, res
723+
724+
parser_config = res["data"]["parser_config"]
725+
assert parser_config["raptor"]["use_raptor"] is True, "User-provided raptor.use_raptor should be preserved"
726+
assert "graphrag" in parser_config, "graphrag field should be present"
727+
assert parser_config["graphrag"]["use_graphrag"] is False, "graphrag.use_graphrag should default to False"
728+
729+
@pytest.mark.p1
730+
def test_parser_config_with_only_graphrag(self, HttpApiAuth):
731+
payload = {"name": "test_parser_config_only_graphrag", "parser_config": {"chunk_token_num": 1024, "graphrag": {"use_graphrag": True}}}
732+
res = create_dataset(HttpApiAuth, payload)
733+
assert res["code"] == 0, res
734+
735+
parser_config = res["data"]["parser_config"]
736+
assert "raptor" in parser_config, "raptor field should be present"
737+
assert parser_config["raptor"]["use_raptor"] is False, "raptor.use_raptor should default to False"
738+
assert parser_config["graphrag"]["use_graphrag"] is True, "User-provided graphrag.use_graphrag should be preserved"
739+
740+
@pytest.mark.p1
741+
def test_parser_config_with_both_fields(self, HttpApiAuth):
742+
payload = {"name": "test_parser_config_both_fields", "parser_config": {"chunk_token_num": 1024, "raptor": {"use_raptor": True}, "graphrag": {"use_graphrag": True}}}
743+
res = create_dataset(HttpApiAuth, payload)
744+
assert res["code"] == 0, res
745+
746+
parser_config = res["data"]["parser_config"]
747+
assert parser_config["raptor"]["use_raptor"] is True, "User-provided raptor.use_raptor should be preserved"
748+
assert parser_config["graphrag"]["use_graphrag"] is True, "User-provided graphrag.use_graphrag should be preserved"
749+
750+
@pytest.mark.p2
751+
@pytest.mark.parametrize("chunk_method", ["qa", "manual", "paper", "book", "laws", "presentation"])
752+
def test_parser_config_different_chunk_methods(self, HttpApiAuth, chunk_method):
753+
payload = {"name": f"test_parser_config_{chunk_method}", "chunk_method": chunk_method, "parser_config": {"chunk_token_num": 512}}
754+
res = create_dataset(HttpApiAuth, payload)
755+
assert res["code"] == 0, res
756+
757+
parser_config = res["data"]["parser_config"]
758+
assert "raptor" in parser_config, f"raptor field should be present for {chunk_method}"
759+
assert "graphrag" in parser_config, f"graphrag field should be present for {chunk_method}"
760+
assert parser_config["raptor"]["use_raptor"] is False, f"raptor.use_raptor should default to False for {chunk_method}"
761+
assert parser_config["graphrag"]["use_graphrag"] is False, f"graphrag.use_graphrag should default to False for {chunk_method}"

test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -755,6 +755,7 @@ def test_parser_config_empty(self, HttpApiAuth, add_dataset_func):
755755
"html4excel": False,
756756
"layout_recognize": "DeepDOC",
757757
"raptor": {"use_raptor": False},
758+
"graphrag": {"use_graphrag": False},
758759
}, res
759760

760761
@pytest.mark.p3
@@ -772,6 +773,7 @@ def test_parser_config_none(self, HttpApiAuth, add_dataset_func):
772773
"html4excel": False,
773774
"layout_recognize": "DeepDOC",
774775
"raptor": {"use_raptor": False},
776+
"graphrag": {"use_graphrag": False},
775777
}, res
776778

777779
@pytest.mark.p3
@@ -783,7 +785,7 @@ def test_parser_config_empty_with_chunk_method_change(self, HttpApiAuth, add_dat
783785

784786
res = list_datasets(HttpApiAuth)
785787
assert res["code"] == 0, res
786-
assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res
788+
assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}}, res
787789

788790
@pytest.mark.p3
789791
def test_parser_config_unset_with_chunk_method_change(self, HttpApiAuth, add_dataset_func):
@@ -794,7 +796,7 @@ def test_parser_config_unset_with_chunk_method_change(self, HttpApiAuth, add_dat
794796

795797
res = list_datasets(HttpApiAuth)
796798
assert res["code"] == 0, res
797-
assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res
799+
assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}}, res
798800

799801
@pytest.mark.p3
800802
def test_parser_config_none_with_chunk_method_change(self, HttpApiAuth, add_dataset_func):
@@ -805,7 +807,7 @@ def test_parser_config_none_with_chunk_method_change(self, HttpApiAuth, add_data
805807

806808
res = list_datasets(HttpApiAuth, {"id": dataset_id})
807809
assert res["code"] == 0, res
808-
assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res
810+
assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}}, res
809811

810812
@pytest.mark.p2
811813
@pytest.mark.parametrize(

test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,7 @@ def test_parser_config(
540540
"html4excel": False,
541541
"layout_recognize": "DeepDOC",
542542
"raptor": {"use_raptor": False},
543+
"graphrag": {"use_graphrag": False},
543544
}
544545
else:
545546
for k, v in parser_config.items():

test/testcases/test_sdk_api/test_dataset_mangement/test_create_dataset.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,7 @@ def test_parser_config_empty(self, client):
593593
"html4excel": False,
594594
"layout_recognize": "DeepDOC",
595595
"raptor": {"use_raptor": False},
596+
"graphrag": {"use_graphrag": False},
596597
},
597598
)
598599
parser_config_o = DataSet.ParserConfig(client, {})
@@ -610,6 +611,7 @@ def test_parser_config_unset(self, client):
610611
"html4excel": False,
611612
"layout_recognize": "DeepDOC",
612613
"raptor": {"use_raptor": False},
614+
"graphrag": {"use_graphrag": False},
613615
},
614616
)
615617
payload = {"name": "parser_config_unset"}
@@ -626,6 +628,7 @@ def test_parser_config_none(self, client):
626628
"html4excel": False,
627629
"layout_recognize": "DeepDOC",
628630
"raptor": {"use_raptor": False},
631+
"graphrag": {"use_graphrag": False},
629632
},
630633
)
631634
payload = {"name": "parser_config_empty", "parser_config": None}
@@ -655,3 +658,64 @@ def test_unsupported_field(self, client, payload):
655658
with pytest.raises(Exception) as excinfo:
656659
client.create_dataset(**payload)
657660
assert "got an unexpected keyword argument" in str(excinfo.value), str(excinfo.value)
661+
662+
663+
@pytest.mark.usefixtures("clear_datasets")
664+
class TestParserConfigBugFix:
665+
@pytest.mark.p1
666+
def test_parser_config_missing_raptor_and_graphrag(self, client):
667+
parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 1024})
668+
payload = {"name": "test_parser_config_missing_fields_sdk", "parser_config": parser_config}
669+
dataset = client.create_dataset(**payload)
670+
671+
config = dataset.parser_config
672+
assert hasattr(config, "raptor"), "raptor field should be present"
673+
assert hasattr(config, "graphrag"), "graphrag field should be present"
674+
assert config.raptor.use_raptor is False, "raptor.use_raptor should default to False"
675+
assert config.graphrag.use_graphrag is False, "graphrag.use_graphrag should default to False"
676+
assert config.chunk_token_num == 1024, "User-provided chunk_token_num should be preserved"
677+
678+
@pytest.mark.p1
679+
def test_parser_config_with_only_raptor(self, client):
680+
parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 1024, "raptor": {"use_raptor": True}})
681+
payload = {"name": "test_parser_config_only_raptor_sdk", "parser_config": parser_config}
682+
dataset = client.create_dataset(**payload)
683+
684+
config = dataset.parser_config
685+
assert config.raptor.use_raptor is True, "User-provided raptor.use_raptor should be preserved"
686+
assert hasattr(config, "graphrag"), "graphrag field should be present"
687+
assert config.graphrag.use_graphrag is False, "graphrag.use_graphrag should default to False"
688+
689+
@pytest.mark.p1
690+
def test_parser_config_with_only_graphrag(self, client):
691+
parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 1024, "graphrag": {"use_graphrag": True}})
692+
payload = {"name": "test_parser_config_only_graphrag_sdk", "parser_config": parser_config}
693+
dataset = client.create_dataset(**payload)
694+
695+
config = dataset.parser_config
696+
assert hasattr(config, "raptor"), "raptor field should be present"
697+
assert config.raptor.use_raptor is False, "raptor.use_raptor should default to False"
698+
assert config.graphrag.use_graphrag is True, "User-provided graphrag.use_graphrag should be preserved"
699+
700+
@pytest.mark.p1
701+
def test_parser_config_with_both_fields(self, client):
702+
parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 1024, "raptor": {"use_raptor": True}, "graphrag": {"use_graphrag": True}})
703+
payload = {"name": "test_parser_config_both_fields_sdk", "parser_config": parser_config}
704+
dataset = client.create_dataset(**payload)
705+
706+
config = dataset.parser_config
707+
assert config.raptor.use_raptor is True, "User-provided raptor.use_raptor should be preserved"
708+
assert config.graphrag.use_graphrag is True, "User-provided graphrag.use_graphrag should be preserved"
709+
710+
@pytest.mark.p2
711+
@pytest.mark.parametrize("chunk_method", ["qa", "manual", "paper", "book", "laws", "presentation"])
712+
def test_parser_config_different_chunk_methods(self, client, chunk_method):
713+
parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 512})
714+
payload = {"name": f"test_parser_config_{chunk_method}_sdk", "chunk_method": chunk_method, "parser_config": parser_config}
715+
dataset = client.create_dataset(**payload)
716+
717+
config = dataset.parser_config
718+
assert hasattr(config, "raptor"), f"raptor field should be present for {chunk_method}"
719+
assert hasattr(config, "graphrag"), f"graphrag field should be present for {chunk_method}"
720+
assert config.raptor.use_raptor is False, f"raptor.use_raptor should default to False for {chunk_method}"
721+
assert config.graphrag.use_graphrag is False, f"graphrag.use_graphrag should default to False for {chunk_method}"

test/testcases/test_sdk_api/test_dataset_mangement/test_update_dataset.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,7 @@ def test_parser_config_empty(self, client, add_dataset_func):
641641
"html4excel": False,
642642
"layout_recognize": "DeepDOC",
643643
"raptor": {"use_raptor": False},
644+
"graphrag": {"use_graphrag": False},
644645
},
645646
)
646647
dataset.update({"parser_config": {}})
@@ -660,6 +661,7 @@ def test_parser_config_none(self, client, add_dataset_func):
660661
"html4excel": False,
661662
"layout_recognize": "DeepDOC",
662663
"raptor": {"use_raptor": False},
664+
"graphrag": {"use_graphrag": False},
663665
},
664666
)
665667
dataset.update({"parser_config": None})
@@ -675,6 +677,7 @@ def test_parser_config_empty_with_chunk_method_change(self, client, add_dataset_
675677
client,
676678
{
677679
"raptor": {"use_raptor": False},
680+
"graphrag": {"use_graphrag": False},
678681
},
679682
)
680683
dataset.update({"chunk_method": "qa", "parser_config": {}})

test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,7 @@ def test_parser_config(self, client, add_documents, chunk_method, parser_config,
406406
"html4excel": False,
407407
"layout_recognize": "DeepDOC",
408408
"raptor": {"use_raptor": False},
409+
"graphrag": {"use_graphrag": False},
409410
},
410411
)
411412
assert str(updated_doc.parser_config) == str(expected_config), str(updated_doc)

0 commit comments

Comments
 (0)