Fix: Refactor parser config handling and add GraphRAG defaults (#8778)

asiroliu · web-flow · commit 0020c50000f7 · 2025-07-23T09:29:37.000+08:00
### What problem does this PR solve?

- Update `get_parser_config` to merge a provided parser config with the chunk method's defaults instead of returning the provided config as-is
- Add GraphRAG configuration defaults for every chunk method that defines a default config (methods whose default is `None` are unchanged)
- Make the `raptor` and `graphrag` fields non-nullable in the `ParserConfig` schema
- Update related test cases to reflect the config changes
- Ensure backward compatibility while adding new GraphRAG support
- #8396

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
diff --git a/api/utils/api_utils.py b/api/utils/api_utils.py
@@ -351,28 +351,47 @@ def generate_confirmation_token(tenant_id):
 
 
 def get_parser_config(chunk_method, parser_config):
-    if parser_config:
-        return parser_config
     if not chunk_method:
         chunk_method = "naive"
+
+    # Define default configurations for each chunk method
     key_mapping = {
-        "naive": {"chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}},
-        "qa": {"raptor": {"use_raptor": False}},
+        "naive": {"chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
+        "qa": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
         "tag": None,
         "resume": None,
-        "manual": {"raptor": {"use_raptor": False}},
+        "manual": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
         "table": None,
-        "paper": {"raptor": {"use_raptor": False}},
-        "book": {"raptor": {"use_raptor": False}},
-        "laws": {"raptor": {"use_raptor": False}},
-        "presentation": {"raptor": {"use_raptor": False}},
+        "paper": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
+        "book": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
+        "laws": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
+        "presentation": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
         "one": None,
-        "knowledge_graph": {"chunk_token_num": 8192, "delimiter": r"\n", "entity_types": ["organization", "person", "location", "event", "time"]},
+        "knowledge_graph": {
+            "chunk_token_num": 8192,
+            "delimiter": r"\n",
+            "entity_types": ["organization", "person", "location", "event", "time"],
+            "raptor": {"use_raptor": False},
+            "graphrag": {"use_graphrag": False},
+        },
         "email": None,
         "picture": None,
     }
-    parser_config = key_mapping[chunk_method]
-    return parser_config
+
+    default_config = key_mapping[chunk_method]
+
+    # If no parser_config provided, return default
+    if not parser_config:
+        return default_config
+
+    # If parser_config is provided, merge with defaults to ensure required fields exist
+    if default_config is None:
+        return parser_config
+
+    # Ensure raptor and graphrag fields have default values if not provided
+    merged_config = deep_merge(default_config, parser_config)
+
+    return merged_config
 
 
 def get_data_openai(
@@ -602,17 +621,14 @@ def get_mcp_tools(mcp_servers: list, timeout: float | int = 10) -> tuple[dict, s
 
 TimeoutException = Union[Type[BaseException], BaseException]
 OnTimeoutCallback = Union[Callable[..., Any], Coroutine[Any, Any, Any]]
-def timeout(
-    seconds: float |int = None,
-    attempts: int = 2,
-    *,
-    exception: Optional[TimeoutException] = None,
-    on_timeout: Optional[OnTimeoutCallback] = None
-):
+
+
+def timeout(seconds: float | int = None, attempts: int = 2, *, exception: Optional[TimeoutException] = None, on_timeout: Optional[OnTimeoutCallback] = None):
     def decorator(func):
         @wraps(func)
         def wrapper(*args, **kwargs):
             result_queue = queue.Queue(maxsize=1)
+
             def target():
                 try:
                     result = func(*args, **kwargs)
@@ -644,7 +660,7 @@ async def async_wrapper(*args, **kwargs) -> Any:
                     with trio.fail_after(seconds):
                         return await func(*args, **kwargs)
                 except trio.TooSlowError:
-                    if a < attempts -1:
+                    if a < attempts - 1:
                         continue
                     if on_timeout is not None:
                         if callable(on_timeout):
@@ -668,11 +684,11 @@ async def async_wrapper(*args, **kwargs) -> Any:
         if asyncio.iscoroutinefunction(func):
             return async_wrapper
         return wrapper
+
     return decorator
 
 
 async def is_strong_enough(chat_model, embedding_model):
-
     @timeout(30, 2)
     async def _is_strong_enough():
         nonlocal chat_model, embedding_model
@@ -681,11 +697,11 @@ async def _is_strong_enough():
                 _ = await trio.to_thread.run_sync(lambda: embedding_model.encode(["Are you strong enough!?"]))
         if chat_model:
             with trio.fail_after(30):
-                res =  await trio.to_thread.run_sync(lambda: chat_model.chat("Nothing special.", [{"role":"user", "content": "Are you strong enough!?"}], {}))
+                res = await trio.to_thread.run_sync(lambda: chat_model.chat("Nothing special.", [{"role": "user", "content": "Are you strong enough!?"}], {}))
             if res.find("**ERROR**") >= 0:
                 raise Exception(res)
 
     # Pressure test for GraphRAG task
     async with trio.open_nursery() as nursery:
         for _ in range(32):
-            nursery.start_soon(_is_strong_enough)
+            nursery.start_soon(_is_strong_enough)
diff --git a/api/utils/validation_utils.py b/api/utils/validation_utils.py
@@ -365,10 +365,10 @@ class ParserConfig(Base):
     auto_questions: int = Field(default=0, ge=0, le=10)
     chunk_token_num: int = Field(default=512, ge=1, le=2048)
     delimiter: str = Field(default=r"\n", min_length=1)
-    graphrag: GraphragConfig | None = None
+    graphrag: GraphragConfig = Field(default_factory=lambda: GraphragConfig(use_graphrag=False))
     html4excel: bool = False
     layout_recognize: str = "DeepDOC"
-    raptor: RaptorConfig | None = None
+    raptor: RaptorConfig = Field(default_factory=lambda: RaptorConfig(use_raptor=False))
     tag_kb_ids: list[str] = Field(default_factory=list)
     topn_tags: int = Field(default=1, ge=1, le=10)
     filename_embd_weight: float | None = Field(default=0.1, ge=0.0, le=1.0)
diff --git a/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py b/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py
@@ -644,6 +644,7 @@ def test_parser_config_empty(self, HttpApiAuth):
             "html4excel": False,
             "layout_recognize": "DeepDOC",
             "raptor": {"use_raptor": False},
+            "graphrag": {"use_graphrag": False},
         }, res
 
     @pytest.mark.p2
@@ -657,6 +658,7 @@ def test_parser_config_unset(self, HttpApiAuth):
             "html4excel": False,
             "layout_recognize": "DeepDOC",
             "raptor": {"use_raptor": False},
+            "graphrag": {"use_graphrag": False},
         }, res
 
     @pytest.mark.p3
@@ -670,6 +672,7 @@ def test_parser_config_none(self, HttpApiAuth):
             "html4excel": False,
             "layout_recognize": "DeepDOC",
             "raptor": {"use_raptor": False},
+            "graphrag": {"use_graphrag": False},
         }, res
 
     @pytest.mark.p2
@@ -695,3 +698,64 @@ def test_unsupported_field(self, HttpApiAuth, payload):
         res = create_dataset(HttpApiAuth, payload)
         assert res["code"] == 101, res
         assert "Extra inputs are not permitted" in res["message"], res
+
+
+@pytest.mark.usefixtures("clear_datasets")
+class TestParserConfigBugFix:
+    @pytest.mark.p1
+    def test_parser_config_missing_raptor_and_graphrag(self, HttpApiAuth):
+        payload = {"name": "test_parser_config_missing_fields", "parser_config": {"chunk_token_num": 1024}}
+        res = create_dataset(HttpApiAuth, payload)
+        assert res["code"] == 0, res
+
+        parser_config = res["data"]["parser_config"]
+        assert "raptor" in parser_config, "raptor field should be present"
+        assert "graphrag" in parser_config, "graphrag field should be present"
+        assert parser_config["raptor"]["use_raptor"] is False, "raptor.use_raptor should default to False"
+        assert parser_config["graphrag"]["use_graphrag"] is False, "graphrag.use_graphrag should default to False"
+        assert parser_config["chunk_token_num"] == 1024, "User-provided chunk_token_num should be preserved"
+
+    @pytest.mark.p1
+    def test_parser_config_with_only_raptor(self, HttpApiAuth):
+        payload = {"name": "test_parser_config_only_raptor", "parser_config": {"chunk_token_num": 1024, "raptor": {"use_raptor": True}}}
+        res = create_dataset(HttpApiAuth, payload)
+        assert res["code"] == 0, res
+
+        parser_config = res["data"]["parser_config"]
+        assert parser_config["raptor"]["use_raptor"] is True, "User-provided raptor.use_raptor should be preserved"
+        assert "graphrag" in parser_config, "graphrag field should be present"
+        assert parser_config["graphrag"]["use_graphrag"] is False, "graphrag.use_graphrag should default to False"
+
+    @pytest.mark.p1
+    def test_parser_config_with_only_graphrag(self, HttpApiAuth):
+        payload = {"name": "test_parser_config_only_graphrag", "parser_config": {"chunk_token_num": 1024, "graphrag": {"use_graphrag": True}}}
+        res = create_dataset(HttpApiAuth, payload)
+        assert res["code"] == 0, res
+
+        parser_config = res["data"]["parser_config"]
+        assert "raptor" in parser_config, "raptor field should be present"
+        assert parser_config["raptor"]["use_raptor"] is False, "raptor.use_raptor should default to False"
+        assert parser_config["graphrag"]["use_graphrag"] is True, "User-provided graphrag.use_graphrag should be preserved"
+
+    @pytest.mark.p1
+    def test_parser_config_with_both_fields(self, HttpApiAuth):
+        payload = {"name": "test_parser_config_both_fields", "parser_config": {"chunk_token_num": 1024, "raptor": {"use_raptor": True}, "graphrag": {"use_graphrag": True}}}
+        res = create_dataset(HttpApiAuth, payload)
+        assert res["code"] == 0, res
+
+        parser_config = res["data"]["parser_config"]
+        assert parser_config["raptor"]["use_raptor"] is True, "User-provided raptor.use_raptor should be preserved"
+        assert parser_config["graphrag"]["use_graphrag"] is True, "User-provided graphrag.use_graphrag should be preserved"
+
+    @pytest.mark.p2
+    @pytest.mark.parametrize("chunk_method", ["qa", "manual", "paper", "book", "laws", "presentation"])
+    def test_parser_config_different_chunk_methods(self, HttpApiAuth, chunk_method):
+        payload = {"name": f"test_parser_config_{chunk_method}", "chunk_method": chunk_method, "parser_config": {"chunk_token_num": 512}}
+        res = create_dataset(HttpApiAuth, payload)
+        assert res["code"] == 0, res
+
+        parser_config = res["data"]["parser_config"]
+        assert "raptor" in parser_config, f"raptor field should be present for {chunk_method}"
+        assert "graphrag" in parser_config, f"graphrag field should be present for {chunk_method}"
+        assert parser_config["raptor"]["use_raptor"] is False, f"raptor.use_raptor should default to False for {chunk_method}"
+        assert parser_config["graphrag"]["use_graphrag"] is False, f"graphrag.use_graphrag should default to False for {chunk_method}"
diff --git a/test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py b/test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py
@@ -755,6 +755,7 @@ def test_parser_config_empty(self, HttpApiAuth, add_dataset_func):
             "html4excel": False,
             "layout_recognize": "DeepDOC",
             "raptor": {"use_raptor": False},
+            "graphrag": {"use_graphrag": False},
         }, res
 
     @pytest.mark.p3
@@ -772,6 +773,7 @@ def test_parser_config_none(self, HttpApiAuth, add_dataset_func):
             "html4excel": False,
             "layout_recognize": "DeepDOC",
             "raptor": {"use_raptor": False},
+            "graphrag": {"use_graphrag": False},
         }, res
 
     @pytest.mark.p3
@@ -783,7 +785,7 @@ def test_parser_config_empty_with_chunk_method_change(self, HttpApiAuth, add_dat
 
         res = list_datasets(HttpApiAuth)
         assert res["code"] == 0, res
-        assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res
+        assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}}, res
 
     @pytest.mark.p3
     def test_parser_config_unset_with_chunk_method_change(self, HttpApiAuth, add_dataset_func):
@@ -794,7 +796,7 @@ def test_parser_config_unset_with_chunk_method_change(self, HttpApiAuth, add_dat
 
         res = list_datasets(HttpApiAuth)
         assert res["code"] == 0, res
-        assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res
+        assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}}, res
 
     @pytest.mark.p3
     def test_parser_config_none_with_chunk_method_change(self, HttpApiAuth, add_dataset_func):
@@ -805,7 +807,7 @@ def test_parser_config_none_with_chunk_method_change(self, HttpApiAuth, add_data
 
         res = list_datasets(HttpApiAuth, {"id": dataset_id})
         assert res["code"] == 0, res
-        assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res
+        assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}}, res
 
     @pytest.mark.p2
     @pytest.mark.parametrize(
diff --git a/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py b/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py
@@ -540,6 +540,7 @@ def test_parser_config(
                     "html4excel": False,
                     "layout_recognize": "DeepDOC",
                     "raptor": {"use_raptor": False},
+                    "graphrag": {"use_graphrag": False},
                 }
             else:
                 for k, v in parser_config.items():
diff --git a/test/testcases/test_sdk_api/test_dataset_mangement/test_create_dataset.py b/test/testcases/test_sdk_api/test_dataset_mangement/test_create_dataset.py
@@ -593,6 +593,7 @@ def test_parser_config_empty(self, client):
                 "html4excel": False,
                 "layout_recognize": "DeepDOC",
                 "raptor": {"use_raptor": False},
+                "graphrag": {"use_graphrag": False},
             },
         )
         parser_config_o = DataSet.ParserConfig(client, {})
@@ -610,6 +611,7 @@ def test_parser_config_unset(self, client):
                 "html4excel": False,
                 "layout_recognize": "DeepDOC",
                 "raptor": {"use_raptor": False},
+                "graphrag": {"use_graphrag": False},
             },
         )
         payload = {"name": "parser_config_unset"}
@@ -626,6 +628,7 @@ def test_parser_config_none(self, client):
                 "html4excel": False,
                 "layout_recognize": "DeepDOC",
                 "raptor": {"use_raptor": False},
+                "graphrag": {"use_graphrag": False},
             },
         )
         payload = {"name": "parser_config_empty", "parser_config": None}
@@ -655,3 +658,64 @@ def test_unsupported_field(self, client, payload):
         with pytest.raises(Exception) as excinfo:
             client.create_dataset(**payload)
         assert "got an unexpected keyword argument" in str(excinfo.value), str(excinfo.value)
+
+
+@pytest.mark.usefixtures("clear_datasets")
+class TestParserConfigBugFix:
+    @pytest.mark.p1
+    def test_parser_config_missing_raptor_and_graphrag(self, client):
+        parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 1024})
+        payload = {"name": "test_parser_config_missing_fields_sdk", "parser_config": parser_config}
+        dataset = client.create_dataset(**payload)
+
+        config = dataset.parser_config
+        assert hasattr(config, "raptor"), "raptor field should be present"
+        assert hasattr(config, "graphrag"), "graphrag field should be present"
+        assert config.raptor.use_raptor is False, "raptor.use_raptor should default to False"
+        assert config.graphrag.use_graphrag is False, "graphrag.use_graphrag should default to False"
+        assert config.chunk_token_num == 1024, "User-provided chunk_token_num should be preserved"
+
+    @pytest.mark.p1
+    def test_parser_config_with_only_raptor(self, client):
+        parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 1024, "raptor": {"use_raptor": True}})
+        payload = {"name": "test_parser_config_only_raptor_sdk", "parser_config": parser_config}
+        dataset = client.create_dataset(**payload)
+
+        config = dataset.parser_config
+        assert config.raptor.use_raptor is True, "User-provided raptor.use_raptor should be preserved"
+        assert hasattr(config, "graphrag"), "graphrag field should be present"
+        assert config.graphrag.use_graphrag is False, "graphrag.use_graphrag should default to False"
+
+    @pytest.mark.p1
+    def test_parser_config_with_only_graphrag(self, client):
+        parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 1024, "graphrag": {"use_graphrag": True}})
+        payload = {"name": "test_parser_config_only_graphrag_sdk", "parser_config": parser_config}
+        dataset = client.create_dataset(**payload)
+
+        config = dataset.parser_config
+        assert hasattr(config, "raptor"), "raptor field should be present"
+        assert config.raptor.use_raptor is False, "raptor.use_raptor should default to False"
+        assert config.graphrag.use_graphrag is True, "User-provided graphrag.use_graphrag should be preserved"
+
+    @pytest.mark.p1
+    def test_parser_config_with_both_fields(self, client):
+        parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 1024, "raptor": {"use_raptor": True}, "graphrag": {"use_graphrag": True}})
+        payload = {"name": "test_parser_config_both_fields_sdk", "parser_config": parser_config}
+        dataset = client.create_dataset(**payload)
+
+        config = dataset.parser_config
+        assert config.raptor.use_raptor is True, "User-provided raptor.use_raptor should be preserved"
+        assert config.graphrag.use_graphrag is True, "User-provided graphrag.use_graphrag should be preserved"
+
+    @pytest.mark.p2
+    @pytest.mark.parametrize("chunk_method", ["qa", "manual", "paper", "book", "laws", "presentation"])
+    def test_parser_config_different_chunk_methods(self, client, chunk_method):
+        parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 512})
+        payload = {"name": f"test_parser_config_{chunk_method}_sdk", "chunk_method": chunk_method, "parser_config": parser_config}
+        dataset = client.create_dataset(**payload)
+
+        config = dataset.parser_config
+        assert hasattr(config, "raptor"), f"raptor field should be present for {chunk_method}"
+        assert hasattr(config, "graphrag"), f"graphrag field should be present for {chunk_method}"
+        assert config.raptor.use_raptor is False, f"raptor.use_raptor should default to False for {chunk_method}"
+        assert config.graphrag.use_graphrag is False, f"graphrag.use_graphrag should default to False for {chunk_method}"
diff --git a/test/testcases/test_sdk_api/test_dataset_mangement/test_update_dataset.py b/test/testcases/test_sdk_api/test_dataset_mangement/test_update_dataset.py
@@ -641,6 +641,7 @@ def test_parser_config_empty(self, client, add_dataset_func):
                 "html4excel": False,
                 "layout_recognize": "DeepDOC",
                 "raptor": {"use_raptor": False},
+                "graphrag": {"use_graphrag": False},
             },
         )
         dataset.update({"parser_config": {}})
@@ -660,6 +661,7 @@ def test_parser_config_none(self, client, add_dataset_func):
                 "html4excel": False,
                 "layout_recognize": "DeepDOC",
                 "raptor": {"use_raptor": False},
+                "graphrag": {"use_graphrag": False},
             },
         )
         dataset.update({"parser_config": None})
@@ -675,6 +677,7 @@ def test_parser_config_empty_with_chunk_method_change(self, client, add_dataset_
             client,
             {
                 "raptor": {"use_raptor": False},
+                "graphrag": {"use_graphrag": False},
             },
         )
         dataset.update({"chunk_method": "qa", "parser_config": {}})
diff --git a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py
@@ -406,6 +406,7 @@ def test_parser_config(self, client, add_documents, chunk_method, parser_config,
                         "html4excel": False,
                         "layout_recognize": "DeepDOC",
                         "raptor": {"use_raptor": False},
+                        "graphrag": {"use_graphrag": False},
                     },
                 )
                 assert str(updated_doc.parser_config) == str(expected_config), str(updated_doc)

Original file line number	Diff line number	Diff line change
`@@ -540,6 +540,7 @@ def test_parser_config(`
`540`	`540`	`"html4excel": False,`
`541`	`541`	`"layout_recognize": "DeepDOC",`
`542`	`542`	`"raptor": {"use_raptor": False},`
	`543`	`+ "graphrag": {"use_graphrag": False},`
`543`	`544`	`}`
`544`	`545`	`else:`
`545`	`546`	`for k, v in parser_config.items():`
Original file line number	Diff line number	Diff line change
`@@ -641,6 +641,7 @@ def test_parser_config_empty(self, client, add_dataset_func):`
`641`	`641`	`"html4excel": False,`
`642`	`642`	`"layout_recognize": "DeepDOC",`
`643`	`643`	`"raptor": {"use_raptor": False},`
	`644`	`+ "graphrag": {"use_graphrag": False},`
`644`	`645`	`},`
`645`	`646`	`)`
`646`	`647`	`dataset.update({"parser_config": {}})`
`@@ -660,6 +661,7 @@ def test_parser_config_none(self, client, add_dataset_func):`
`660`	`661`	`"html4excel": False,`
`661`	`662`	`"layout_recognize": "DeepDOC",`
`662`	`663`	`"raptor": {"use_raptor": False},`
	`664`	`+ "graphrag": {"use_graphrag": False},`
`663`	`665`	`},`
`664`	`666`	`)`
`665`	`667`	`dataset.update({"parser_config": None})`
`@@ -675,6 +677,7 @@ def test_parser_config_empty_with_chunk_method_change(self, client, add_dataset_`
`675`	`677`	`client,`
`676`	`678`	`{`
`677`	`679`	`"raptor": {"use_raptor": False},`
	`680`	`+ "graphrag": {"use_graphrag": False},`
`678`	`681`	`},`
`679`	`682`	`)`
`680`	`683`	`dataset.update({"chunk_method": "qa", "parser_config": {}})`
Original file line number	Diff line number	Diff line change
`@@ -406,6 +406,7 @@ def test_parser_config(self, client, add_documents, chunk_method, parser_config,`
`406`	`406`	`"html4excel": False,`
`407`	`407`	`"layout_recognize": "DeepDOC",`
`408`	`408`	`"raptor": {"use_raptor": False},`
	`409`	`+ "graphrag": {"use_graphrag": False},`
`409`	`410`	`},`
`410`	`411`	`)`
`411`	`412`	`assert str(updated_doc.parser_config) == str(expected_config), str(updated_doc)`