fix run_dataset_info (#5068)

Jintao-Huang · web-flow · commit 54d8d778ff63 · 2025-07-23T00:30:52.000+08:00
diff --git a/docs/source/Instruction/支持的模型和数据集.md b/docs/source/Instruction/支持的模型和数据集.md
@@ -934,6 +934,8 @@
 |-|auto_math_text<br>khanacademy<br>openstax<br>stanford<br>stories<br>web_samples_v1<br>web_samples_v2<br>wikihow|huge dataset|-|multi-domain, en, qa|[HuggingFaceTB/cosmopedia](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia)|
 |[HumanLLMs/Human-Like-DPO-Dataset](https://modelscope.cn/datasets/HumanLLMs/Human-Like-DPO-Dataset)|default|10884|47.5±7.9, min=32, max=85|rlhf, dpo|[HumanLLMs/Human-Like-DPO-Dataset](https://huggingface.co/datasets/HumanLLMs/Human-Like-DPO-Dataset)|
 |[LLM-Research/xlam-function-calling-60k](https://modelscope.cn/datasets/LLM-Research/xlam-function-calling-60k)|default<br>grpo|120000|453.7±219.5, min=164, max=2779|agent, grpo, 🔥|[Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)|
+|[MTEB/scidocs-reranking](https://modelscope.cn/datasets/MTEB/scidocs-reranking)|default|39193|41.9±5.8, min=31, max=107|rerank, 🔥|[mteb/scidocs-reranking](https://huggingface.co/datasets/mteb/scidocs-reranking)|
+|[MTEB/stackoverflowdupquestions-reranking](https://modelscope.cn/datasets/MTEB/stackoverflowdupquestions-reranking)|default|26485|39.9±4.6, min=31, max=77|rerank, 🔥|[mteb/stackoverflowdupquestions-reranking](https://huggingface.co/datasets/mteb/stackoverflowdupquestions-reranking)|
 |[OmniData/Zhihu-KOL](https://modelscope.cn/datasets/OmniData/Zhihu-KOL)|default|huge dataset|-|zhihu, qa|[wangrui6/Zhihu-KOL](https://huggingface.co/datasets/wangrui6/Zhihu-KOL)|
 |[OmniData/Zhihu-KOL-More-Than-100-Upvotes](https://modelscope.cn/datasets/OmniData/Zhihu-KOL-More-Than-100-Upvotes)|default|271261|1003.4±1826.1, min=28, max=52541|zhihu, qa|[bzb2023/Zhihu-KOL-More-Than-100-Upvotes](https://huggingface.co/datasets/bzb2023/Zhihu-KOL-More-Than-100-Upvotes)|
 |[PowerInfer/LONGCOT-Refine-500K](https://modelscope.cn/datasets/PowerInfer/LONGCOT-Refine-500K)|default|521921|296.5±158.4, min=39, max=4634|chat, sft, 🔥, cot|[PowerInfer/LONGCOT-Refine-500K](https://huggingface.co/datasets/PowerInfer/LONGCOT-Refine-500K)|
@@ -981,13 +983,15 @@
 |[open-r1/verifiable-coding-problems-python-10k_decontaminated](https://modelscope.cn/datasets/open-r1/verifiable-coding-problems-python-10k_decontaminated)|default|1574|575.7±234.3, min=136, max=2022|grpo, code|[open-r1/verifiable-coding-problems-python-10k_decontaminated](https://huggingface.co/datasets/open-r1/verifiable-coding-problems-python-10k_decontaminated)|
 |[open-r1/verifiable-coding-problems-python_decontaminated](https://modelscope.cn/datasets/open-r1/verifiable-coding-problems-python_decontaminated)|default|27839|561.9±252.2, min=74, max=6191|grpo, code|[open-r1/verifiable-coding-problems-python_decontaminated](https://huggingface.co/datasets/open-r1/verifiable-coding-problems-python_decontaminated)|
 |[open-thoughts/OpenThoughts-114k](https://modelscope.cn/datasets/open-thoughts/OpenThoughts-114k)|default|113957|413.2±186.9, min=265, max=13868|chat, sft, cot, r1|[open-thoughts/OpenThoughts-114k](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k)|
-|[sentence-transformers/stsb](https://modelscope.cn/datasets/sentence-transformers/stsb)|default<br>generate<br>reg|5748|21.0±0.0, min=21, max=21|similarity, 🔥|[sentence-transformers/stsb](https://huggingface.co/datasets/sentence-transformers/stsb)|
+|[swift/self-cognition](https://modelscope.cn/datasets/swift/self-cognition)|default<br>qwen3<br>empty_think|108|58.9±20.3, min=32, max=131|chat, self-cognition, 🔥|[modelscope/self-cognition](https://huggingface.co/datasets/modelscope/self-cognition)|
+|[sentence-transformers/stsb](https://modelscope.cn/datasets/sentence-transformers/stsb)|default<br>positive<br>generate<br>reg|5748|21.0±0.0, min=21, max=21|similarity, 🔥|[sentence-transformers/stsb](https://huggingface.co/datasets/sentence-transformers/stsb)|
 |[shenweizhou/alpha-umi-toolbench-processed-v2](https://modelscope.cn/datasets/shenweizhou/alpha-umi-toolbench-processed-v2)|backbone<br>caller<br>planner<br>summarizer|huge dataset|-|chat, agent, 🔥|-|
 |[simpleai/HC3](https://modelscope.cn/datasets/simpleai/HC3)|finance<br>finance_cls<br>medicine<br>medicine_cls|11021|296.0±153.3, min=65, max=2267|text-generation, classification, 🔥|[Hello-SimpleAI/HC3](https://huggingface.co/datasets/Hello-SimpleAI/HC3)|
 |[simpleai/HC3-Chinese](https://modelscope.cn/datasets/simpleai/HC3-Chinese)|baike<br>baike_cls<br>open_qa<br>open_qa_cls<br>nlpcc_dbqa<br>nlpcc_dbqa_cls<br>finance<br>finance_cls<br>medicine<br>medicine_cls<br>law<br>law_cls<br>psychology<br>psychology_cls|39781|179.9±70.2, min=90, max=1070|text-generation, classification, 🔥|[Hello-SimpleAI/HC3-Chinese](https://huggingface.co/datasets/Hello-SimpleAI/HC3-Chinese)|
 |[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets)|train<br>validation<br>test|141600|40.8±3.3, min=33, max=53|chat, multi-modal, audio|-|
 |[swift/A-OKVQA](https://modelscope.cn/datasets/swift/A-OKVQA)|default|18201|43.5±7.9, min=27, max=94|multi-modal, en, vqa, quality|[HuggingFaceM4/A-OKVQA](https://huggingface.co/datasets/HuggingFaceM4/A-OKVQA)|
 |[swift/ChartQA](https://modelscope.cn/datasets/swift/ChartQA)|default|28299|36.8±6.5, min=26, max=74|en, vqa, quality|[HuggingFaceM4/ChartQA](https://huggingface.co/datasets/HuggingFaceM4/ChartQA)|
+|[swift/Chinese-Qwen3-235B-2507-Distill-data-110k-SFT](https://modelscope.cn/datasets/swift/Chinese-Qwen3-235B-2507-Distill-data-110k-SFT)|default|110000|72.1±60.9, min=29, max=2315|🔥, distill, sft|-|
 |[swift/GRIT](https://modelscope.cn/datasets/swift/GRIT)|caption<br>grounding<br>vqa|huge dataset|-|multi-modal, en, caption-grounding, vqa, quality|[zzliang/GRIT](https://huggingface.co/datasets/zzliang/GRIT)|
 |[swift/GenQA](https://modelscope.cn/datasets/swift/GenQA)|default|huge dataset|-|qa, quality, multi-task|[tomg-group-umd/GenQA](https://huggingface.co/datasets/tomg-group-umd/GenQA)|
 |[swift/Infinity-Instruct](https://modelscope.cn/datasets/swift/Infinity-Instruct)|3M<br>7M<br>0625<br>Gen<br>7M_domains|huge dataset|-|qa, quality, multi-task|[BAAI/Infinity-Instruct](https://huggingface.co/datasets/BAAI/Infinity-Instruct)|
@@ -1030,7 +1034,6 @@
 |[swift/pixelprose](https://modelscope.cn/datasets/swift/pixelprose)|default|huge dataset|-|caption, multi-modal, vision|[tomg-group-umd/pixelprose](https://huggingface.co/datasets/tomg-group-umd/pixelprose)|
 |[swift/refcoco](https://modelscope.cn/datasets/swift/refcoco)|caption<br>grounding|92430|45.4±3.0, min=37, max=63|multi-modal, en, grounding|[jxu124/refcoco](https://huggingface.co/datasets/jxu124/refcoco)|
 |[swift/refcocog](https://modelscope.cn/datasets/swift/refcocog)|caption<br>grounding|89598|50.3±4.6, min=39, max=91|multi-modal, en, grounding|[jxu124/refcocog](https://huggingface.co/datasets/jxu124/refcocog)|
-|[swift/self-cognition](https://modelscope.cn/datasets/swift/self-cognition)|default<br>qwen3<br>empty_think|108|58.9±20.3, min=32, max=131|chat, self-cognition, 🔥|[modelscope/self-cognition](https://huggingface.co/datasets/modelscope/self-cognition)|
 |[swift/sharegpt](https://modelscope.cn/datasets/swift/sharegpt)|common-zh<br>unknow-zh<br>common-en|194063|820.5±366.1, min=25, max=2221|chat, general, multi-round|-|
 |[swift/swift-sft-mixture](https://modelscope.cn/datasets/swift/swift-sft-mixture)|sharegpt<br>firefly<br>codefuse<br>metamathqa|huge dataset|-|chat, sft, general, 🔥|-|
 |[swift/tagengo-gpt4](https://modelscope.cn/datasets/swift/tagengo-gpt4)|default|76437|468.1±276.8, min=28, max=1726|chat, multi-lingual, quality|[lightblue/tagengo-gpt4](https://huggingface.co/datasets/lightblue/tagengo-gpt4)|
diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md
@@ -935,6 +935,8 @@ The table below introduces information about the datasets integrated with ms-swi
 |-|auto_math_text<br>khanacademy<br>openstax<br>stanford<br>stories<br>web_samples_v1<br>web_samples_v2<br>wikihow|huge dataset|-|multi-domain, en, qa|[HuggingFaceTB/cosmopedia](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia)|
 |[HumanLLMs/Human-Like-DPO-Dataset](https://modelscope.cn/datasets/HumanLLMs/Human-Like-DPO-Dataset)|default|10884|47.5±7.9, min=32, max=85|rlhf, dpo|[HumanLLMs/Human-Like-DPO-Dataset](https://huggingface.co/datasets/HumanLLMs/Human-Like-DPO-Dataset)|
 |[LLM-Research/xlam-function-calling-60k](https://modelscope.cn/datasets/LLM-Research/xlam-function-calling-60k)|default<br>grpo|120000|453.7±219.5, min=164, max=2779|agent, grpo, 🔥|[Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)|
+|[MTEB/scidocs-reranking](https://modelscope.cn/datasets/MTEB/scidocs-reranking)|default|39193|41.9±5.8, min=31, max=107|rerank, 🔥|[mteb/scidocs-reranking](https://huggingface.co/datasets/mteb/scidocs-reranking)|
+|[MTEB/stackoverflowdupquestions-reranking](https://modelscope.cn/datasets/MTEB/stackoverflowdupquestions-reranking)|default|26485|39.9±4.6, min=31, max=77|rerank, 🔥|[mteb/stackoverflowdupquestions-reranking](https://huggingface.co/datasets/mteb/stackoverflowdupquestions-reranking)|
 |[OmniData/Zhihu-KOL](https://modelscope.cn/datasets/OmniData/Zhihu-KOL)|default|huge dataset|-|zhihu, qa|[wangrui6/Zhihu-KOL](https://huggingface.co/datasets/wangrui6/Zhihu-KOL)|
 |[OmniData/Zhihu-KOL-More-Than-100-Upvotes](https://modelscope.cn/datasets/OmniData/Zhihu-KOL-More-Than-100-Upvotes)|default|271261|1003.4±1826.1, min=28, max=52541|zhihu, qa|[bzb2023/Zhihu-KOL-More-Than-100-Upvotes](https://huggingface.co/datasets/bzb2023/Zhihu-KOL-More-Than-100-Upvotes)|
 |[PowerInfer/LONGCOT-Refine-500K](https://modelscope.cn/datasets/PowerInfer/LONGCOT-Refine-500K)|default|521921|296.5±158.4, min=39, max=4634|chat, sft, 🔥, cot|[PowerInfer/LONGCOT-Refine-500K](https://huggingface.co/datasets/PowerInfer/LONGCOT-Refine-500K)|
@@ -982,13 +984,15 @@ The table below introduces information about the datasets integrated with ms-swi
 |[open-r1/verifiable-coding-problems-python-10k_decontaminated](https://modelscope.cn/datasets/open-r1/verifiable-coding-problems-python-10k_decontaminated)|default|1574|575.7±234.3, min=136, max=2022|grpo, code|[open-r1/verifiable-coding-problems-python-10k_decontaminated](https://huggingface.co/datasets/open-r1/verifiable-coding-problems-python-10k_decontaminated)|
 |[open-r1/verifiable-coding-problems-python_decontaminated](https://modelscope.cn/datasets/open-r1/verifiable-coding-problems-python_decontaminated)|default|27839|561.9±252.2, min=74, max=6191|grpo, code|[open-r1/verifiable-coding-problems-python_decontaminated](https://huggingface.co/datasets/open-r1/verifiable-coding-problems-python_decontaminated)|
 |[open-thoughts/OpenThoughts-114k](https://modelscope.cn/datasets/open-thoughts/OpenThoughts-114k)|default|113957|413.2±186.9, min=265, max=13868|chat, sft, cot, r1|[open-thoughts/OpenThoughts-114k](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k)|
-|[sentence-transformers/stsb](https://modelscope.cn/datasets/sentence-transformers/stsb)|default<br>generate<br>reg|5748|21.0±0.0, min=21, max=21|similarity, 🔥|[sentence-transformers/stsb](https://huggingface.co/datasets/sentence-transformers/stsb)|
+|[swift/self-cognition](https://modelscope.cn/datasets/swift/self-cognition)|default<br>qwen3<br>empty_think|108|58.9±20.3, min=32, max=131|chat, self-cognition, 🔥|[modelscope/self-cognition](https://huggingface.co/datasets/modelscope/self-cognition)|
+|[sentence-transformers/stsb](https://modelscope.cn/datasets/sentence-transformers/stsb)|default<br>positive<br>generate<br>reg|5748|21.0±0.0, min=21, max=21|similarity, 🔥|[sentence-transformers/stsb](https://huggingface.co/datasets/sentence-transformers/stsb)|
 |[shenweizhou/alpha-umi-toolbench-processed-v2](https://modelscope.cn/datasets/shenweizhou/alpha-umi-toolbench-processed-v2)|backbone<br>caller<br>planner<br>summarizer|huge dataset|-|chat, agent, 🔥|-|
 |[simpleai/HC3](https://modelscope.cn/datasets/simpleai/HC3)|finance<br>finance_cls<br>medicine<br>medicine_cls|11021|296.0±153.3, min=65, max=2267|text-generation, classification, 🔥|[Hello-SimpleAI/HC3](https://huggingface.co/datasets/Hello-SimpleAI/HC3)|
 |[simpleai/HC3-Chinese](https://modelscope.cn/datasets/simpleai/HC3-Chinese)|baike<br>baike_cls<br>open_qa<br>open_qa_cls<br>nlpcc_dbqa<br>nlpcc_dbqa_cls<br>finance<br>finance_cls<br>medicine<br>medicine_cls<br>law<br>law_cls<br>psychology<br>psychology_cls|39781|179.9±70.2, min=90, max=1070|text-generation, classification, 🔥|[Hello-SimpleAI/HC3-Chinese](https://huggingface.co/datasets/Hello-SimpleAI/HC3-Chinese)|
 |[speech_asr/speech_asr_aishell1_trainsets](https://modelscope.cn/datasets/speech_asr/speech_asr_aishell1_trainsets)|train<br>validation<br>test|141600|40.8±3.3, min=33, max=53|chat, multi-modal, audio|-|
 |[swift/A-OKVQA](https://modelscope.cn/datasets/swift/A-OKVQA)|default|18201|43.5±7.9, min=27, max=94|multi-modal, en, vqa, quality|[HuggingFaceM4/A-OKVQA](https://huggingface.co/datasets/HuggingFaceM4/A-OKVQA)|
 |[swift/ChartQA](https://modelscope.cn/datasets/swift/ChartQA)|default|28299|36.8±6.5, min=26, max=74|en, vqa, quality|[HuggingFaceM4/ChartQA](https://huggingface.co/datasets/HuggingFaceM4/ChartQA)|
+|[swift/Chinese-Qwen3-235B-2507-Distill-data-110k-SFT](https://modelscope.cn/datasets/swift/Chinese-Qwen3-235B-2507-Distill-data-110k-SFT)|default|110000|72.1±60.9, min=29, max=2315|🔥, distill, sft|-|
 |[swift/GRIT](https://modelscope.cn/datasets/swift/GRIT)|caption<br>grounding<br>vqa|huge dataset|-|multi-modal, en, caption-grounding, vqa, quality|[zzliang/GRIT](https://huggingface.co/datasets/zzliang/GRIT)|
 |[swift/GenQA](https://modelscope.cn/datasets/swift/GenQA)|default|huge dataset|-|qa, quality, multi-task|[tomg-group-umd/GenQA](https://huggingface.co/datasets/tomg-group-umd/GenQA)|
 |[swift/Infinity-Instruct](https://modelscope.cn/datasets/swift/Infinity-Instruct)|3M<br>7M<br>0625<br>Gen<br>7M_domains|huge dataset|-|qa, quality, multi-task|[BAAI/Infinity-Instruct](https://huggingface.co/datasets/BAAI/Infinity-Instruct)|
@@ -1031,7 +1035,6 @@ The table below introduces information about the datasets integrated with ms-swi
 |[swift/pixelprose](https://modelscope.cn/datasets/swift/pixelprose)|default|huge dataset|-|caption, multi-modal, vision|[tomg-group-umd/pixelprose](https://huggingface.co/datasets/tomg-group-umd/pixelprose)|
 |[swift/refcoco](https://modelscope.cn/datasets/swift/refcoco)|caption<br>grounding|92430|45.4±3.0, min=37, max=63|multi-modal, en, grounding|[jxu124/refcoco](https://huggingface.co/datasets/jxu124/refcoco)|
 |[swift/refcocog](https://modelscope.cn/datasets/swift/refcocog)|caption<br>grounding|89598|50.3±4.6, min=39, max=91|multi-modal, en, grounding|[jxu124/refcocog](https://huggingface.co/datasets/jxu124/refcocog)|
-|[swift/self-cognition](https://modelscope.cn/datasets/swift/self-cognition)|default<br>qwen3<br>empty_think|108|58.9±20.3, min=32, max=131|chat, self-cognition, 🔥|[modelscope/self-cognition](https://huggingface.co/datasets/modelscope/self-cognition)|
 |[swift/sharegpt](https://modelscope.cn/datasets/swift/sharegpt)|common-zh<br>unknow-zh<br>common-en|194063|820.5±366.1, min=25, max=2221|chat, general, multi-round|-|
 |[swift/swift-sft-mixture](https://modelscope.cn/datasets/swift/swift-sft-mixture)|sharegpt<br>firefly<br>codefuse<br>metamathqa|huge dataset|-|chat, sft, general, 🔥|-|
 |[swift/tagengo-gpt4](https://modelscope.cn/datasets/swift/tagengo-gpt4)|default|76437|468.1±276.8, min=28, max=1726|chat, multi-lingual, quality|[lightblue/tagengo-gpt4](https://huggingface.co/datasets/lightblue/tagengo-gpt4)|
diff --git a/scripts/utils/run_dataset_info.py b/scripts/utils/run_dataset_info.py
@@ -37,8 +37,9 @@ def get_dataset_id(key):
 
 
 def run_dataset(key, template, cache_mapping):
-    ms_id, hf_id, _ = key
     dataset_meta = DATASET_MAPPING[key]
+    ms_id = dataset_meta.ms_dataset_id
+    hf_id = dataset_meta.hf_dataset_id
     tags = ', '.join(tag for tag in dataset_meta.tags) or '-'
     dataset_id = ms_id or hf_id
     use_hf = ms_id is None
@@ -63,7 +64,8 @@ def run_dataset(key, template, cache_mapping):
         dataset_size = len(dataset)
         random_state = np.random.RandomState(42)
         idx_list = random_state.choice(dataset_size, size=min(dataset_size, 100000), replace=False)
-        encoded_dataset = EncodePreprocessor(template)(dataset.select(idx_list), num_proc=num_proc)
+        encoded_dataset = EncodePreprocessor(template)(
+            dataset.select(idx_list), num_proc=num_proc, load_from_cache_file=False)
 
         input_ids = encoded_dataset['input_ids']
         token_len = [len(tokens) for tokens in input_ids]