This repository was archived by the owner on Sep 23, 2025. It is now read-only.

Commit 3e6ccac

Merge branch 'main' into chat_template
Signed-off-by: minmingzhu <45281494+minmingzhu@users.noreply.github.com>
2 parents 3cb18dd + 9182907

File tree: 5 files changed (+9 lines, -2 lines)

Lines changed: 1 addition & 1 deletion

```diff
@@ -1 +1 @@
-finetune: gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b-chat, huggyllama/llama-7b
+finetune: gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, huggyllama/llama-7b
```

docs/finetune_parameters.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -7,6 +7,7 @@ The following are the parameters supported in the finetuning workflow.
 |Configuration Name| Default|Meaning|
 |-|-|-|
 |base_model| EleutherAI/gpt-j-6b|Path to pretrained model or model identifier from huggingface.co/models|
+|tokenizer_name|None|Path to pretrained tokenizer from huggingface.co/models. If not provided, the tokenizer will be loaded from the `base_model`.|
 |gpt_base_model|True|This parameter is for [Transformers#22482](https://github.com/huggingface/transformers/issues/22482). It needs to be set to True when the pretrained model is realted to gpt, otherwise it is False.|
 |output_dir|/tmp/llm-ray/output|The output directory to store the finetuned model|
 |checkpoint_dir|/tmp/llm-ray/checkpoint|The directory to store checkpoint|
```
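
These parameters live under the `General` section of the finetune config and are read as a nested dict inside `train_func` (see the `config["General"][...]` accesses in the next diff). A minimal sketch of such a fragment, built from the documented defaults plus the new `tokenizer_name`; the dict itself is illustrative, not copied from the repository:

```python
# Illustrative "General" config fragment; values other than the new
# tokenizer_name field mirror the defaults in the table above.
general_config = {
    "base_model": "EleutherAI/gpt-j-6b",
    "tokenizer_name": None,  # new: falls back to base_model when left unset
    "gpt_base_model": True,
    "output_dir": "/tmp/llm-ray/output",
    "checkpoint_dir": "/tmp/llm-ray/checkpoint",
}
```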

llm_on_ray/finetune/finetune.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -155,6 +155,10 @@ def train_func(config: Dict[str, Any]):
 
     gradient_accumulation_steps = config["Training"].get("gradient_accumulation_steps", 1)
     base_model = config["General"]["base_model"]
+    if config["General"].get("tokenizer_name") is not None:
+        tokenizer_name = config["General"].get("tokenizer_name")
+    else:
+        tokenizer_name = base_model
     dataset_file = config["Dataset"]["train_file"]
 
     seed = config["Training"].get("seed")
@@ -171,7 +175,7 @@ def train_func(config: Dict[str, Any]):
 
     tokenizer = common.tokenizer.Tokenizer.registory.get("HuggingFaceTokenizer")()(
         config={
-            "name": base_model,
+            "name": tokenizer_name,
             "config": config["General"]["config"],
         }
     )
```
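
The added lines implement a simple fallback: use `General.tokenizer_name` when it is set, otherwise reuse `base_model` as the tokenizer identifier. A standalone sketch of the same resolution logic; the helper name `resolve_tokenizer_name` is illustrative and does not exist in the repository:

```python
from typing import Any, Dict


def resolve_tokenizer_name(general_cfg: Dict[str, Any]) -> str:
    """Return the tokenizer identifier to load: an explicit tokenizer_name
    if present, otherwise the base model identifier (mirrors the diff above)."""
    tokenizer_name = general_cfg.get("tokenizer_name")
    if tokenizer_name is not None:
        return tokenizer_name
    return general_cfg["base_model"]


# With tokenizer_name set, the explicit value wins.
print(resolve_tokenizer_name(
    {"base_model": "mosaicml/mpt-7b", "tokenizer_name": "EleutherAI/gpt-neox-20b"}
))  # -> EleutherAI/gpt-neox-20b

# Without tokenizer_name, the base model is used, preserving old behavior.
print(resolve_tokenizer_name({"base_model": "gpt2"}))  # -> gpt2
```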

llm_on_ray/finetune/finetune_config.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -51,6 +51,7 @@ class DeltatunerConfig(BaseModel):
 
 class General(BaseModel):
     base_model: str
+    tokenizer_name: Optional[str] = None
     gpt_base_model: bool
     output_dir: str
     checkpoint_dir: Optional[str]
```
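
Declaring the field as `Optional[str] = None` keeps existing configs valid: when `tokenizer_name` is omitted, validation still succeeds and the value comes back as `None`, which `train_func` then maps to `base_model`. A minimal pydantic sketch, trimmed to a few of the fields shown in the diff rather than the full `General` model:

```python
from typing import Optional

from pydantic import BaseModel


class General(BaseModel):
    base_model: str
    tokenizer_name: Optional[str] = None  # new optional field from this commit
    gpt_base_model: bool
    output_dir: str


# Old-style config without tokenizer_name still validates; the field defaults to None.
cfg = General(base_model="gpt2", gpt_base_model=True, output_dir="/tmp/llm-ray/output")
print(cfg.tokenizer_name)  # None

# New-style config can point the tokenizer at a different Hugging Face repo.
cfg = General(
    base_model="mosaicml/mpt-7b",
    tokenizer_name="EleutherAI/gpt-neox-20b",
    gpt_base_model=False,
    output_dir="/tmp/llm-ray/output",
)
print(cfg.tokenizer_name)  # EleutherAI/gpt-neox-20b
```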

llm_on_ray/finetune/models/mpt-7b.yaml

Lines changed: 1 addition & 0 deletions

```diff
@@ -1,5 +1,6 @@
 General:
   base_model: mosaicml/mpt-7b
+  tokenizer_name: EleutherAI/gpt-neox-20b
   gpt_base_model: false
   output_dir: /tmp/llm-ray/output
   checkpoint_dir: /tmp/llm-ray/checkpoint
```
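
MPT-7B is documented by MosaicML as using the GPT-NeoX-20B tokenizer, so the YAML now names `EleutherAI/gpt-neox-20b` explicitly instead of letting the tokenizer default to the `mosaicml/mpt-7b` base model. A quick sketch of what that entry resolves to, assuming the `transformers` package is installed:

```python
# Load the tokenizer the config now points at; this is the same identifier
# written into mpt-7b.yaml above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
print(tokenizer("Hello, MPT!").input_ids)
```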
