Skip to content

Commit e6c0041

Browse files
Pushing change for MetaCAT training
Improved by addressing: - saving model by overwriting the existing one as default - mentioning the function that auto adjusts for variations in task and class names - Added note that training from scratch is not recommended
1 parent 3b01bfa commit e6c0041

File tree

2 files changed

+64
-42
lines changed

2 files changed

+64
-42
lines changed

medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb

Lines changed: 30 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,17 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 1,
5+
"execution_count": null,
66
"id": "d58c720d",
77
"metadata": {},
88
"outputs": [],
99
"source": [
1010
"import json\n",
1111
"import os\n",
1212
"from datetime import date\n",
13-
"from medcat.cat import CAT\n",
1413
"from medcat.meta_cat import MetaCAT\n",
1514
"from medcat.config_meta_cat import ConfigMetaCAT\n",
16-
"from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT\n",
17-
"from tokenizers import ByteLevelBPETokenizer"
15+
"from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBERT"
1816
]
1917
},
2018
{
@@ -78,35 +76,39 @@
7876
]
7977
},
8078
{
81-
"cell_type": "markdown",
82-
"id": "35aa5605",
79+
"cell_type": "code",
80+
"execution_count": null,
81+
"id": "2933f7e1",
8382
"metadata": {},
83+
"outputs": [],
8484
"source": [
85-
"Before you run the next section please double check that the model meta_annotation names matches to those specified in the mct export.\n",
86-
"\n"
85+
"for meta_model in meta_model_names:\n",
86+
" config_file = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config.json\")\n",
87+
" with open(config_file, 'r') as jfile:\n",
88+
" config_dict = json.load(jfile)\n",
89+
" print(f\"Model used for meta_{meta_model}:\",config_dict['model']['model_name'])"
8790
]
8891
},
8992
{
9093
"cell_type": "markdown",
91-
"id": "8bf6f5c3",
94+
"id": "3047b1d9",
9295
"metadata": {},
9396
"source": [
94-
"Depending on the model pack you have, please run the LSTM model or BERT model section. <br>\n",
95-
"If you are unsure, use this section to check the model type."
97+
"<b> Note: </b> \n",
98+
" The name for the classification task can vary. <br> E.g: Task name for 'Experiencer' can be 'Subject'.\n",
99+
" <br><br>To accommodate this, we have a list that stores the variations for the alternate names. This attribute can be found under `mc.config.general.alternative_category_names`\n",
100+
"<br> E.g. for Experiencer, it will be pre-loaded as alternative_category_names = ['Experiencer','Subject']"
96101
]
97102
},
98103
{
99-
"cell_type": "code",
100-
"execution_count": null,
101-
"id": "2933f7e1",
104+
"cell_type": "markdown",
105+
"id": "12e91f77",
102106
"metadata": {},
103-
"outputs": [],
104107
"source": [
105-
"for meta_model in meta_model_names:\n",
106-
" config_file = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config.json\")\n",
107-
" with open(config_file, 'r') as jfile:\n",
108-
" config_dict = json.load(jfile)\n",
109-
" print(f\"Model used for meta_{meta_model}:\",config_dict['model']['model_name'])"
108+
"<b> Note: </b> \n",
109+
" The name for the classes can vary too. <br> E.g: For Presence task, the class name can be 'Not present (False)' or 'False'\n",
110+
" <br><br>To accommodate this, we have a mapping that stores the variations for the alternate names. This attribute can be found under `mc.config.general.alternative_class_names`\n",
111+
"<br> E.g. for Presence, it will be pre-loaded as alternative_class_names = [[\"Hypothetical (N/A)\",\"Hypothetical\"],[\"Not present (False)\",\"False\"],[\"Present (True)\",\"True\"]]"
110112
]
111113
},
112114
{
@@ -131,9 +133,11 @@
131133
"\n",
132134
" # changing parameters\n",
133135
" mc.config.train['nepochs'] = 15\n",
134-
"\n",
135-
" save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
136-
" #Ideally this should replace the meta_models inside the modelpack\n",
136+
" \n",
137+
" # current model will be overwritten\n",
138+
" save_dir_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model)\n",
139+
" # to save the new model elsewhere, uncomment the below line\n",
140+
" #save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
137141
"\n",
138142
" # train the meta_model\n",
139143
" results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
@@ -147,7 +151,8 @@
147151
"id": "ab23e424",
148152
"metadata": {},
149153
"source": [
150-
"## If you dont have the model packs, and are training from scratch"
154+
"## If you don't have the model packs and are training from scratch\n",
155+
"<b>This is very rare; it is recommended to always use the model packs and then fine-tune them.</b>"
151156
]
152157
},
153158
{
@@ -167,8 +172,7 @@
167172
"\n",
168173
"tokenizer = TokenizerWrapperBERT.load(\"\", config.model['model_variant'])\n",
169174
"\n",
170-
"save_dir_path= \"test_meta\" # Where to save the meta_model and results. \n",
171-
"#Ideally this should replace the meta_models inside the modelpack\n",
175+
"save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
172176
"\n",
173177
"# Initialise and train meta_model\n",
174178
"mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n",

medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,16 @@
1010
},
1111
{
1212
"cell_type": "code",
13-
"execution_count": 1,
13+
"execution_count": null,
1414
"id": "d58c720d",
1515
"metadata": {},
1616
"outputs": [],
1717
"source": [
1818
"import json\n",
1919
"import os\n",
2020
"from datetime import date\n",
21-
"from medcat.cat import CAT\n",
2221
"from medcat.meta_cat import MetaCAT\n",
23-
"from medcat.config_meta_cat import ConfigMetaCAT\n",
24-
"from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT\n",
25-
"from tokenizers import ByteLevelBPETokenizer"
22+
"from medcat.config_meta_cat import ConfigMetaCAT"
2623
]
2724
},
2825
{
@@ -88,11 +85,24 @@
8885
},
8986
{
9087
"cell_type": "markdown",
91-
"id": "35aa5605",
88+
"id": "d4a3632b",
89+
"metadata": {},
90+
"source": [
91+
"<b> Note: </b> \n",
92+
" The name for the classification task can vary. <br> E.g: Task name for 'Experiencer' can be 'Subject'.\n",
93+
" <br><br>To accommodate this, we have a list that stores the variations for the alternate names. This attribute can be found under `mc.config.general.alternative_category_names`\n",
94+
"<br> E.g. for Experiencer, it will be pre-loaded as alternative_category_names = ['Experiencer','Subject']"
95+
]
96+
},
97+
{
98+
"cell_type": "markdown",
99+
"id": "d8bdc404",
92100
"metadata": {},
93101
"source": [
94-
"Before you run the next section please double check that the model meta_annotation names matches to those specified in the mct export.\n",
95-
"\n"
102+
"<b> Note: </b> \n",
103+
" The name for the classes can vary too. <br> E.g: For Presence task, the class name can be 'Not present (False)' or 'False'\n",
104+
" <br><br>To accommodate this, we have a mapping that stores the variations for the alternate names. This attribute can be found under `mc.config.general.alternative_class_names`\n",
105+
"<br> E.g. for Presence, it will be pre-loaded as alternative_class_names = [[\"Hypothetical (N/A)\",\"Hypothetical\"],[\"Not present (False)\",\"False\"],[\"Present (True)\",\"True\"]]"
96106
]
97107
},
98108
{
@@ -183,9 +193,14 @@
183193
" if class_wt_phase1:\n",
184194
" mc.config.train['class_weights'] = class_wt_phase1\n",
185195
"\n",
186-
" mc.config.train['nepochs'] = 30 #You can change the number of epochs, remember to keep them higher for phase 1\n",
196+
" #You can change the number of epochs, remember to keep them higher for phase 1\n",
197+
" mc.config.train['nepochs'] = 40 \n",
198+
"\n",
199+
" # current model will be overwritten\n",
200+
" save_dir_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model)\n",
201+
" # to save the new model elsewhere, uncomment the below line\n",
202+
" #save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
187203
"\n",
188-
" save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
189204
" results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
190205
" # Save results\n",
191206
" json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results_phase1.json'), 'w'))\n",
@@ -202,18 +217,21 @@
202217
" if class_wt_phase2:\n",
203218
" mc.config.train['class_weights'] = class_wt_phase2\n",
204219
"\n",
205-
" mc.config.train['nepochs'] = 15\n",
220+
" #You can change the number of epochs\n",
221+
" mc.config.train['nepochs'] = 20\n",
206222
"\n",
207-
" save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. Ensure to keep this same as Phase 1\n",
223+
" # Where to save the meta_model and results. Ensure to keep this same as Phase 1\n",
224+
" save_dir_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model)\n",
225+
" \n",
208226
" results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
209227
" # Save results\n",
210228
" json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results_phase2.json'), 'w'))\n",
211229
"\n",
212230
"#--------------------------------Driver--------------------------------\n",
213231
"for meta_model in meta_model_names:\n",
214-
" #To use your own class weights instead of the pre-defined ones for the 2 phases, uncomment the below lines\n",
215-
" '''class_wt_phase1 = []\n",
216-
" class_wt_phase2 = []'''\n",
232+
" #To use your own class weights instead of the pre-defined ones for the 2 phases, put the weights in the lists below\n",
233+
" class_wt_phase1 = [] # Example [0.4,0.4,0.2]\n",
234+
" class_wt_phase2 = [] # Example [0.4,0.3,0.3]\n",
217235
"\n",
218236
" # Train 2 phase learning\n",
219237
" logger.info(\"\\n********************Beginning Phase 1********************\")\n",
@@ -257,7 +275,7 @@
257275
"# Follow all the same steps till initializing the metacat model\n",
258276
"\n",
259277
"# Initialise and train meta_model\n",
260-
"mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n",
278+
"mc = MetaCAT.load(save_dir_path=os.path.join(base_dir_meta_models,\"meta_\"+meta_model))\n",
261279
"\n",
262280
"# the format expected is [[['text','of','the','document'], [index of medical entity], \"label\" ],\n",
263281
"# ['text','of','the','document'], [index of medical entity], \"label\" ]]\n",

0 commit comments

Comments
 (0)