Pushing change for MetaCAT training

shubham-s-agarwal · shubham-s-agarwal · commit e6c0041860c9 · 2025-05-22T14:44:20.000+01:00
Improved by addressing:
- saving model by overwriting the existing one as default
- mentioning the function that auto adjusts for variations in task and class names
- Added note that training from scratch is not recommended
diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb
@@ -2,19 +2,17 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "d58c720d",
    "metadata": {},
    "outputs": [],
    "source": [
     "import json\n",
     "import os\n",
     "from datetime import date\n",
-    "from medcat.cat import CAT\n",
     "from medcat.meta_cat import MetaCAT\n",
     "from medcat.config_meta_cat import ConfigMetaCAT\n",
-    "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT\n",
-    "from tokenizers import ByteLevelBPETokenizer"
+    "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBERT"
    ]
   },
   {
@@ -78,35 +76,39 @@
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "35aa5605",
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2933f7e1",
    "metadata": {},
+   "outputs": [],
    "source": [
-    "Before you run the next section please double check that the model meta_annotation names matches to those specified in the mct export.\n",
-    "\n"
+    "for meta_model in meta_model_names:\n",
+    "    config_file = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config.json\")\n",
+    "    with open(config_file, 'r') as jfile:\n",
+    "        config_dict = json.load(jfile)\n",
+    "    print(f\"Model used for meta_{meta_model}:\",config_dict['model']['model_name'])"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "8bf6f5c3",
+   "id": "3047b1d9",
    "metadata": {},
    "source": [
-    "Depending on the model pack you have, please run the LSTM model or BERT model section. <br>\n",
-    "If you are unsure, use this section to check the model type."
+    "<b> Note: </b> \n",
+    " The name for the classification task can vary. <br> E.g: Task name for 'Experiencer' can be 'Subject'.\n",
+    " <br><br>To accommodate this, we have a list that stores the variations for the alternate names. This attribute can be found under `mc.config.general.alternative_category_names`\n",
+    "<br> E.g. for Experiencer, it will be pre-loaded as alternative_category_names = ['Experiencer','Subject']"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2933f7e1",
+   "cell_type": "markdown",
+   "id": "12e91f77",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "for meta_model in meta_model_names:\n",
-    "    config_file = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config.json\")\n",
-    "    with open(config_file, 'r') as jfile:\n",
-    "        config_dict = json.load(jfile)\n",
-    "    print(f\"Model used for meta_{meta_model}:\",config_dict['model']['model_name'])"
+    "<b> Note: </b> \n",
+    " The name for the classes can vary too. <br> E.g: For Presence task, the class name can be 'Not present (False)' or 'False'\n",
+    " <br><br>To accommodate this, we have a mapping that stores the variations for the alternate names. This attribute can be found under `mc.config.general.alternative_class_names`\n",
+    "<br> E.g. for Presence, it will be pre-loaded as alternative_class_names = [[\"Hypothetical (N/A)\",\"Hypothetical\"],[\"Not present (False)\",\"False\"],[\"Present (True)\",\"True\"]]"
    ]
   },
   {
@@ -131,9 +133,11 @@
     "\n",
     "    # changing parameters\n",
     "    mc.config.train['nepochs'] = 15\n",
-    "\n",
-    "    save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
-    "    #Ideally this should replace the meta_models inside the modelpack\n",
+    "    \n",
+    "    # current model will be overwritten\n",
+    "    save_dir_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model)\n",
+    "    # to save the new model elsewhere, uncomment the below line\n",
+    "    #save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
     "\n",
     "    # train the meta_model\n",
     "    results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
@@ -147,7 +151,8 @@
    "id": "ab23e424",
    "metadata": {},
    "source": [
-    "## If you dont have the model packs, and are training from scratch"
+    "## If you don't have the model packs, and are training from scratch\n",
+    "<b>This is very rare; it is recommended to always use the model packs and then fine-tune them.</b>"
    ]
   },
   {
@@ -167,8 +172,7 @@
     "\n",
     "tokenizer = TokenizerWrapperBERT.load(\"\", config.model['model_variant'])\n",
     "\n",
-    "save_dir_path= \"test_meta\" # Where to save the meta_model and results. \n",
-    "#Ideally this should replace the meta_models inside the modelpack\n",
+    "save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
     "\n",
     "# Initialise and train meta_model\n",
     "mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n",
diff --git a/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb b/medcat/2_train_model/2_supervised_training/meta_annotation_training_advanced.ipynb
@@ -10,19 +10,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "d58c720d",
    "metadata": {},
    "outputs": [],
    "source": [
     "import json\n",
     "import os\n",
     "from datetime import date\n",
-    "from medcat.cat import CAT\n",
     "from medcat.meta_cat import MetaCAT\n",
-    "from medcat.config_meta_cat import ConfigMetaCAT\n",
-    "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT\n",
-    "from tokenizers import ByteLevelBPETokenizer"
+    "from medcat.config_meta_cat import ConfigMetaCAT"
    ]
   },
   {
@@ -88,11 +85,24 @@
   },
   {
    "cell_type": "markdown",
-   "id": "35aa5605",
+   "id": "d4a3632b",
+   "metadata": {},
+   "source": [
+    "<b> Note: </b> \n",
+    " The name for the classification task can vary. <br> E.g: Task name for 'Experiencer' can be 'Subject'.\n",
+    " <br><br>To accommodate this, we have a list that stores the variations for the alternate names. This attribute can be found under `mc.config.general.alternative_category_names`\n",
+    "<br> E.g. for Experiencer, it will be pre-loaded as alternative_category_names = ['Experiencer','Subject']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d8bdc404",
    "metadata": {},
    "source": [
-    "Before you run the next section please double check that the model meta_annotation names matches to those specified in the mct export.\n",
-    "\n"
+    "<b> Note: </b> \n",
+    " The name for the classes can vary too. <br> E.g: For Presence task, the class name can be 'Not present (False)' or 'False'\n",
+    " <br><br>To accommodate this, we have a mapping that stores the variations for the alternate names. This attribute can be found under `mc.config.general.alternative_class_names`\n",
+    "<br> E.g. for Presence, it will be pre-loaded as alternative_class_names = [[\"Hypothetical (N/A)\",\"Hypothetical\"],[\"Not present (False)\",\"False\"],[\"Present (True)\",\"True\"]]"
    ]
   },
   {
@@ -183,9 +193,14 @@
     "    if class_wt_phase1:\n",
     "        mc.config.train['class_weights'] = class_wt_phase1\n",
     "\n",
-    "    mc.config.train['nepochs'] = 30 #You can change the number of epochs, remember to keep them higher for phase 1\n",
+    "    #You can change the number of epochs, remember to keep them higher for phase 1\n",
+    "    mc.config.train['nepochs'] = 40 \n",
+    "\n",
+    "    # current model will be overwritten\n",
+    "    save_dir_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model)\n",
+    "    # to save the new model elsewhere, uncomment the below line\n",
+    "    #save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
     "\n",
-    "    save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
     "    results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
     "    # Save results\n",
     "    json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results_phase1.json'), 'w'))\n",
@@ -202,18 +217,21 @@
     "    if class_wt_phase2:\n",
     "        mc.config.train['class_weights'] = class_wt_phase2\n",
     "\n",
-    "    mc.config.train['nepochs'] = 15\n",
+    "    #You can change the number of epochs\n",
+    "    mc.config.train['nepochs'] = 20\n",
     "\n",
-    "    save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. Ensure to keep this same as Phase 1\n",
+    "    # Where to save the meta_model and results. Ensure to keep this same as Phase 1\n",
+    "    save_dir_path = os.path.join(base_dir_meta_models,\"meta_\"+meta_model)\n",
+    "    \n",
     "    results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
     "    # Save results\n",
     "    json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results_phase2.json'), 'w'))\n",
     "\n",
     "#--------------------------------Driver--------------------------------\n",
     "for meta_model in meta_model_names:\n",
-    "    #To use your own class weights instead of the pre-defined ones for the 2 phases, uncomment the below lines\n",
-    "    '''class_wt_phase1 = []\n",
-    "    class_wt_phase2 = []'''\n",
+    "    #To use your own class weights instead of the pre-defined ones for the 2 phases, put the weights in the lists below\n",
+    "    class_wt_phase1 = [] # Example [0.4,0.4,0.2]\n",
+    "    class_wt_phase2 = [] # Example [0.4,0.3,0.3]\n",
     "\n",
     "    # Train 2 phase learning\n",
     "    logger.info(\"\\n********************Beginning Phase 1********************\")\n",
@@ -257,7 +275,7 @@
     "# Follow all the same steps till initializing the metacat model\n",
     "\n",
     "# Initialise and train meta_model\n",
-    "mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n",
+    "mc = MetaCAT.load(save_dir_path=os.path.join(base_dir_meta_models,\"meta_\"+meta_model))\n",
     "\n",
     "# the format expected is [[['text','of','the','document'], [index of medical entity], \"label\" ],\n",
     "#                ['text','of','the','document'], [index of medical entity], \"label\" ]]\n",