superannotateai
diff --git a/‎.gitignore
Lines changed: 9 additions & 6 deletions b/‎.gitignore
Lines changed: 9 additions & 6 deletions
diff --git a/‎Changelog
Lines changed: 14 additions & 0 deletions b/‎Changelog
Lines changed: 14 additions & 0 deletions
diff --git a/‎Dockerfile
Lines changed: 46 additions & 0 deletions b/‎Dockerfile
Lines changed: 46 additions & 0 deletions
diff --git a/‎README.md
Lines changed: 150 additions & 1 deletion b/‎README.md
Lines changed: 150 additions & 1 deletion
diff --git a/‎etc/action_code.py
Lines changed: 87 additions & 0 deletions b/‎etc/action_code.py
Lines changed: 87 additions & 0 deletions
diff --git a/‎etc/configs/SA_config.ini
Lines changed: 2 additions & 0 deletions b/‎etc/configs/SA_config.ini
Lines changed: 2 additions & 0 deletions
diff --git a/‎etc/configs/service_config.json
Lines changed: 9 additions & 0 deletions b/‎etc/configs/service_config.json
Lines changed: 9 additions & 0 deletions
diff --git a/‎etc/configs/train_config.json
Lines changed: 14 additions & 0 deletions b/‎etc/configs/train_config.json
Lines changed: 14 additions & 0 deletions
diff --git a/‎pics/Main_readme_schemas.png
280 KB b/‎pics/Main_readme_schemas.png
280 KB
diff --git a/‎pics/examples/example_1.png b/‎pics/examples/example_1.png
@@ -127,6 +127,7 @@ venv/
 ENV/
 env.bak/
 venv.bak/
+local_env/
 
 # Spyder project settings
 .spyderproject
@@ -152,9 +153,11 @@ dmypy.json
 # Cython debug symbols
 cython_debug/
 
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+# VS Code IDE files
+.vscode/
+
+# Others
+my_configs/
+temp_data/
+output_dir/
+test.py
@@ -0,0 +1,14 @@
+## [1.0.0] - 2024-04-19
+
+_First release._
+
+### Added
+
+- **Breaking:** Base functionality for HTTP service 
+- **Breaking:** Fine-tuning of multi-class classification task 
+- **Breaking:** Action example for running pipeline in SuperAnnotate infrastructure
+
+
+## [0.0.1] - 2024-03-12
+
+_Init._
@@ -0,0 +1,46 @@
+FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
+
+# Set utility env variables
+# Put the Miniconda bin directory (installed below) first on PATH so that
+# `conda`, `pip3` and `uvicorn` resolve to the Miniconda installation.
+ENV PATH=/text_auto_classification_private/miniconda/bin:$PATH
+
+# Set paths as env variables
+# The config paths are supplied at build time (--build-arg) and persisted as
+# environment variables so they are still set when the container runs.
+ARG DEFAULT_SERVICE_CONFIG
+ARG DEFAULT_TRAINING_CONFIG
+
+ENV DEFAULT_SERVICE_CONFIG=${DEFAULT_SERVICE_CONFIG}
+ENV DEFAULT_TRAINING_CONFIG=${DEFAULT_TRAINING_CONFIG}
+
+# Install some basic utilities
+RUN apt-get update && apt-get install -y \
+    curl \
+    ca-certificates \
+    sudo \
+    git \
+    bzip2 \
+    build-essential \
+    libgl1 \
+    libglib2.0-0 \
+ && rm -rf /var/lib/apt/lists/*
+
+# Set workdir
+WORKDIR /text_auto_classification_private
+
+# Install Miniconda and Python
+RUN curl -sLo /text_auto_classification_private/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-py311_24.1.2-0-Linux-x86_64.sh \
+ && chmod +x /text_auto_classification_private/miniconda.sh \
+ && /text_auto_classification_private/miniconda.sh -b -p /text_auto_classification_private/miniconda \
+ && rm /text_auto_classification_private/miniconda.sh \
+ && conda install -y python==3.11 \
+ && pip3 install nvitop
+
+# Install python requirements
+# requirements.txt is copied on its own, before the source tree, so this layer
+# is only rebuilt when the dependency list changes.
+COPY text_auto_classification/requirements.txt .
+RUN pip3 install -r requirements.txt --no-cache
+
+# Copy code to container
+COPY text_auto_classification/ text_auto_classification/
+COPY etc/ etc/
+COPY version.txt .
+
+EXPOSE 8080
+# Exec form (JSON array) runs uvicorn directly as PID 1 instead of under
+# `/bin/sh -c`, so the stop signal sent by `docker stop` reaches the server.
+CMD ["uvicorn", "--host", "0.0.0.0", "--port", "8080", "text_auto_classification.fastapi_app:app"]
@@ -1 +1,150 @@
-# text_auto_classification
+# SuperAnnotate Text Auto Classification #
+
+[![Version](https://img.shields.io/badge/version-1.0.0-green.svg)]() [![Python 3.11](https://img.shields.io/badge/python-3.11-blue.svg)](https://www.python.org/downloads/release/python-3110/) [![CUDA 12.2](https://img.shields.io/badge/CUDA-12.2-green.svg)](https://developer.nvidia.com/cuda-12-2-0-download-archive)
+
+This repository contains an HTTP service designed for automatic text classification for pipeline integration with the SuperAnnotate platform.
+
+To integrate this HTTP service into your pipeline on the SuperAnnotate platform, follow these steps:
+
+- Create and set up a text project on the SuperAnnotate platform.
+- Deploy this HTTP service to a globally accessible location.
+- Configure a pipeline on the SuperAnnotate platform to link this service to your project.
+
+\
+<img src="pics/Main_readme_schemas.png" alt="Main schemas" width="500"/>
+
+\
+The project facilitates the automatic training of a text classification and data tagging model on the SuperAnnotate platform. \
+Here's a high-level overview of the process:
+
+1. **Annotate Data:** Annotate approximately 100 items per class using the SuperAnnotate platform.
+2. **Model Fine-Tuning:** Fine-tune the text classification model using the annotated data.
+3. **Prediction:** Use the fine-tuned model to predict labels for other items in your dataset.
+
+## How it works ##
+
+The project was created for the automatic training of a text classification and data tagging model on the SuperAnnotate platform. Everything happens in 3 main stages:
+
+### 1. Loading and preparing data ###
+
+- Annotations with file names are loaded from the specified project (and optional folders) from the platform.
+- Document texts are also loaded through the selected integration.
+- All this data is combined into a dataset and undergoes standard processing, such as removing empty, duplicate, and extremely short or long (fewer than 10 or more than 2000 words) texts. Texts are also preprocessed by converting them to lowercase and removing unnecessary spaces and line breaks. You can change the text preprocessing function (`text_auto_classification/utils/data/data_processing.py`) at any time to suit your needs.
+
+### 2. Model training ###
+
+- The training data is divided into training and validation data to evaluate the quality of the model and the learning process.
+- Next, the hyperparameters are initialized, which can be customized through the training config file.
+- The automatic fine-tuning of the model specified in the config begins. The model, training arguments, and trainer are all defined using standard HuggingFace abstractions.
+- The model's output layer is sized according to the number of classes in the training data.
+
+### 3. Prediction ###
+
+- All downloaded data from the platform that did not yet have labels is separated during the data preparation process into a separate set for future prediction.
+- At this stage, we run the texts of these elements through the model to obtain predictions.
+- These predictions are then uploaded to the platform.
+
+To configure the Pipeline and service operation from the platform side, read this [**Tutorial**](tutorial.md)
+
+## How to run service ##
+
+### API Service Configuration ###
+
+You can deploy the service wherever it is convenient; one of the basic options is on a created EC2 instance. Learn about instance creation and setup [here](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html).
+
+***NOTES***:
+
+- To verify that everything is functioning correctly, try calling the healthcheck endpoint.
+- Hardware requirements will depend largely on your arguments and the base model being used. However, it's recommended to utilize NVIDIA GPU architecture. For the basic configuration of the default service, it's advisable to use the following instance: [**g3s.xlarge**](https://instances.vantage.sh/aws/ec2/g3s.xlarge).
+- Also, ensure that the port on which your service is deployed (8080 by default) is open to the global network. Refer to this [**tutorial**](https://stackoverflow.com/questions/5004159/opening-port-80-ec2-amazon-web-services/10454688#10454688) for guidance on opening a port on an EC2 instance.
+
+### Pre-requirements ###
+
+To get started with the project, you should set up all the necessary configuration files. By default, they are located in the following path: `etc/configs`. There are 3 configs:
+
+1. **SA_config.ini**:
+   - This is the configuration file for the SuperAnnotate SDK. It contains your platform token and is needed for authorization in SAClient. You can read more [here](https://doc.superannotate.com/docs/python-sdk#with-arguments).
+
+2. **service_config.json**:
+   - This file contains the basic settings the service needs in order to run. It has the following fields:
+     - `SA_CONFIG_PATH`: The path to the first config (SA_config.ini).
+     - `SA_PROJECT_NAME`, `SA_FOLDERS`: The name and optionally the folders of the project on the platform with which to work.
+     - `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`: AWS keys, more details [here](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html).
+     - `AWS_URL_FOR_DATA_DOWNLOADS`, `AWS_URL_TO_MODEL_UPLOAD`: S3 URLs to the location of original documents and the place to save model checkpoints, respectively. More details [here](https://docs.aws.amazon.com/AmazonS3/latest/userguide/access-bucket-intro.html).
+
+    ***NOTE***: Please ensure that the file structure under `AWS_URL_FOR_DATA_DOWNLOADS` matches the folders listed in `SA_FOLDERS`.
+
+3. **train_config.json**:
+   - This config contains the basic arguments necessary for training:
+     - `pretrain_model`: The name of the pre-trained model from HuggingFace (a BERT-like model is recommended).
+     - `validation_ratio`: A value from 0 to 1, representing the proportion of data that will be used to validate the model.
+     - `max_length`: The maximum length of texts for the tokenizer, by default 512 is the limit for Bert-like models.
+     - The remaining keys correspond to the arguments of the following `TrainingArguments` class. More details [here](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments).
+
+After all the configs are configured as described, you can start the service. There are 2 options:
+
+### As Python file ###
+
+- Install Python version 3.11. More details [here](https://www.python.org/downloads/)
+- Install Nvidia drivers and CUDA toolkit using, for example, these instructions: [**Nvidia drivers**](https://ubuntu.com/server/docs/nvidia-drivers-installation) and [**CUDA toolkit**](https://developer.nvidia.com/cuda-12-2-2-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_local)
+- Install dependencies: `pip install -r ./text_auto_classification/requirements.txt`
+- Set the Python path variable: `export PYTHONPATH="."`
+- Run the API: `uvicorn --host 0.0.0.0 --port 8080 text_auto_classification.fastapi_app:app`
+
+### As Docker container ###
+
+- Initialize environment variables:
+  - Path to the general configuration file `DEFAULT_SERVICE_CONFIG`: `export DEFAULT_SERVICE_CONFIG=etc/configs/service_config.json`
+  - Path to the configuration file with parameters for training `DEFAULT_TRAINING_CONFIG`: `export DEFAULT_TRAINING_CONFIG=etc/configs/train_config.json`
+- Install Docker, Nvidia drivers, CUDA toolkit and NVIDIA Container Toolkit using, for example, these instructions: [**Docker**](https://docs.docker.com/engine/install/ubuntu/); [**Nvidia drivers**](https://ubuntu.com/server/docs/nvidia-drivers-installation); [**CUDA toolkit**](https://developer.nvidia.com/cuda-12-2-2-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_local); [**NVIDIA Container Toolkit**](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
+- Build the docker image: `sudo docker build -t text_auto_classification --build-arg DEFAULT_SERVICE_CONFIG=$DEFAULT_SERVICE_CONFIG --build-arg DEFAULT_TRAINING_CONFIG=$DEFAULT_TRAINING_CONFIG .`
+- Run a container: `sudo docker run --gpus all -p 8080:8080 -d text_auto_classification`
+
+## Endpoints ##
+
+The following endpoints are available in the Text Auto Classification service:
+
+- **GET /healthcheck**:
+  - **Summary**: Ping
+  - **Description**: Alive method
+  - **Input Type**: None
+  - **Output Type**: JSON
+  - **Output Values**:
+    - `{"healthy": true}`
+  - **Status Codes**:
+    - `200`: Successful Response
+
+- **POST /train_predict**:
+  - **Summary**: Train Predict
+  - **Description**: Train model on annotated data from SA project and auto annotate other data
+  - **Input Type**: None
+  - **Output Type**: JSON
+  - **Output Values**:
+    - `{"status": "Pipeline successfully started"}`
+    - `{"status": "Pipeline is already started"}`
+  - **Status Codes**:
+    - `200`: Pipeline successfully started
+    - `429`: Pipeline is already started
+
+- **GET /status**:
+  - **Summary**: Status
+  - **Description**: Method for status tracking
+  - **Input Type**: None
+  - **Output Type**: JSON
+  - **Output Values**:
+    - `{"status": "Not started"}`
+    - `{"status": "Downloading data"}`
+    - `{"status": "Model training"}`
+    - `{"status": "Predicting other items"}`
+    - `{"status": "Completed"}`
+    - `{"status": "Failed"}`
+  - **Status Codes**:
+    - `200`: Successful Response
+
+## Room for Improvements ##
+
+There are several areas where the project can be further improved:
+
+- **Implement support for multi-label classification**: Currently, the project focuses on single-label classification. Adding support for multi-label classification would enhance its versatility and applicability in various use cases.
+
+- **Logic for working with long texts, add auto chunking**: Handling long texts efficiently is crucial for many natural language processing tasks. Implementing logic to handle long texts, such as automatic chunking, would improve the project's performance and scalability when dealing with lengthy documents.
@@ -0,0 +1,87 @@
+import json
+import os
+import urllib.parse
+from datetime import datetime
+from time import sleep, time
+
+import requests
+from superannotate import SAClient
+
+SA_TOKEN = os.environ["SA_TOKEN"]
+URL = os.environ["URL"]
+# Minimum number of completed items required per class before Auto-Classification is started.
+# The default is 100; it can be changed, but lowering it may lead to unstable results.
+COUNT_ITEMS_PER_CLASS = 100
+
+sa = SAClient(token=SA_TOKEN)
+
+
+def read_status(resp):
+    """Return the ``status`` field of a JSON response body, or ``None``.
+
+    Args:
+        resp: Response-like object exposing the raw body as bytes in
+            ``content``.
+
+    Returns:
+        The value stored under the ``"status"`` key. ``None`` is returned when
+        the key is absent, or when the body is not a JSON object (for example
+        an HTML error page) and therefore carries no status at all.
+    """
+    try:
+        payload = json.loads(resp.content.decode())
+    except ValueError:
+        # UnicodeDecodeError (undecodable bytes) and json.JSONDecodeError
+        # (invalid JSON) are both subclasses of ValueError.
+        return None
+
+    # Valid JSON that is not an object (list, string, number) has no ``.get``.
+    if not isinstance(payload, dict):
+        return None
+    return payload.get("status")
+
+
+def check_enough_data(project_name, threshold):
+    """Check that every tag class has at least ``threshold`` completed items.
+
+    Args:
+        project_name: Name of the SuperAnnotate project to inspect.
+        threshold: Minimum number of completed items required per class.
+
+    Returns:
+        ``True`` when no tag class is below ``threshold`` and ``False``
+        otherwise. A message is printed for each class that falls short.
+    """
+    metadata = sa.get_project_metadata(
+        project=project_name,
+        include_annotation_classes=True,
+    )
+
+    # Only classes of type "tag" are taken into account.
+    tag_names = [
+        entry["name"]
+        for entry in metadata["classes"]
+        if entry["type"] == "tag"
+    ]
+
+    all_sufficient = True
+    for tag_name in tag_names:
+        completed = sa.query(
+            project=project_name,
+            query=f"metadata(status =Completed) AND instance(className = {tag_name})",
+        )
+        found = len(completed)
+
+        if found >= threshold:
+            continue
+
+        # Keep checking the remaining classes so every shortfall is reported.
+        print(f"Amount of completed items is too small for *{tag_name}*. {found}/{threshold}")
+        all_sufficient = False
+
+    return all_sufficient
+
+
+def handler(event, context):
+    """Start the train/predict pipeline and wait until it finishes.
+
+    Args:
+        event: Not used by this handler.
+        context: Mapping from which the project id is read at
+            ``context['after']['project_id']``.
+
+    Returns:
+        ``True`` when the service reports the status ``"Completed"``.
+        ``False`` when there is not enough completed data, the pipeline could
+        not be started, the status endpoint answers with a non-200 code, or
+        the service reports ``"Failed"``.
+    """
+    # Get project name
+    project_name = sa.get_project_by_id(context['after']['project_id'])['name']
+
+    # Can't run service if count completed items less than COUNT_ITEMS_PER_CLASS per class
+    if not check_enough_data(project_name, COUNT_ITEMS_PER_CLASS):
+        return False
+
+    # Call service
+    if not start_train_predict():
+        return False
+
+    status_url = urllib.parse.urljoin(URL, "text-auto-classification/status")
+
+    # Loop of monitoring the service and waiting for execution
+    while True:
+        resp = requests.get(status_url)
+
+        # Check the HTTP status before touching the body: an error response
+        # may not be JSON, and parsing it would raise instead of letting the
+        # handler report failure.
+        if resp.status_code != 200:
+            print(resp.status_code)
+            print(resp.text)
+            return False
+
+        # Parse the body once per poll and reuse the result below.
+        status = read_status(resp)
+        print(f"Status: {status}, waiting")
+        # Current local time formatted as "YYYY-MM-DD hh:mm:ss"
+        print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
+
+        if status == "Completed":
+            return True
+        if status == "Failed":
+            print(resp.status_code)
+            print(status)
+            return False
+
+        sleep(60)
+
+
+def start_train_predict():
+    """Ask the service to start the train/predict pipeline.
+
+    Returns:
+        ``True`` when the service answers with HTTP 200, ``False`` for any
+        other status code. On failure the status code and the raw response
+        body are printed.
+    """
+    resp = requests.post(urllib.parse.urljoin(URL, "text-auto-classification/train_predict"))
+
+    if resp.status_code == 200:
+        return True
+
+    # Print the raw body rather than parsing it: an error response is not
+    # guaranteed to be JSON, and a parse error here would hide the real failure.
+    print(resp.status_code)
+    print(resp.text)
+    return False
@@ -0,0 +1,2 @@
+[DEFAULT]
+SA_TOKEN = <token>
@@ -0,0 +1,9 @@
+{
+    "SA_CONFIG_PATH": "etc/configs/SA_config.ini",
+    "SA_PROJECT_NAME": "Project Name",
+    "SA_FOLDERS": ["Folder1", "Folder2"],
+    "AWS_ACCESS_KEY_ID": "AWS ACCESS KEY",
+    "AWS_SECRET_ACCESS_KEY": "AWS SECRET ACCESS KEY",
+    "AWS_URL_FOR_DATA_DOWNLOADS": "S3 URL",
+    "AWS_URL_TO_MODEL_UPLOAD": "S3 URL"
+}
@@ -0,0 +1,14 @@
+{
+    "pretrain_model": "FacebookAI/xlm-roberta-base",
+    "validation_ratio": 0.15,
+    "max_length": 512,
+    "optim": "adamw_torch",
+    "learning_rate": 3e-5,
+    "lr_scheduler_type": "cosine_with_restarts",
+    "warmup_ratio": 0.2,
+    "per_device_train_batch_size": 4,
+    "per_device_eval_batch_size": 4,
+    "gradient_accumulation_steps": 2,
+    "num_train_epochs": 10,
+    "weight_decay": 0.01
+}