From cb4ce7e2e3f4f3804507306a072d8848a996f8dc Mon Sep 17 00:00:00 2001
From: vicliv <78241611+vicliv@users.noreply.github.com>
Date: Tue, 25 Apr 2023 11:06:58 -0400
Subject: [PATCH] Update data_generator.py

Added a semi-supervised setting with normal samples only for training, and
added a parameter to select the maximum size of the dataset (it was
previously fixed at 10,000).
---
 data_generator.py | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/data_generator.py b/data_generator.py
index 5a0a47a..e03ecf7 100644
--- a/data_generator.py
+++ b/data_generator.py
@@ -16,18 +16,20 @@
 # currently, data generator only supports for generating the binary classification datasets
 class DataGenerator():
     def __init__(self, seed:int=42, dataset:str=None, test_size:float=0.3,
-                 generate_duplicates=True, n_samples_threshold=1000):
+                 generate_duplicates=True, n_samples_threshold=1000, normal = False):
         '''
         :param seed: seed for reproducible results
         :param dataset: specific the dataset name
         :param test_size: testing set size
         :param generate_duplicates: whether to generate duplicated samples when sample size is too small
         :param n_samples_threshold: threshold for generating the above duplicates, if generate_duplicates is False, then datasets with sample size smaller than n_samples_threshold will be dropped
+        :param normal: using only normal samples for training (second semi-supervised setting)
         '''

         self.seed = seed
         self.dataset = dataset
         self.test_size = test_size
+        self.normal = normal

         self.generate_duplicates = generate_duplicates
         self.n_samples_threshold = n_samples_threshold
@@ -206,7 +208,7 @@ def add_label_contamination(self, X, y, noise_ratio:float):
     def generator(self, X=None, y=None, minmax=True,
                   la=None, at_least_one_labeled=False,
                   realistic_synthetic_mode=None, alpha:int=5, percentage:float=0.1,
-                  noise_type=None, duplicate_times:int=2, contam_ratio=1.00, noise_ratio:float=0.05):
+                  noise_type=None, duplicate_times:int=2, contam_ratio=1.00, noise_ratio:float=0.05, max_size:int=10000):
         '''
         la: labeled anomalies, can be either the ratio of labeled anomalies or the number of labeled anomalies
         at_least_one_labeled: whether to guarantee at least one labeled anomalies in the training set
@@ -251,10 +253,10 @@ def generator(self, X=None, y=None, minmax=True,
             y = y[idx_duplicate]

         # if the dataset is too large, subsampling for considering the computational cost
-        if len(y) > 10000:
+        if len(y) > max_size:
             print(f'subsampling for dataset {self.dataset}...')
             self.utils.set_seed(self.seed)
-            idx_sample = np.random.choice(np.arange(len(y)), 10000, replace=False)
+            idx_sample = np.random.choice(np.arange(len(y)), max_size, replace=False)
             X = X[idx_sample]
             y = y[idx_sample]

@@ -307,7 +309,21 @@ def generator(self, X=None, y=None, minmax=True,
         self.utils.data_description(X=X, y=y)

         # spliting the current data to the training set and testing set
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, shuffle=True, stratify=y)
+        if self.normal: # if we use normal samples only for training
+            indices = np.arange(len(X))
+            normal_indices = indices[y == 0]
+            anomaly_indices = indices[y == 1]
+
+            train_size = round((1-self.test_size) * normal_indices.size)
+            train_indices, test_indices = normal_indices[:train_size], normal_indices[train_size:]
+            test_indices = np.append(test_indices, anomaly_indices)
+
+            X_train = X[train_indices]
+            y_train = y[train_indices]
+            X_test = X[test_indices]
+            y_test = y[test_indices]
+        else: # classical unsupervised
+            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, shuffle=True, stratify=y)

         # we respectively generate the duplicated anomalies for the training and testing set
         if noise_type == 'duplicated_anomalies':
@@ -355,4 +371,4 @@ def generator(self, X=None, y=None, minmax=True,
             y_train[idx_unlabeled] = 0
             y_train[idx_labeled_anomaly] = 1

-        return {'X_train':X_train, 'y_train':y_train, 'X_test':X_test, 'y_test':y_test}
\ No newline at end of file
+        return {'X_train':X_train, 'y_train':y_train, 'X_test':X_test, 'y_test':y_test}
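
For context, a minimal usage sketch of the two new options. This driver snippet is not part of the patch; the dataset name and the la value are illustrative placeholders, only the normal and max_size parameters are introduced here.

    from data_generator import DataGenerator

    # semi-supervised setting: train on normal samples only, and cap the
    # dataset at 5,000 points instead of the previously hard-coded 10,000
    data_generator = DataGenerator(seed=42, dataset='6_cardio', test_size=0.3, normal=True)
    data = data_generator.generator(la=0.1, max_size=5000)

    X_train, y_train = data['X_train'], data['y_train']  # normal samples only
    X_test, y_test = data['X_test'], data['y_test']      # remaining normals plus all anomalies

With normal=True the training indices are the first (1 - test_size) fraction of the normal indices (no further shuffling beyond the seeded subsampling above), and every anomaly is appended to the test set.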