From cb4ce7e2e3f4f3804507306a072d8848a996f8dc Mon Sep 17 00:00:00 2001
From: vicliv <78241611+vicliv@users.noreply.github.com>
Date: Tue, 25 Apr 2023 11:06:58 -0400
Subject: [PATCH] Update data_generator.py

Added a semi-supervised setting with normal samples only for training, and
added a parameter to select the maximum size of the dataset (it was
previously fixed at 10,000).
---
 data_generator.py | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/data_generator.py b/data_generator.py
index 5a0a47a..e03ecf7 100644
--- a/data_generator.py
+++ b/data_generator.py
@@ -16,18 +16,20 @@
 # currently, data generator only supports for generating the binary classification datasets
 class DataGenerator():
     def __init__(self, seed:int=42, dataset:str=None, test_size:float=0.3,
-                 generate_duplicates=True, n_samples_threshold=1000):
+                 generate_duplicates=True, n_samples_threshold=1000, normal = False):
         '''
         :param seed: seed for reproducible results
         :param dataset: specific the dataset name
         :param test_size: testing set size
         :param generate_duplicates: whether to generate duplicated samples when sample size is too small
         :param n_samples_threshold: threshold for generating the above duplicates, if generate_duplicates is False, then datasets with sample size smaller than n_samples_threshold will be dropped
+        :param normal: using only normal samples for training (second semi-supervised setting)
         '''

         self.seed = seed
         self.dataset = dataset
         self.test_size = test_size
+        self.normal = normal

         self.generate_duplicates = generate_duplicates
         self.n_samples_threshold = n_samples_threshold
@@ -206,7 +208,7 @@ def add_label_contamination(self, X, y, noise_ratio:float):
     def generator(self, X=None, y=None, minmax=True,
                   la=None, at_least_one_labeled=False,
                   realistic_synthetic_mode=None, alpha:int=5, percentage:float=0.1,
-                  noise_type=None, duplicate_times:int=2, contam_ratio=1.00, noise_ratio:float=0.05):
+                  noise_type=None, duplicate_times:int=2, contam_ratio=1.00, noise_ratio:float=0.05, max_size:int=10000):
         '''
         la: labeled anomalies, can be either the ratio of labeled anomalies or the number of labeled anomalies
         at_least_one_labeled: whether to guarantee at least one labeled anomalies in the training set
@@ -251,10 +253,10 @@ def generator(self, X=None, y=None, minmax=True,
             y = y[idx_duplicate]

         # if the dataset is too large, subsampling for considering the computational cost
-        if len(y) > 10000:
+        if len(y) > max_size:
             print(f'subsampling for dataset {self.dataset}...')
             self.utils.set_seed(self.seed)
-            idx_sample = np.random.choice(np.arange(len(y)), 10000, replace=False)
+            idx_sample = np.random.choice(np.arange(len(y)), max_size, replace=False)
             X = X[idx_sample]
             y = y[idx_sample]

@@ -307,7 +309,21 @@ def generator(self, X=None, y=None, minmax=True,
         self.utils.data_description(X=X, y=y)

         # spliting the current data to the training set and testing set
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, shuffle=True, stratify=y)
+        if self.normal: # if we use normal samples only for training
+            indices = np.arange(len(X))
+            normal_indices = indices[y == 0]
+            anomaly_indices = indices[y == 1]
+
+            train_size = round((1-self.test_size) * normal_indices.size)
+            train_indices, test_indices = normal_indices[:train_size], normal_indices[train_size:]
+            test_indices = np.append(test_indices, anomaly_indices)
+
+            X_train = X[train_indices]
+            y_train = y[train_indices]
+            X_test = X[test_indices]
+            y_test = y[test_indices]
+        else: # classical unsupervised
+            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, shuffle=True, stratify=y)

         # we respectively generate the duplicated anomalies for the training and testing set
         if noise_type == 'duplicated_anomalies':
@@ -355,4 +371,4 @@ def generator(self, X=None, y=None, minmax=True,
             y_train[idx_unlabeled] = 0
             y_train[idx_labeled_anomaly] = 1

-        return {'X_train':X_train, 'y_train':y_train, 'X_test':X_test, 'y_test':y_test}
\ No newline at end of file
+        return {'X_train':X_train, 'y_train':y_train, 'X_test':X_test, 'y_test':y_test}
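
For context, a minimal usage sketch of the two new options. This driver snippet is not part of the patch; the dataset name and the la value are illustrative placeholders, only the normal and max_size parameters are introduced here.

    from data_generator import DataGenerator

    # semi-supervised setting: train on normal samples only, and cap the
    # dataset at 5,000 points instead of the previously hard-coded 10,000
    data_generator = DataGenerator(seed=42, dataset='6_cardio', test_size=0.3, normal=True)
    data = data_generator.generator(la=0.1, max_size=5000)

    X_train, y_train = data['X_train'], data['y_train']  # normal samples only
    X_test, y_test = data['X_test'], data['y_test']      # remaining normals plus all anomalies

With normal=True the training indices are the first (1 - test_size) fraction of the normal indices (no further shuffling beyond the seeded subsampling above), and every anomaly is appended to the test set.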