From f01aa9fe0d908a8f3d60fad8c48cfb41a74b838f Mon Sep 17 00:00:00 2001 From: iamaayushrivastava Date: Sun, 5 Jan 2025 17:03:13 +0530 Subject: [PATCH 1/3] Add scripts --- scripts/create-ll-files.sh | 57 ++++ scripts/models/histogram-mlp-model.py | 125 +++++++ scripts/models/ir2vec-classifier.py | 315 ++++++++++++++++++ scripts/models/milepost-mlp-model.py | 159 +++++++++ scripts/preprocessing/copy-folders.py | 20 ++ .../preprocessing/copy-profiled-ll-files.py | 44 +++ scripts/preprocessing/delete-subfolders.py | 32 ++ scripts/preprocessing/folder-count.py | 40 +++ scripts/preprocessing/folder-preprocessing.py | 50 +++ .../generate-ir2vec-embeddings.py | 84 +++++ scripts/preprocessing/ir2vec-preprocess.py | 132 ++++++++ scripts/preprocessing/merge-directories.py | 46 +++ scripts/preprocessing/rename-folders.py | 39 +++ scripts/preprocessing/split-dataset.py | 206 ++++++++++++ .../preprocessing/train-test-val-to-csv.py | 98 ++++++ .../generate-input-folder-with-input-files.py | 82 +++++ ...led-ll-files-with-testcases-using-cores.sh | 118 +++++++ .../profiling/profiling-without-parallel.sh | 123 +++++++ .../profiling/profiling-without-testcases.sh | 101 ++++++ scripts/read-npz.files.py | 7 + 20 files changed, 1878 insertions(+) create mode 100644 scripts/create-ll-files.sh create mode 100644 scripts/models/histogram-mlp-model.py create mode 100644 scripts/models/ir2vec-classifier.py create mode 100644 scripts/models/milepost-mlp-model.py create mode 100644 scripts/preprocessing/copy-folders.py create mode 100644 scripts/preprocessing/copy-profiled-ll-files.py create mode 100644 scripts/preprocessing/delete-subfolders.py create mode 100644 scripts/preprocessing/folder-count.py create mode 100644 scripts/preprocessing/folder-preprocessing.py create mode 100644 scripts/preprocessing/generate-ir2vec-embeddings.py create mode 100644 scripts/preprocessing/ir2vec-preprocess.py create mode 100644 scripts/preprocessing/merge-directories.py create mode 100644 
#!/bin/bash
#
# Compile every C/C++ source file found under the numerically named
# subfolders of SRC_DIR to LLVM IR (.ll) at -O0, mirroring the subfolder
# layout under ll_FD. Compilations run in the background, at most MAX_CORES
# at a time.

CLANG=/usr/lib/llvm-17/bin/clang-17
SRC_DIR=/Pramana/IR2Vec/datasets/Codeforces-Src-Files
ll_FD=/Pramana/IR2Vec/dataset-opt-levels/codeforces/O0

mkdir -p "${ll_FD}"

# Determine the range of numeric subfolder names
FIRST=$(find "${SRC_DIR}" -mindepth 1 -maxdepth 1 -type d -exec basename {} \; | grep -E '^[0-9]+$' | sort -n | head -1)
LAST=$(find "${SRC_DIR}" -mindepth 1 -maxdepth 1 -type d -exec basename {} \; | grep -E '^[0-9]+$' | sort -n | tail -1)

echo "First: $FIRST"
echo "Last: $LAST"

# Validate that FIRST and LAST are numeric (both are empty when SRC_DIR has
# no numeric subfolder at all)
if ! [[ "$FIRST" =~ ^[0-9]+$ && "$LAST" =~ ^[0-9]+$ ]]; then
    echo "Error: Subfolder names must be numeric. Check the directory structure."
    exit 1
fi

# Maximum number of compiler processes allowed to run at the same time
MAX_CORES=40

# Block until fewer than MAX_CORES background jobs are running
semaphore() {
    while [ "$(jobs -r | wc -l)" -ge "$MAX_CORES" ]; do
        sleep 1
    done
}

# Loop through the dynamically calculated range of subfolders
for dir in $(seq "$FIRST" "$LAST"); do
    DIR=${dir}
    FULL_DIR="${SRC_DIR}/${DIR}"
    echo "${DIR} ${FULL_DIR}"

    # Gaps in the numbering are expected, so missing folders are only reported
    if [ -d "$FULL_DIR" ]; then
        mkdir -p "${ll_FD}/${DIR}"

        # The file list is fed through process substitution instead of a
        # pipe: a piped `while` runs in a subshell, so its background jobs
        # would not be children of this shell and the final `wait` would
        # return before the compilations have finished.
        while IFS= read -r -d '' line; do
            semaphore # Wait if too many jobs are running
            (
                filename=$(basename "$line")
                filename=${filename%.*}
                "${CLANG}" -O0 -S -emit-llvm -I "$FULL_DIR" "$line" -o "${ll_FD}/${DIR}/${filename}.ll"
            ) &
        done < <(find "$FULL_DIR" -regex '.*\.\(c\|cc\|cpp\)' -print0)
    else
        echo "Directory ${FULL_DIR} does not exist. Skipping."
    fi
done

# Wait for every outstanding compilation before reporting completion
wait

echo "Done"
# Model definition
def getModel(input_dim, output_dim):
    """Build, compile and summarise the MLP classifier.

    The network has three hidden stages of 650, 600 and 500 units, each made
    of Dense -> BatchNormalization -> ReLU -> Dropout(0.25), followed by a
    Dense(output_dim) -> BatchNormalization -> softmax head. Every Dense
    layer uses an unseeded Glorot-normal initializer.

    Args:
        input_dim: Length of one input feature vector.
        output_dim: Number of output classes.

    Returns:
        The model, compiled with Adam (learning rate 0.001),
        categorical cross-entropy loss and the accuracy metric.
    """
    def glorot():
        # A fresh initializer object per layer, exactly one per Dense.
        return keras.initializers.glorot_normal(seed=None)

    network = Sequential()

    hidden_stages = (
        Dense(650, input_shape=(input_dim,), kernel_initializer=glorot()),
        Dense(600, kernel_initializer=glorot()),
        Dense(500, kernel_initializer=glorot()),
    )
    for dense in hidden_stages:
        network.add(dense)
        network.add(BatchNormalization())
        network.add(Activation('relu'))
        network.add(Dropout(0.25))

    # Classification head
    for layer in (Dense(output_dim, kernel_initializer=glorot()),
                  BatchNormalization(),
                  Activation('softmax')):
        network.add(layer)

    network.compile(
        loss='categorical_crossentropy',
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    network.summary()

    return network
def _list_class_dirs(directory):
    """Return the sorted names of the immediate sub-directories of *directory*."""
    return sorted(
        name for name in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, name))
    )


# Load data from directory
def load_data_from_directory(directory, class_to_label=None):
    """Load every ``.npz`` sample stored under the class folders of *directory*.

    Each sub-directory is one class. Only directories take part in the label
    mapping, so a stray file in *directory* can no longer shift the label
    indices.

    Args:
        directory: Folder whose sub-directories each hold the ``.npz`` files
            of one class; every archive must contain a ``values`` array.
        class_to_label: Optional ``{folder name: integer label}`` mapping.
            When omitted it is built from the sorted sub-directory names of
            *directory*. Folders missing from the mapping are skipped.

    Returns:
        ``(data, labels)`` as NumPy arrays: the flattened ``values`` array of
        every readable sample and the matching integer labels.
    """
    class_names = _list_class_dirs(directory)
    if class_to_label is None:
        class_to_label = {cls: idx for idx, cls in enumerate(class_names)}

    data = []
    labels = []
    for cls in class_names:
        if cls not in class_to_label:
            print(f"Skipping unknown class folder '{cls}' in {directory}")
            continue
        class_path = os.path.join(directory, cls)
        # Sorted so the sample order does not depend on the file system.
        for file_name in sorted(os.listdir(class_path)):
            if not file_name.endswith(".npz"):
                continue
            file_path = os.path.join(class_path, file_name)
            try:
                # The context manager closes the archive's file handle.
                with np.load(file_path) as archive:
                    values = archive["values"]
                data.append(values.flatten())
                labels.append(class_to_label[cls])
            except Exception as e:
                # Deliberately best-effort: report the unreadable sample and go on.
                print(f"Failed to load {file_path}: {e}")

    return np.array(data), np.array(labels)


# Prepare train and test data
def prepare_data(train_dir, test_dir):
    """Load the train and test sets with one shared label mapping.

    The mapping is derived from the class folders of *train_dir* and reused
    for *test_dir*, so a class missing from one of the two trees cannot make
    the same folder name map to different labels.

    Returns:
        ``(X_train, y_train, X_test, y_test)``.
    """
    class_to_label = {cls: idx for idx, cls in enumerate(_list_class_dirs(train_dir))}
    X_train, y_train = load_data_from_directory(train_dir, class_to_label)
    X_test, y_test = load_data_from_directory(test_dir, class_to_label)

    return X_train, y_train, X_test, y_test


# Main function
def main(
    train_dir="/home/aayusphere/Program-Classification/yali/Volume/Embeddings/milepost/codeforcestrainO0",
    test_dir="/home/aayusphere/Program-Classification/yali/Volume/Embeddings/milepost/codeforcestestO0",
):
    """Train the MLP on *train_dir*, evaluate it on *test_dir* and save it.

    Args:
        train_dir: Folder with one sub-folder of ``.npz`` samples per class.
        test_dir: Folder with the same layout holding the test samples.

    Returns:
        The trained model.
    """
    # Prepare data
    X_train, y_train, X_test, y_test = prepare_data(train_dir, test_dir)

    # Check data shapes
    print(f"Training data shape: {X_train.shape}")
    print(f"Training labels shape: {y_train.shape}")
    print(f"Testing data shape: {X_test.shape}")
    print(f"Testing labels shape: {y_test.shape}")

    # One-hot encode labels. The width comes from the largest label rather
    # than the count of distinct labels, so a class folder that produced no
    # sample cannot push a label outside the one-hot range.
    num_classes = int(max(y_train.max(), y_test.max())) + 1
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    # No train-test split for validation, using all X_train and y_train for training
    model = getModel(X_train.shape[1], num_classes)

    # NOTE(review): an integer save_freq counts batches, not epochs - confirm
    # that a checkpoint every 500 batches is what is wanted here.
    mc = keras.callbacks.ModelCheckpoint(
        filepath='/home/aayusphere/Program-Classification/milepost/weights_epoch_{epoch:08d}.weights.h5',
        save_weights_only=True,
        save_freq=500)

    # Train the model
    model.fit(X_train,
              y_train,
              batch_size=128,
              epochs=2000,
              verbose=1,
              callbacks=[mc])

    # Evaluate model
    y_pred = np.argmax(model.predict(X_test), axis=1)
    y_true = np.argmax(y_test, axis=1)
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")

    # Save the trained model; the message reports the file actually written.
    model_path = "codeforces-milepost-ir2vec-model.h5"
    model.save(model_path)
    print(f"Saved model to disk as '{model_path}'.")

    return model


# Execute the script
if __name__ == "__main__":
    main()
# Create the model
def getModel(input_dim, output_dim):
    """Build, compile and summarise the IR2Vec MLP classifier.

    Three hidden stages (650, 600 and 500 units) of
    Dense -> BatchNormalization -> ReLU -> Dropout(0.25) are followed by a
    Dense(output_dim) -> BatchNormalization -> softmax head. All Dense
    layers use an unseeded Glorot-normal initializer.

    Args:
        input_dim: Length of one input feature vector.
        output_dim: Number of output classes.

    Returns:
        The model, compiled with Adam, categorical cross-entropy loss and
        the accuracy metric.
    """
    model = Sequential()

    model.add(
        Dense(650,
              input_shape=(input_dim, ),
              kernel_initializer=keras.initializers.glorot_normal(seed=None)))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.25))

    model.add(
        Dense(600,
              kernel_initializer=keras.initializers.glorot_normal(seed=None)))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.25))

    model.add(
        Dense(500,
              kernel_initializer=keras.initializers.glorot_normal(seed=None)))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.25))

    model.add(
        Dense(output_dim,
              kernel_initializer=keras.initializers.glorot_normal(seed=None)))
    model.add(BatchNormalization())
    model.add(Activation('softmax'))

    # `learning_rate` replaces the legacy `lr` keyword, and the legacy
    # `decay=0.0` (no decay) is dropped: current Keras optimizers reject
    # both legacy keywords. The hyper-parameters themselves are unchanged.
    opt = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=opt, metrics=['accuracy'])
    model.summary()

    return model
def _prepare_split(x, y, x_min, x_max, num_classes, ipca):
    """Apply the training-set scaling, label shift, one-hot encoding and PCA to one split."""
    x = np.array((x - x_min) / (x_max - x_min))
    y = keras.utils.to_categorical(np.array(y) - 1, num_classes)
    return ipca.transform(x), y


# train the model on the given data
def train(x_train, y_train, x_test, y_test, x_val, y_val, options, model):
    """Fit the classifier and store what is needed to evaluate it later.

    Features are min-max scaled with the training-set statistics and reduced
    with an IncrementalPCA fitted on the training set. Labels are shifted
    down by one and one-hot encoded. The fitted model is saved to
    ``codejam-137-ir2vec-fa-model.h5``; ``num_classes``, the scaling bounds
    and the fitted PCA are pickled to ``dictionary.pkl``.

    Args:
        x_train, y_train: Training features and labels.
        x_test, y_test: Optional test split (``None`` to skip evaluation).
        x_val, y_val: Optional validation split (``None`` for none).
        options: Object exposing ``batch_size`` and ``epochs``.
        model: Previously saved model to keep training, or ``None`` to build
            a new one with ``getModel``.
    """
    import math  # only needed to convert the checkpoint period to batches

    X_min = x_train.min()
    X_max = x_train.max()
    # A feature that is constant over the training set has a zero range and
    # would turn into 0/0 = NaN below (and break the PCA). Such a range is
    # replaced by 1. Because the adjusted X_max is the one that is pickled,
    # any later scaling with the stored bounds stays consistent.
    value_range = X_max - X_min
    X_max = X_min + np.where(value_range == 0, 1, value_range)

    print(f" Number of classes: {np.unique(y_train).shape[0]}")

    # Taken from the raw labels, i.e. before they are shifted down by one
    # below, so the one-hot width is the largest raw label plus one.
    num_classes = np.max(y_train) + 1
    print(f"Adjusted number of classes: {num_classes}")

    # Normalize and preprocess data
    x_train = np.array((x_train - X_min) / (X_max - X_min))
    y_train = np.array(y_train) - 1
    print(f" After subtracting -1 from labels: {y_train}")
    print(f" After subtracting -1 from labels: {np.unique(y_train).shape[0]}")

    # Print the unique values in y_train to check the range of labels
    print("Unique values in y_train:", np.unique(y_train))

    y_train = keras.utils.to_categorical(y_train, num_classes)

    print(y_train)

    # PCA transformation. The number of components cannot exceed the number
    # of samples or features, so it is capped for small or narrow inputs.
    ipca = IncrementalPCA(n_components=min(300, *x_train.shape))
    ipca.fit(x_train)
    x_train = ipca.transform(x_train)

    # Handle validation data similarly
    val_tuple = None
    if x_val is not None:
        val_tuple = _prepare_split(x_val, y_val, X_min, X_max, num_classes, ipca)

    # Setup model and training parameters
    batch_size = options.batch_size
    epochs = options.epochs

    # Checkpoint every 500 epochs. The removed `period` keyword is replaced
    # by an integer `save_freq`, which counts batches.
    steps_per_epoch = math.ceil(x_train.shape[0] / batch_size)
    mc = keras.callbacks.ModelCheckpoint(
        filepath='/home/cs24mtech02001/Aayush-IR2Vec/program-classification-model-weights/ir2vec/fa/codejam-ir2vec-fa-model/weights{epoch:08d}.h5',
        save_weights_only=False,
        save_freq=500 * steps_per_epoch)

    if model is None:
        model = getModel(x_train.shape[1], num_classes)

    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              validation_data=val_tuple, callbacks=[mc])

    model.save("codejam-137-ir2vec-fa-model.h5")
    print("Saved model to disk")

    if x_test is not None:
        x_test, y_test = _prepare_split(x_test, y_test, X_min, X_max, num_classes, ipca)
        score = model.evaluate(x_test, y_test, verbose=0)
        print('Test Accuracy : {acc:.3f}%'.format(acc=score[1]*100))

    # Persist everything a later evaluation run needs to preprocess its data
    with open('dictionary.pkl', 'wb') as f:
        pickle.dump(num_classes, f)
        pickle.dump(X_min, f)
        pickle.dump(X_max, f)
        pickle.dump(ipca, f)
# test the learnt model on the data
def test(X, targetLabel, model):
    """Evaluate *model* on one data set and print its accuracy.

    The preprocessing state (``num_classes``, scaling bounds and fitted PCA)
    is read from ``dictionary.pkl`` in the current directory, in the order in
    which the four objects are loaded below.

    Args:
        X: Feature table of the samples to evaluate.
        targetLabel: Their labels; they are shifted down by one before being
            one-hot encoded.
        model: Loaded Keras model to evaluate.
    """
    # NOTE(review): unpickling runs arbitrary code stored in the file - only
    # use a dictionary.pkl that comes from a trusted training run.
    with open('dictionary.pkl', 'rb') as f:
        num_classes = pickle.load(f)
        X_min = pickle.load(f)
        X_max = pickle.load(f)
        ipca = pickle.load(f)

    X = (X - X_min) / (X_max - X_min)
    X = np.array(X)
    targetLabel = np.array(targetLabel)
    targetLabel = targetLabel - 1
    targetLabel = keras.utils.to_categorical(targetLabel, num_classes)
    X = ipca.transform(X)

    score = model.evaluate(X, targetLabel, verbose=0)
    print('Test accuracy : {acc:.3f}%'.format(acc=score[1]*100))


# The name starts with "test", so pytest would otherwise try to collect this
# function as a test case whenever the module is imported into a test file.
test.__test__ = False


def _read_split(path):
    """Read a tab-separated ``label<TAB>dim1<TAB>...`` file.

    Returns:
        ``(features, labels)``, or ``(None, None)`` when *path* is ``None``.
        The feature columns are renumbered from 0.
    """
    if path is None:
        return None, None
    frame = pd.read_csv(path, sep='\t', header=None)
    labels = frame.loc[:, 0]
    features = frame.loc[:, 1:]
    features.columns = range(features.shape[1])
    return features, labels


def main(argv=None):
    """Parse the command line, then train or evaluate the classifier.

    Args:
        argv: Argument list without the program name; ``None`` means
            ``sys.argv[1:]``.

    Exits with status 1 when neither ``--train`` nor ``--test`` is given, or
    when evaluation is requested without ``--model``.
    """
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('-tr', '--train', dest='train', metavar='FILE', help='Path Of the Data/embedding file having training data', default=None)
    parser.add_argument('-t', '--test', dest='test', metavar='FILE', help='Path Of the Data/embedding file having testing data', default=None)
    parser.add_argument('-v', '--val', dest='val', metavar='FILE', help='Path Of the Data/embedding file having validation data', default=None)
    parser.add_argument('-e', '--epochs', dest='epochs', required=False, type=int, help='Number of Epoches', default=100)
    parser.add_argument('-bs', '--batch_size', dest='batch_size', required=False, type=int, help='Tune the batch size', default=32)
    parser.add_argument('-m', '--model', dest='model', metavar='FILE', help='Path Of the file with learnt weights.', required=False, default=None)
    args = parser.parse_args(argv)

    if args.test is None and args.train is None:
        print("Enter training or testing data")
        sys.exit(1)

    # The test file is optional when training, so it is read only when given.
    X_test, y_test = _read_split(args.test)
    if X_test is not None:
        print("Test Set:")
        print(f"X_test shape: {X_test.shape}")
        print(f"y_test unique counts: \n{y_test.value_counts()}")

    if args.train is not None:
        X, Y = _read_split(args.train)
        print("Train Set:")
        print(f"X_train shape: {X.shape}")
        print(f"y_train unique counts: \n{Y.value_counts()}")

        X_val, y_val = _read_split(args.val)
        if X_val is not None:
            print("Validation Set:")
            print(f"X_val shape: {X_val.shape}")
            print(f"y_val unique counts: \n{y_val.value_counts()}")

        model = None
        if args.model is not None:
            print('============================The trained weight to initialize the NN=========================================')
            model = keras.models.load_model(args.model)
            model.summary()

        train(X, Y, X_test, y_test, X_val, y_val, args, model)

    else:
        # Evaluation only: a trained model is required.
        if args.model is None:
            print('***********************Model is not passed in the testing**************')
            sys.exit(1)

        model = keras.models.load_model(args.model)
        model.summary()

        test(X_test, y_test, model)


# Entry Point of the program
if __name__ == '__main__':
    main()
# Model definition
def getModel(input_dim, output_dim):
    """Assemble and compile the MLP used for program classification.

    Layout: Dense(650) / Dense(600) / Dense(500), each followed by
    BatchNormalization, a ReLU activation and Dropout(0.25); then
    Dense(output_dim), BatchNormalization and softmax. Dense weights are
    drawn from an unseeded Glorot-normal initializer.

    Args:
        input_dim: Number of features per sample.
        output_dim: Number of classes.

    Returns:
        The compiled model (Adam with learning rate 0.001, categorical
        cross-entropy, accuracy metric). Its summary is printed.
    """
    classifier = Sequential()

    for position, units in enumerate((650, 600, 500)):
        init = keras.initializers.glorot_normal(seed=None)
        if position == 0:
            # Only the first layer declares the input shape.
            dense = Dense(units, input_shape=(input_dim,), kernel_initializer=init)
        else:
            dense = Dense(units, kernel_initializer=init)
        classifier.add(dense)
        classifier.add(BatchNormalization())
        classifier.add(Activation('relu'))
        classifier.add(Dropout(0.25))

    head_init = keras.initializers.glorot_normal(seed=None)
    classifier.add(Dense(output_dim, kernel_initializer=head_init))
    classifier.add(BatchNormalization())
    classifier.add(Activation('softmax'))

    adam = keras.optimizers.Adam(learning_rate=0.001)
    classifier.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    classifier.summary()

    return classifier
def load_data_from_directory(directory, max_length=56, class_to_label=None):
    """Load the ``.npz`` samples under *directory* as fixed-length vectors.

    Each sub-directory is one class. Only directories take part in the label
    mapping, so a stray file in *directory* cannot shift the label indices.
    Every sample is flattened, then zero-padded or truncated to *max_length*.

    Args:
        directory: Folder whose sub-directories each hold the ``.npz`` files
            of one class; every archive must contain a ``values`` array.
        max_length: Length of every returned feature vector.
        class_to_label: Optional ``{folder name: integer label}`` mapping.
            When omitted it is built from the sorted sub-directory names.
            Folders missing from the mapping are skipped.

    Returns:
        ``(data, labels)`` as NumPy arrays.
    """
    class_names = sorted(
        name for name in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, name))
    )
    if class_to_label is None:
        class_to_label = {cls: idx for idx, cls in enumerate(class_names)}

    data = []
    labels = []
    for cls in class_names:
        if cls not in class_to_label:
            print(f"Skipping unknown class folder '{cls}' in {directory}")
            continue
        class_path = os.path.join(directory, cls)
        # Sorted so the sample order does not depend on the file system.
        for file_name in sorted(os.listdir(class_path)):
            if not file_name.endswith(".npz"):
                continue
            file_path = os.path.join(class_path, file_name)
            try:
                # The context manager closes the archive's file handle.
                with np.load(file_path) as archive:
                    flattened = archive["values"].flatten()

                if len(flattened) == 0:
                    # An empty sample becomes an all-zero vector.
                    print(f"Replacing zero-length data in {file_path}")
                    padded = np.zeros(max_length)
                elif len(flattened) < max_length:
                    padded = np.pad(flattened, (0, max_length - len(flattened)), 'constant')
                else:
                    padded = flattened[:max_length]

                data.append(padded)
                labels.append(class_to_label[cls])
            except Exception as e:
                # Deliberately best-effort: report the unreadable sample and go on.
                print(f"Failed to load {file_path}: {e}")

    return np.array(data), np.array(labels)
# Prepare train and test data
def prepare_data(train_dir, test_dir):
    """Load the train and test sets.

    Returns:
        ``(X_train, y_train, X_test, y_test)``.
    """
    # NOTE(review): each call derives its own folder-to-label mapping from
    # the directory it reads, so both trees must contain the same class
    # folders for the labels to agree.
    X_train, y_train = load_data_from_directory(train_dir)
    X_test, y_test = load_data_from_directory(test_dir)

    return X_train, y_train, X_test, y_test


# Main function
def main(
    train_dir="/home/cs24mtech02001/Program-Classification/yali/Volume/Embeddings/milepost/codejam-trainO0",
    test_dir="/home/cs24mtech02001/Program-Classification/yali/Volume/Embeddings/milepost/codejam-testO0",
):
    """Train the MLP on *train_dir*, evaluate it on *test_dir* and save it.

    Args:
        train_dir: Folder with one sub-folder of ``.npz`` samples per class.
        test_dir: Folder with the same layout holding the test samples.

    Returns:
        The trained model.
    """
    # Prepare data
    X_train, y_train, X_test, y_test = prepare_data(train_dir, test_dir)

    # Check data shapes
    print(f"Training data shape: {X_train.shape}")
    print(f"Training labels shape: {y_train.shape}")
    print(f"Testing data shape: {X_test.shape}")
    print(f"Testing labels shape: {y_test.shape}")

    # One-hot encode labels. The width comes from the largest label rather
    # than the count of distinct labels, so a class folder that produced no
    # sample cannot push a label outside the one-hot range.
    num_classes = int(max(y_train.max(), y_test.max())) + 1
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    # No train-test split for validation, using all X_train and y_train for training
    model = getModel(X_train.shape[1], num_classes)

    # NOTE(review): an integer save_freq counts batches, not epochs - confirm
    # that a checkpoint every 500 batches is what is wanted here.
    mc = keras.callbacks.ModelCheckpoint(
        filepath='/Pramana/IR2Vec/pc-embeddings-model-weight/codejam-milepost/weights_epoch_{epoch:08d}.weights.h5',
        save_weights_only=True,
        save_freq=500)

    # Train the model
    model.fit(X_train,
              y_train,
              batch_size=128,
              epochs=2000,
              verbose=1,
              callbacks=[mc])

    # Evaluate model
    y_pred = np.argmax(model.predict(X_test), axis=1)
    y_true = np.argmax(y_test, axis=1)
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")

    # Save the trained model; the message reports the file actually written.
    model_path = "milepost-ir2vec-codejam-model.h5"
    model.save(model_path)
    print(f"Saved model to disk as '{model_path}'.")

    return model


# Execute the script
if __name__ == "__main__":
    main()
# Copy every immediate sub-folder of the source directory, with all of its
# contents, into the destination directory. Plain files that sit directly in
# the source directory are ignored.
source_dir = "/home/cs24mtech02001/Aayush-IR2Vec/CodeJam-data/srcfiles"
destination_dir = "/home/cs24mtech02001/Aayush-IR2Vec/CodeJam-data/data"

# The destination may already exist from an earlier run
os.makedirs(destination_dir, exist_ok=True)

for entry_name in os.listdir(source_dir):
    origin = os.path.join(source_dir, entry_name)
    if not os.path.isdir(origin):
        continue  # only folders are copied
    # dirs_exist_ok lets a re-run copy into folders that are already there
    shutil.copytree(origin, os.path.join(destination_dir, entry_name), dirs_exist_ok=True)

print("All folders copied successfully to", destination_dir)
def copy_profiled_ll_files(source_top_level, target_top_level):
    """Collect the files of every ``profiled-ll-files`` directory.

    For each directory below *source_top_level* that contains a
    ``profiled-ll-files`` sub-directory, the regular files of that
    sub-directory are copied to ``<target_top_level>/<parent name>/``, where
    the parent name is the last path component of the containing directory.
    Only that one component is kept, so equally named parents at different
    depths end up in the same target folder.

    :param source_top_level: The source top-level directory containing subdirectories.
    :param target_top_level: The target top-level directory where subdirectories and files will be copied.
    :return: The number of files copied.
    """
    # Ensure the target top-level directory exists
    os.makedirs(target_top_level, exist_ok=True)

    copied = 0
    for root, dirs, _files in os.walk(source_top_level):
        if 'profiled-ll-files' not in dirs:
            continue

        # Its files are copied right here, so the walk does not need to
        # descend into the directory as well.
        dirs.remove('profiled-ll-files')

        # normpath drops a trailing separator, which would otherwise make
        # basename() return an empty parent name.
        parent_dir_name = os.path.basename(os.path.normpath(root))

        source_profiled_path = os.path.join(root, 'profiled-ll-files')
        target_subdir_path = os.path.join(target_top_level, parent_dir_name)
        os.makedirs(target_subdir_path, exist_ok=True)

        for file_name in os.listdir(source_profiled_path):
            source_file = os.path.join(source_profiled_path, file_name)
            if os.path.isfile(source_file):
                shutil.copy2(source_file, os.path.join(target_subdir_path, file_name))
                copied += 1

    return copied


if __name__ == "__main__":
    # Replace with your source and target top-level paths
    source_top_level = "/Pramana/IR2Vec/cofo"
    target_top_level = "/Pramana/IR2Vec/COFO-profiled-ll-files-17.x"

    copy_profiled_ll_files(source_top_level, target_top_level)
    print(f"Files copied successfully to {target_top_level}")
def delete_small_subfolders(top_level_dir, file_threshold=200, dry_run=False):
    """Delete the sub-folders of *top_level_dir* that hold too few files.

    Only regular files directly inside a sub-folder are counted. A sub-folder
    with fewer than *file_threshold* of them is removed together with
    everything it contains.

    Args:
        top_level_dir: Directory whose immediate sub-folders are inspected.
        file_threshold: Minimum number of files a sub-folder must contain to
            be kept.
        dry_run: When true, nothing is deleted; the sub-folders that would be
            removed are only reported.

    Returns:
        The sorted names of the sub-folders that were deleted (or would be,
        for a dry run); an empty list when *top_level_dir* does not exist.
    """
    if not os.path.exists(top_level_dir):
        print(f"The provided directory '{top_level_dir}' does not exist.")
        return []

    deleted = []
    # Iterate through all subfolders in the top-level directory
    for subfolder in sorted(os.listdir(top_level_dir)):
        subfolder_path = os.path.join(top_level_dir, subfolder)

        # Skip if it's not a directory
        if not os.path.isdir(subfolder_path):
            continue

        # Count the number of files in the subfolder
        file_count = sum(
            1 for f in os.listdir(subfolder_path)
            if os.path.isfile(os.path.join(subfolder_path, f))
        )
        if file_count >= file_threshold:
            continue

        if dry_run:
            print(f"Would delete '{subfolder}' as it contains fewer than {file_threshold} files.")
            deleted.append(subfolder)
            continue

        try:
            shutil.rmtree(subfolder_path)
        except OSError as e:
            # Keep going: one folder that cannot be removed should not stop the rest.
            print(f"Failed to delete '{subfolder}': {e}")
        else:
            print(f"Deleted '{subfolder}' as it contains fewer than {file_threshold} files.")
            deleted.append(subfolder)

    return deleted


if __name__ == "__main__":
    # Provide the path to the top-level directory
    top_level_dir = "/path/to/top_level_directory"  # Replace with your directory path
    file_threshold = 200
    delete_small_subfolders(top_level_dir, file_threshold)
def _folder_sort_key(name):
    """Sort key: numeric folder names by value first, other names alphabetically after."""
    try:
        return (0, int(name), "")
    except ValueError:
        return (1, 0, name)


def count_files_per_folder(base_path):
    """Count the regular files directly inside every sub-folder of *base_path*.

    Returns:
        A dict mapping folder name to file count, ordered numerically by
        folder name (names that are not numbers come last). It is empty when
        *base_path* has no sub-folder.
    """
    counts = {}
    for folder in os.listdir(base_path):
        folder_path = os.path.join(base_path, folder)
        if os.path.isdir(folder_path):  # Check if it is a folder
            counts[folder] = sum(
                1 for f in os.listdir(folder_path)
                if os.path.isfile(os.path.join(folder_path, f))
            )
    return {name: counts[name] for name in sorted(counts, key=_folder_sort_key)}


def main(base_path='/Pramana/IR2Vec/Program-Classification/datasets-profiled-llvm-17.x/poj-104-profiled-ll-files'):
    """Print the file count of every folder, the empty folders and the folder total."""
    counts = count_files_per_folder(base_path)

    # Print folders and their file counts
    print("Folders and their file counts:")
    for folder, count in counts.items():
        print(f"Folder name: {folder:>5}, No. of files: {count:>5}")

    # Print empty folders
    empty_folders = [folder for folder, count in counts.items() if count == 0]
    if empty_folders:
        print("\nFolders with zero files:")
        for folder in empty_folders:
            print(f"Folder name: {folder}")
    else:
        print("\nNo folders with zero files.")

    # Calculate the total number of folders
    print(f"\nTotal number of folders: {len(counts)}")


if __name__ == "__main__":
    main()
os.path.join(base_path, folder) + if os.path.isdir(folder_path): # Check if it is a folder + num_files = len([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]) + folder_names.append(folder) + file_counts.append(num_files) + +# Sort folders and file counts by folder name (numerically) +folder_names, file_counts = zip(*sorted(zip(folder_names, file_counts), key=lambda x: int(x[0]))) + +# Print the remaining folders and their file counts +print("\nRemaining folders and their file counts:") +for folder, count in zip(folder_names, file_counts): + print(f"Folder name: {folder:>5}, No. of files: {count:>5}") \ No newline at end of file diff --git a/scripts/preprocessing/generate-ir2vec-embeddings.py b/scripts/preprocessing/generate-ir2vec-embeddings.py new file mode 100644 index 0000000..72c6f72 --- /dev/null +++ b/scripts/preprocessing/generate-ir2vec-embeddings.py @@ -0,0 +1,84 @@ +import os +import ir2vec +import concurrent.futures +import gc + +def process_file(file_path, folder_name, encoding_type="sym", level="p", dim=300): + """ + Processes a single .ll file to generate its embedding. + """ + try: + # Initialize IR2Vec embedding + initObj = ir2vec.initEmbedding(file_path, encoding_type, level, dim) + + # Get the program-level vector representation + progVector = initObj.getProgramVector() + + # Prepare the output line: `label<\t>embedding_values` + output_line = f"{folder_name}\t" + "\t".join(map(str, progVector)) + "\n" + + # Explicitly clean up the embedding object to free memory + del initObj + return output_line + except Exception as e: + print(f"Error processing file {file_path}: {e}") + return None + +def process_folder_parallel(folder_path, folder_name, encoding_type="sym", level="p", dim=300): + """ + Processes all .ll files in a folder in parallel and returns the embeddings as lines. 
+ """ + lines = [] + file_paths = [ + os.path.join(folder_path, filename) + for filename in os.listdir(folder_path) + if filename.endswith(".ll") + ] + + # Process files in parallel using ThreadPoolExecutor or ProcessPoolExecutor + with concurrent.futures.ProcessPoolExecutor(max_workers=30) as executor: + futures = [ + executor.submit(process_file, file_path, folder_name, encoding_type, level, dim) + for file_path in file_paths + ] + for future in concurrent.futures.as_completed(futures): + result = future.result() + if result: + lines.append(result) + + return lines + +def generate_embeddings(input_folder, output_txt_path, encoding_type="sym", level="p", dim=300): + """ + Iterates over all folders to generate embeddings for .ll files, processing each folder one at a time. + """ + with open(output_txt_path, 'w') as output_file: + # Iterate over all folders + for i in range(1, 343): + folder_name = str(i) + folder_path = os.path.join(input_folder, folder_name) + + # Check if the folder exists + if os.path.isdir(folder_path): + print(f"Processing folder {folder_name}...") + + # Process all files in the folder in parallel + lines = process_folder_parallel(folder_path, folder_name, encoding_type, level, dim) + + # Write results to the output file + output_file.writelines(lines) + + # Force garbage collection to free memory after processing a folder + gc.collect() + + print(f"Embeddings for all files saved to {output_txt_path}.") + +# Specify the input folder and output text file path +input_folder = "/home/cs24mtech02001/Aayush-IR2Vec/datasets-17.x/codeforces/test" +output_txt_path = "/home/cs24mtech02001/Aayush-IR2Vec/datasets-17.x/codeforces/sym/codeforces-sym-test.txt" +encoding_type="sym" +level="p" +dim=300 + +# Generate embeddings for all .ll files across all folders and save them in the text file +generate_embeddings(input_folder, output_txt_path) \ No newline at end of file diff --git a/scripts/preprocessing/ir2vec-preprocess.py 
b/scripts/preprocessing/ir2vec-preprocess.py
new file mode 100644
index 0000000..ed67023
--- /dev/null
+++ b/scripts/preprocessing/ir2vec-preprocess.py
@@ -0,0 +1,109 @@
+# IITH-Compilers - Rohit Aggarwal, VenkataKeerthy
+#
+# Usage Instructions
+# python preprocess.py [options]
+# --data: Path of the data file
+#
+# Structure of the Input data
+# label<\t>vector_dim1<\t>vector_dim2<\t>.......<\t>vector_dimN
+#
+# For splitting the data:
+# python preprocess.py --data
+#
+#------------------------------------------------------------------------------------------#
+import argparse
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+import os
+
+# Load the data file
+def load_data(filepath):
+    """Read a tab-separated embedding file into (X, targetLabel).
+
+    Each data line holds an integer label followed by the vector values, separated by tabs.
+    X is a DataFrame with one row of floats per line; targetLabel lists the labels in file order.
+
+    The first line of the file is always skipped.
+    NOTE(review): this assumes a header line; if the input has none, the first sample
+    is silently dropped - confirm against the producer of the file.
+    """
+    rep = []
+    targetLabel = []
+    # `with` closes the handle; a bare open() inside a comprehension leaves it open
+    with open(filepath) as data_file:
+        next(data_file, None)  # skip the first line (no-op on an empty file)
+        for line in data_file:
+            line = line.strip('\n\t')
+            if not line:
+                continue  # tolerate blank lines, e.g. a trailing one at the end of the file
+            r = line.split('\t')
+            targetLabel.append(int(r[0]))
+            rep.append([float(val) for val in r[1:]])
+
+    X = pd.DataFrame(rep)
+
+    return X, targetLabel
+
+# Save the data to the file
+def saveToFile(X,Y,filepath):
+    """Write Y (labels) as the first column followed by the columns of X to filepath,
+    tab-separated, without header or index."""
+    X = pd.DataFrame(X)
+    Y = pd.DataFrame(Y)
+    temp = pd.concat([Y, X], axis=1)
+    temp.columns = range(temp.shape[1])
+    temp.to_csv(filepath,header=None,index=False,sep='\t')
+
+
+# Split the data into train, test and val
+def splitData(X, Y, args):
+    """Split (X, Y) 60/20/20 into train/test/val, stratified by label, and save each part.
+
+    training.csv, testing.csv and val.csv are written to a directory, relative to the
+    current working directory, named after the basename of args.data with ".txt" removed.
+    Because the split is stratified, train_test_split raises ValueError when a class has too few samples.
+    """
+    X = np.array(X)
+    Y = np.array(Y)
+    x_train, x_test, y_train, y_test = train_test_split(X,
+                                                        Y,
+                                                        train_size=0.6,
+                                                        test_size=0.4,
+                                                        random_state=123,
+                                                        stratify=Y)
+
+    # Halve the 40% hold-out into the final test and validation sets
+    x_test, x_val, y_test, y_val = train_test_split(x_test,
+                                                    y_test,
+                                                    train_size=0.5,
+                                                    test_size=0.5,
+                                                    random_state=123,
+                                                    stratify=y_test)
+
+    dirname = os.path.basename(args.data).replace(".txt","")
+    os.makedirs(dirname, exist_ok=True)
+    train_file_path=os.path.join(dirname,"training.csv")
+    saveToFile(x_train,y_train,train_file_path)
+    print('Training data created =====> {}.'.format(train_file_path))
+
+    test_file_path= os.path.join(dirname, "testing.csv")
+    saveToFile(x_test,y_test,test_file_path)
+    print('Testing data created =====> {}.'.format(test_file_path))
+
+    val_file_path= os.path.join(dirname, "val.csv")
+    saveToFile(x_val,y_val,val_file_path)
+    print('validation data created =====> {}.'.format(val_file_path))
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-d', '--data', dest='data', metavar='FILE', help='Path Of the Data/embedding file', required=True)
+    args = parser.parse_args()
+
+
+    X,Y = load_data(args.data)
+    print('Data loaded. Start the Splitting of the data....')
+    print(f"Loaded data: X.shape={len(X)}, Y.shape={len(Y)}")
+    splitData(X,Y, args)
\ No newline at end of file
diff --git a/scripts/preprocessing/merge-directories.py b/scripts/preprocessing/merge-directories.py
new file mode 100644
index 0000000..c80d6c3
--- /dev/null
+++ b/scripts/preprocessing/merge-directories.py
@@ -0,0 +1,52 @@
+import os
+import shutil
+
+def merge_directories(source_dirs, destination_dir):
+    """
+    Merges files from multiple source directories into the destination directory,
+    preserving the subdirectory structure and ensuring unique filenames.
+
+    Only files located directly inside a first-level subdirectory of a source directory
+    are copied; each copy is named SOURCEDIR_FILENAME, where SOURCEDIR is the source directory's basename.
+
+    Parameters:
+    - source_dirs: List of source directories to merge
+    - destination_dir: Path to the destination directory
+    """
+    # Create the destination directory if it doesn't exist
+    os.makedirs(destination_dir, exist_ok=True)
+
+    for source_dir in source_dirs:
+        # normpath drops a trailing slash, which would otherwise make basename() empty
+        source_label = os.path.basename(os.path.normpath(source_dir))
+
+        # Iterate through all subdirectories in the source directory
+        for subdir in os.listdir(source_dir):
+            source_subdir = os.path.join(source_dir, subdir)
+            # Skip plain files so that no empty destination folder is created for them
+            if not os.path.isdir(source_subdir):
+                continue
+
+            # Ensure the destination subdirectory exists
+            destination_subdir = os.path.join(destination_dir, subdir)
+            os.makedirs(destination_subdir, exist_ok=True)
+
+            # Copy files from the source subdirectory
+            for filename in os.listdir(source_subdir):
+                src_file = os.path.join(source_subdir, filename)
+                if os.path.isfile(src_file):
+                    # Prefix the source directory name to the filename for uniqueness
+                    unique_filename = f"{source_label}_{filename}"
+                    dst_file = os.path.join(destination_subdir, unique_filename)
+                    shutil.copy(src_file, dst_file)
+
+#
Usage example
+source_directories = [
+    '/Pramana/IR2Vec/datasets/CodeJam-data/code-jam-00-ll-files',
+    '/Pramana/IR2Vec/datasets/CodeJam-data/code-jam-01-ll-files',
+    '/Pramana/IR2Vec/datasets/CodeJam-data/code-jam-02-ll-files',
+    '/Pramana/IR2Vec/datasets/CodeJam-data/code-jam-03-ll-files'
+]
+destination_directory = '/Pramana/IR2Vec/datasets/CodeJam-data/llvm17-ll-files'
+
+merge_directories(source_directories, destination_directory)
\ No newline at end of file
diff --git a/scripts/preprocessing/rename-folders.py b/scripts/preprocessing/rename-folders.py
new file mode 100644
index 0000000..bb8c756
--- /dev/null
+++ b/scripts/preprocessing/rename-folders.py
@@ -0,0 +1,60 @@
+import os
+
+def rename_folders_sequentially(directory_path):
+    """
+    Renames every subfolder of directory_path to "1", "2", ... following the
+    alphabetical order of the current names, and prints each rename.
+
+    Folders first get temporary names so that a new name can never collide with a
+    folder that has not been renamed yet. OS errors are printed, not raised.
+
+    NOTE(review): the order is plain string order, so "10" sorts before "2";
+    confirm this is intended when the existing names are numeric.
+    """
+    try:
+        entries = os.listdir(directory_path)
+
+        # Get a sorted (alphabetical) list of all subfolders in the directory
+        subfolders = sorted(
+            entry for entry in entries if os.path.isdir(os.path.join(directory_path, entry))
+        )
+
+        # Print the total count of subfolders
+        total_folders = len(subfolders)
+        print(f"Total number of subfolders: {total_folders}")
+
+        # Refuse to start if a target name is taken by a non-folder entry: failing on it
+        # halfway through would leave the directory partially renamed
+        other_entries = set(entries) - set(subfolders)
+        blocked = [str(i) for i in range(1, total_folders + 1) if str(i) in other_entries]
+        if blocked:
+            print(f"An error occurred: target names already used by non-folder entries: {blocked}")
+            return
+
+        # Choose a temporary prefix that no existing entry starts with, so a temporary
+        # name can never hit an existing entry (e.g. a folder literally called "temp_1")
+        temp_prefix = "temp_"
+        while any(entry.startswith(temp_prefix) for entry in entries):
+            temp_prefix = "_" + temp_prefix
+
+        # Step 1: Rename all folders to temporary names to avoid conflicts
+        temp_names = {}
+        for index, folder in enumerate(subfolders, start=1):
+            temp_name = f"{temp_prefix}{index}"
+            os.rename(os.path.join(directory_path, folder), os.path.join(directory_path, temp_name))
+            temp_names[temp_name] = folder  # Track the original name
+
+        # Step 2: Rename temporary names to sequential numbers
+        for index, (temp_name, original_name) in enumerate(temp_names.items(), start=1):
+            new_folder_name = str(index)
+            os.rename(os.path.join(directory_path, temp_name), os.path.join(directory_path, new_folder_name))
+            print(f"Renamed: {original_name} -> {new_folder_name}")
+
+        print("Renaming completed successfully.")
+
+    except OSError as e:
+        print(f"An error occurred: {e}")
+
+# Replace 'your_directory_path'
with the actual path to the top-level directory
+directory_path = "/Pramana/IR2Vec/yali/codeforces/test"
+rename_folders_sequentially(directory_path)
\ No newline at end of file
diff --git a/scripts/preprocessing/split-dataset.py b/scripts/preprocessing/split-dataset.py
new file mode 100644
index 0000000..0bc053d
--- /dev/null
+++ b/scripts/preprocessing/split-dataset.py
@@ -0,0 +1,84 @@
+import os
+import shutil
+from sklearn.model_selection import train_test_split
+
+def split_dataset(source_directory, train_directory, test_directory, val_directory, train_ratio=0.6, test_ratio=0.2, min_files=200):
+    """
+    Splits the dataset in the source directory into train, test, and validation sets.
+
+    Files are copied (the source directory is left untouched). The validation set
+    receives the remaining 1 - train_ratio - test_ratio share of each class.
+
+    Parameters:
+    source_directory: Path to the source directory containing class subdirectories.
+    train_directory: Path to the directory where the training set will be stored.
+    test_directory: Path to the directory where the test set will be stored.
+    val_directory: Path to the directory where the validation set will be stored.
+    train_ratio: Proportion of data to allocate to the training set.
+    test_ratio: Proportion of data to allocate to the test set.
+    min_files: Class subdirectories with fewer files than this are skipped.
+
+    Raises:
+    ValueError: If the ratios are not positive or leave no share for the validation set.
+    """
+    if not (train_ratio > 0 and test_ratio > 0 and train_ratio + test_ratio < 1):
+        raise ValueError("train_ratio and test_ratio must be positive and sum to less than 1")
+
+    for sub_dir in os.listdir(source_directory):
+        sub_dir_path = os.path.join(source_directory, sub_dir)
+
+        # Ensure it is a directory
+        if not os.path.isdir(sub_dir_path):
+            continue
+
+        # Get the list of files in the subdirectory; sorted because os.listdir order is
+        # arbitrary and would otherwise defeat the fixed random_state below
+        files = sorted(f for f in os.listdir(sub_dir_path) if os.path.isfile(os.path.join(sub_dir_path, f)))
+        num_files = len(files)
+
+        # Skip subdirectories with fewer than min_files files
+        if num_files < min_files:
+            print(f"Skipping subdirectory with less than {min_files} files: {sub_dir_path} ({num_files} files)")
+            continue
+
+        print(f"Processing subdirectory: {sub_dir_path}")
+
+        # Split files into train and temp (test + validation)
+        train_files, temp_files = train_test_split(files, test_size=(1 - train_ratio), random_state=42)
+
+        # Split temp into test and validation. train_size is the share of temp that goes to
+        # the FIRST returned list (test). Passing this share as test_size would hand it to
+        # the second list (validation) instead, which is only harmless when both shares are equal.
+        test_files, val_files = train_test_split(temp_files, train_size=(test_ratio / (1 - train_ratio)), random_state=42)
+
+        # Create destination subdirectories if they don't exist
+        train_sub_dir = os.path.join(train_directory, sub_dir)
+        test_sub_dir = os.path.join(test_directory, sub_dir)
+        val_sub_dir = os.path.join(val_directory, sub_dir)
+
+        os.makedirs(train_sub_dir, exist_ok=True)
+        os.makedirs(test_sub_dir, exist_ok=True)
+        os.makedirs(val_sub_dir, exist_ok=True)
+
+        # Copy files to the respective directories
+        for file in train_files:
+            shutil.copy(os.path.join(sub_dir_path, file), os.path.join(train_sub_dir, file))
+
+        for file in test_files:
+            shutil.copy(os.path.join(sub_dir_path, file), os.path.join(test_sub_dir, file))
+
+        for file in val_files:
+            shutil.copy(os.path.join(sub_dir_path, file), os.path.join(val_sub_dir, file))
+
+        print(f"Dataset split completed for {sub_dir_path}. "
+              f"{len(train_files)} files in train, {len(test_files)} files in test, {len(val_files)} files in validation.")
+
+if __name__ == "__main__":
+    # Hard-code the directories here
+    source_directory = "/Pramana/IR2Vec/dataset-opt-levels/codejam/O0"
+    # source_directory = "/Pramana/IR2Vec/test"
+    train_directory = "/home/cs24mtech02001/Aayush-IR2Vec/datasets-17.x/codejam/train"
+    test_directory = "/home/cs24mtech02001/Aayush-IR2Vec/datasets-17.x/codejam/test"
+    val_directory = "/home/cs24mtech02001/Aayush-IR2Vec/datasets-17.x/codejam/val"
+
+    split_dataset(source_directory, train_directory, test_directory, val_directory, train_ratio=0.6, test_ratio=0.2)
\ No newline at end of file
diff --git a/scripts/preprocessing/train-test-val-to-csv.py b/scripts/preprocessing/train-test-val-to-csv.py
new file mode 100644
index 0000000..92d88c4
--- /dev/null
+++ b/scripts/preprocessing/train-test-val-to-csv.py
@@ -0,0 +1,69 @@
+# Converts separate train, test and val embedding files into CSV files
+import argparse
+import pandas as pd
+import numpy as np
+import os
+from sklearn.model_selection import train_test_split
+from collections import Counter
+
+def load_data(filepath):
+    """Read a tab-separated embedding file into (X, targetLabel).
+
+    Each data line holds an integer label followed by the vector values, separated by tabs.
+    X is a DataFrame with one row of floats per line; targetLabel lists the labels in file order.
+
+    The first line of the file is always skipped.
+    NOTE(review): this assumes a header line; if the input has none, the first sample
+    is silently dropped - confirm against the producer of the file.
+    """
+    rep, targetLabel = [], []
+    # `with` closes the handle; a bare open() inside a comprehension leaves it open
+    with open(filepath) as data_file:
+        next(data_file, None)  # skip the first line (no-op on an empty file)
+        for line in data_file:
+            line = line.strip('\n\t')
+            if not line:
+                continue  # tolerate blank lines, e.g. a trailing one at the end of the file
+            r = line.split('\t')
+            targetLabel.append(int(r[0]))
+            rep.append([float(val) for val in r[1:]])
+    X = pd.DataFrame(rep)
+    return X, targetLabel
+
+def save_to_file(X, Y, filepath):
+    """Write Y (labels) as the first column followed by the columns of X to filepath,
+    tab-separated, without header or index."""
+    X = pd.DataFrame(X)
+    Y = pd.DataFrame(Y)
+    temp = pd.concat([Y, X], axis=1)
+    temp.columns = range(temp.shape[1])
+    temp.to_csv(filepath, header=None, index=False, sep='\t')
+
+def process_and_save(data_path, output_path, filename):
+    """Convert the embedding file data_path into output_path/filename; print a warning and do nothing if data_path is missing."""
+    if not os.path.exists(data_path):
+        print(f"Warning: Data file not found at {data_path}")
+        return
+
+    X, Y = load_data(data_path)
+    print(f"Loaded data from {data_path}: X.shape={len(X)}, Y.shape={len(Y)}")
+    output_file = os.path.join(output_path, filename)
+    save_to_file(X, Y, output_file)
+    print(f"Data saved to {output_file}")
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--train', required=True, help='Path to the training data file')
+    parser.add_argument('--test', required=True, help='Path to the testing data file')
+    parser.add_argument('--val', required=True, help='Path to the validation data file')
+    parser.add_argument('--output', required=True, help='Output directory for processed CSV files')
+
+    args = parser.parse_args()
+
+    os.makedirs(args.output, exist_ok=True)
+
+    process_and_save(args.train, args.output, 'training.csv')
+    process_and_save(args.test, args.output, 'testing.csv')
+    process_and_save(args.val, args.output, 'val.csv')
+
+# python train-test-val-to-csv.py --train path/to/train.txt --test path/to/test.txt --val path/to/val.txt --output path/to/output
\ No newline at end of file
diff --git a/scripts/profiling/generate-input-folder-with-input-files.py b/scripts/profiling/generate-input-folder-with-input-files.py
new file mode 100644
index 0000000..af5b51d
--- /dev/null
+++ b/scripts/profiling/generate-input-folder-with-input-files.py
@@ -0,0 +1,92 @@
+import subprocess
+import os
+import re
+
+def parse_testcases(testcases_path):
+    """Parse test cases from testcases.txt.
+
+    Returns a list of (input, expected_output) string pairs, one for every "Test: #N," section
+    that contains both an Input ... Output and an Output ... Answer part; other sections are ignored.
+    """
+    print(f"\nEntering into parse_testcases function\n")
+    with open(testcases_path, 'r') as file:
+        content = file.read()
+
+    testcases = []
+    tests = re.split(r"Test: #[0-9]+,", content)
+    for test in tests[1:]:
+        input_match = re.search(r"Input\n([\s\S]*?)Output", test)
+        output_match = re.search(r"Output\n([\s\S]*?)Answer", test)
+
+        if input_match and output_match:
+            test_input = input_match.group(1).strip()
+            expected_output = output_match.group(1).strip()
+            testcases.append((test_input, expected_output))
+    return testcases
+
+def create_input_files(folder_path, testcases):
+    """Create input files for each test case.
+
+    Writes testcases/input1.txt, testcases/input2.txt, ... under folder_path and returns their paths.
+    """
+    print(f"\nEntering into create_input_files function\n")
+    testcases_folder = os.path.join(folder_path, "testcases")
+    os.makedirs(testcases_folder, exist_ok=True)
+
+    input_files = []
+    for i, (test_input, _) in enumerate(testcases, start=1):
+        input_file = os.path.join(testcases_folder, f"input{i}.txt")
+        with open(input_file, 'w') as f:
+            f.write(test_input)
+        input_files.append(input_file)
+
+    return input_files
+
+def process_subfolder(folder_path):
+    """Processes a single folder to generate input files.
+
+    Returns the list of created input files (empty when testcases.txt is missing).
+    """
+    # Locate all C/C++ files and testcases.txt
+    testcases_file = None
+    file_count = 0
+    for filename in os.listdir(folder_path):
+        if filename.endswith((".c", ".cpp")):
+            file_count += 1
+        elif filename == "testcases.txt":
+            testcases_file = os.path.join(folder_path, filename)
+
+    if not testcases_file:
+        print(f"testcases.txt not found in {folder_path}.")
+        return []
+    print("-"*20)
+    print(f"\nTotal number of files in the current folder --> {folder_path} is {file_count}\n")
+    print("-" * 20)
+
+    # Parse test cases
+    testcases = parse_testcases(testcases_file)
+
+    # Create input files for the test cases
+    return create_input_files(folder_path, testcases)
+
+def main(top_level_directory):
+    """Main function to process all subdirectories."""
+    for root, dirs, files in os.walk(top_level_directory):
+        # Drop generated "testcases" folders in place: os.walk then neither reports nor
+        # descends into them. Returning from main() on the first one met (the previous
+        # behavior) aborted the walk for all remaining folders.
+        dirs[:] = [d for d in dirs if d != 'testcases']
+        for subdir in dirs:
+            folder_path = os.path.join(root, subdir)
+            print(f"Subdirectory Path: {folder_path}\n")
+            print(f"Entering into process_folder function\n")
+            process_subfolder(folder_path)
+            print("\n")
+            print ("-" * 20)
+            print(f"Processed subdirectory --> {subdir}")
+            print("-" * 20)
+
+if
__name__ == "__main__":
+    # Specify the top-level output directory containing subfolders
+    top_level_dir = "/Pramana/IR2Vec/codeforces-dataset-with-tc"
+    main(top_level_dir)
\ No newline at end of file
diff --git a/scripts/profiling/generate-profiled-ll-files-with-testcases-using-cores.sh b/scripts/profiling/generate-profiled-ll-files-with-testcases-using-cores.sh
new file mode 100644
index 0000000..7c5f373
--- /dev/null
+++ b/scripts/profiling/generate-profiled-ll-files-with-testcases-using-cores.sh
@@ -0,0 +1,133 @@
+#!/bin/bash
+
+# Builds profile-guided LLVM IR for every C/C++ source in each immediate subfolder of BASE_DIR.
+# Usage: generate-profiled-ll-files-with-testcases-using-cores.sh BASE_DIR NUM_CORES
+
+# Define the directory containing subfolders with source files and test cases
+BASE_DIR="$1"
+NUM_CORES="$2" # Number of cores to use for parallelism
+
+# Refuse to run without valid arguments: with an empty BASE_DIR the loop below would expand
+# to /*/ and its rm -rf / mkdir calls would operate inside top-level system directories.
+if [[ ! -d "$BASE_DIR" ]] || ! [[ "$NUM_CORES" =~ ^[1-9][0-9]*$ ]]; then
+    echo "Usage: $0 BASE_DIR NUM_CORES" >&2
+    exit 1
+fi
+
+# Compiler and flags for coverage instrumentation
+# /home/cs24mtech02001/Aayush-IR2Vec/llvm-project/build-llvm17/bin/clang-17
+# COMPILER="clang++"
+COMPILER=/home/cs24mtech02001/LLVM/llvm-project/build-llvm17/bin/clang++
+# STD="-std=c++17"
+PROFILING_FLAGS="-fprofile-generate"
+OPT=-O1
+OPTIMIZED_FLAGS="-fprofile-instr-use"
+# PROFILER="/home/cs24mtech02001/LLVM/llvm-project/build-llvm17/bin/llvm-profdata"
+
+# Function to handle individual source files
+process_file()
+{
+    SUBDIR="$1"
+    SRC="$2"
+
+    BASE_NAME=$(basename "$SRC" | sed 's/\.[^.]*$//')
+    OUT_DIR="$SUBDIR/executables"
+    PROF_DIR="$SUBDIR/profiles"
+    LL_DIR="$SUBDIR/profiled-ll-files"
+
+    EXECUTABLE="$OUT_DIR/${BASE_NAME}.out"
+
+    echo
+    echo "Compiling $SRC with profiling flags..."
+    # $COMPILER $STD $PROFILING_FLAGS "$SRC" -o "$EXECUTABLE"
+    $COMPILER $OPT $PROFILING_FLAGS "$SRC" -o "$EXECUTABLE"
+
+    if [[ $? -ne 0 ]]; then
+        echo "Compilation failed for $SRC"
+        return
+    fi
+
+    # Create a profile subfolder for the current source file
+    SRC_PROF_DIR="$PROF_DIR/$BASE_NAME"
+    mkdir -p "$SRC_PROF_DIR"
+
+    # Step 2: Run the executable with each test case in the testcases folder
+    TESTCASE_DIR="$SUBDIR/testcases"
+    if [[ -d "$TESTCASE_DIR" ]]; then
+        for INPUT_FILE in "$TESTCASE_DIR"/*.txt; do
+            if [[ -f "$INPUT_FILE" ]]; then
+                echo "Running $EXECUTABLE with input $INPUT_FILE..."
+
+                PROFILE_FILE="$SRC_PROF_DIR/$(basename "$INPUT_FILE" .txt).profraw"
+
+                # Use timeout to enforce a time limit of 5 seconds. The program is run directly
+                # (not through a bash -c string) so unusual characters in paths cannot be
+                # re-interpreted by a shell.
+                LLVM_PROFILE_FILE="$PROFILE_FILE" timeout 5s "$EXECUTABLE" < "$INPUT_FILE" > /dev/null 2>&1
+
+                EXIT_CODE=$?
+                if [[ $EXIT_CODE -eq 124 ]]; then
+                    echo "Skipping input file $INPUT_FILE (execution time exceeded 5 seconds)."
+                    continue
+                elif [[ $EXIT_CODE -ne 0 ]]; then
+                    echo "Execution failed for $EXECUTABLE with input $INPUT_FILE"
+                    continue
+                fi
+            fi
+        done
+    else
+        echo "No testcases folder found in $SUBDIR"
+    fi
+
+    # Step 3: Merge raw profiles into a single profile data file
+    MERGED_PROFILE="$SRC_PROF_DIR/${BASE_NAME}.profdata"
+    echo
+    echo "Merging raw profiles for $BASE_NAME..."
+    llvm-profdata-17 merge -output="$MERGED_PROFILE" "$SRC_PROF_DIR"/*.profraw
+    # $PROFILER merge -output="$MERGED_PROFILE" "$SRC_PROF_DIR"/*.profraw
+
+    if [[ $? -ne 0 ]]; then
+        echo "Failed to merge profiles for $BASE_NAME."
+        return
+    fi
+
+    # Step 4: Generate profiled LLVM IR files
+    PROFILED_LL_FILE="$LL_DIR/${BASE_NAME}.ll"
+    echo "Generating profiled LLVM IR for $BASE_NAME..."
+    # $COMPILER $STD $OPT $OPTIMIZED_FLAGS="$MERGED_PROFILE" "$SRC" -S -emit-llvm -o "$PROFILED_LL_FILE"
+    $COMPILER $OPT $OPTIMIZED_FLAGS="$MERGED_PROFILE" "$SRC" -S -emit-llvm -o "$PROFILED_LL_FILE"
+
+    if [[ $? -ne 0 ]]; then
+        echo "Failed to generate LLVM IR for $BASE_NAME."
+        return
+    fi
+}
+
+export -f process_file # Export the function for parallel processing
+# export COMPILER STD OPT PROFILING_FLAGS OPTIMIZED_FLAGS # Export variables for use in subshells
+export COMPILER OPT PROFILING_FLAGS OPTIMIZED_FLAGS # Export variables for use in subshells
+
+# Step 1: Iterate through subdirectories and process source files in parallel
+for SUBDIR in "$BASE_DIR"/*/; do
+    # An unmatched glob stays literal; skip it instead of creating a folder named '*'
+    [[ -d "$SUBDIR" ]] || continue
+
+    echo "*****************************"
+    echo "Processing directory: $SUBDIR"
+    echo "*****************************"
+
+    # Create directories for outputs and profiles
+    OUT_DIR="$SUBDIR/executables"
+    PROF_DIR="$SUBDIR/profiles"
+    LL_DIR="$SUBDIR/profiled-ll-files"
+
+    # Delete the existing subdirectories if already present
+    rm -rf "$OUT_DIR" "$PROF_DIR" "$LL_DIR"
+    mkdir -p "$OUT_DIR" "$PROF_DIR" "$LL_DIR"
+
+    # Find source files and process them in parallel (NUL-delimited, so any filename is safe)
+    find "$SUBDIR" -maxdepth 1 -type f \( -name "*.c" -o -name "*.cpp" \) -print0 | \
+        parallel -0 -j "$NUM_CORES" process_file "$SUBDIR" {}
+done
+
+echo "All operations completed successfully."
\ No newline at end of file
diff --git a/scripts/profiling/profiling-without-parallel.sh b/scripts/profiling/profiling-without-parallel.sh
new file mode 100644
index 0000000..1124d88
--- /dev/null
+++ b/scripts/profiling/profiling-without-parallel.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+
+# Profile every C/C++ source file found directly inside each subfolder of
+# BASE_DIR: build it instrumented, run it on the subfolder's test cases,
+# merge the raw profiles and emit profile-guided LLVM IR.
+# Source files are processed as background jobs in batches of NUM_CORES.
+#
+# Usage: profiling-without-parallel.sh BASE_DIR [NUM_CORES]
+
+# Define the directory containing subfolders with source files and test cases
+BASE_DIR="$1"
+NUM_CORES="${2:-1}" # Number of background jobs per batch (default: 1)
+
+# An empty BASE_DIR would make the loop below expand to /*/ and run "rm -rf"
+# inside every top-level directory, so refuse to start without a real one.
+if [[ -z "$BASE_DIR" || ! -d "$BASE_DIR" ]]; then
+    echo "Usage: $0 BASE_DIR [NUM_CORES]" >&2
+    exit 1
+fi
+if ! [[ "$NUM_CORES" =~ ^[0-9]+$ ]]; then
+    echo "Error: NUM_CORES must be a non-negative integer, got '$NUM_CORES'." >&2
+    exit 1
+fi
+
+# Compiler and flags for coverage instrumentation
+COMPILER="/home/cs24mtech02001/Aayush-IR2Vec/llvm-project/build-llvm17/bin/clang++"
+# STD="-std=c++17"
+OPT="-O1"
+PROFILING_FLAGS="-fprofile-generate"
+OPTIMIZED_FLAGS="-fprofile-instr-use"
+
+# Function to handle individual source files
+# Arguments: $1 = subfolder being processed, $2 = path of the source file
+process_file() {
+    local SUBDIR="$1"
+    local SRC="$2"
+
+    # Source file name without its directory and last extension
+    local BASE_NAME
+    BASE_NAME=$(basename "$SRC")
+    BASE_NAME="${BASE_NAME%.*}"
+    local OUT_DIR="$SUBDIR/executables"
+    local PROF_DIR="$SUBDIR/profiles"
+    local LL_DIR="$SUBDIR/profiled-ll-files"
+
+    local EXECUTABLE="$OUT_DIR/${BASE_NAME}.out"
+
+    echo
+    echo "Compiling $SRC with profiling flags..."
+    # Flag variables are left unquoted on purpose so they split into words
+    if ! $COMPILER $OPT $PROFILING_FLAGS "$SRC" -o "$EXECUTABLE"; then
+        echo "Compilation failed for $SRC"
+        return
+    fi
+
+    # Create a profile subfolder for the current source file
+    local SRC_PROF_DIR="$PROF_DIR/$BASE_NAME"
+    mkdir -p "$SRC_PROF_DIR"
+
+    # Step 2: Run the executable with each test case in the testcases folder
+    local TESTCASE_DIR="$SUBDIR/testcases"
+    local INPUT_FILE PROFILE_FILE EXIT_CODE
+    if [[ -d "$TESTCASE_DIR" ]]; then
+        for INPUT_FILE in "$TESTCASE_DIR"/*.txt; do
+            # Skips the literal pattern left behind when no *.txt file exists
+            [[ -f "$INPUT_FILE" ]] || continue
+            echo "Running $EXECUTABLE with input $INPUT_FILE..."
+
+            PROFILE_FILE="$SRC_PROF_DIR/$(basename "$INPUT_FILE" .txt).profraw"
+
+            # Use timeout to enforce a time limit of 3 seconds. The program is
+            # started directly instead of through a "bash -c" string, so paths
+            # containing quotes or "$" cannot alter the command that is run.
+            LLVM_PROFILE_FILE="$PROFILE_FILE" timeout 3s "$EXECUTABLE" \
+                < "$INPUT_FILE" > /dev/null 2>&1
+            EXIT_CODE=$?
+
+            if [[ $EXIT_CODE -eq 124 ]]; then
+                echo "Skipping input file $INPUT_FILE (execution time exceeded 3 seconds)."
+            elif [[ $EXIT_CODE -ne 0 ]]; then
+                echo "Execution failed for $EXECUTABLE with input $INPUT_FILE"
+            fi
+        done
+    else
+        echo "No testcases folder found in $SUBDIR"
+    fi
+
+    # Step 3: Merge raw profiles into a single profile data file
+    local RAW_PROFILES=("$SRC_PROF_DIR"/*.profraw)
+    if [[ ! -e "${RAW_PROFILES[0]}" ]]; then
+        # The pattern matched nothing, so there is nothing to merge
+        echo "No raw profiles found for $BASE_NAME; skipping."
+        return
+    fi
+    local MERGED_PROFILE="$SRC_PROF_DIR/${BASE_NAME}.profdata"
+    echo
+    echo "Merging raw profiles for $BASE_NAME..."
+    if ! llvm-profdata merge -output="$MERGED_PROFILE" "${RAW_PROFILES[@]}"; then
+        echo "Failed to merge profiles for $BASE_NAME."
+        return
+    fi
+
+    # Step 4: Generate profiled LLVM IR files
+    local PROFILED_LL_FILE="$LL_DIR/${BASE_NAME}.ll"
+    echo "Generating profiled LLVM IR for $BASE_NAME..."
+    if ! $COMPILER $OPT $OPTIMIZED_FLAGS="$MERGED_PROFILE" "$SRC" -S -emit-llvm -o "$PROFILED_LL_FILE"; then
+        echo "Failed to generate LLVM IR for $BASE_NAME."
+        return
+    fi
+}
+
+export -f process_file # Export the function for subshells
+export COMPILER OPT PROFILING_FLAGS OPTIMIZED_FLAGS # Export variables for use in subshells
+
+# Step 1: Iterate through subdirectories and process source files
+for SUBDIR in "$BASE_DIR"/*/; do
+    # Without any subfolder the pattern stays unexpanded; do not act on it
+    [[ -d "$SUBDIR" ]] || continue
+
+    echo "*****************************"
+    echo "Processing directory: $SUBDIR"
+    echo "*****************************"
+
+    # Create directories for outputs and profiles
+    OUT_DIR="$SUBDIR/executables"
+    PROF_DIR="$SUBDIR/profiles"
+    LL_DIR="$SUBDIR/profiled-ll-files"
+
+    # Delete the existing subdirectories if already present
+    rm -rf "$OUT_DIR" "$PROF_DIR" "$LL_DIR"
+    mkdir -p "$OUT_DIR" "$PROF_DIR" "$LL_DIR"
+
+    # Initialize job counter
+    job_count=0
+
+    # Find source files and process them
+    for SRC in "$SUBDIR"*.c "$SUBDIR"*.cpp; do
+        if [[ -f "$SRC" ]]; then
+            process_file "$SUBDIR" "$SRC" & # Run in background
+            ((job_count++))
+
+            # Wait if the number of background jobs reaches NUM_CORES
+            if ((job_count >= NUM_CORES)); then
+                wait
+                job_count=0
+            fi
+        fi
+    done
+
+    # Wait for remaining background jobs to complete
+    wait
+done
+
+echo "All operations completed successfully."
\ No newline at end of file
diff --git a/scripts/profiling/profiling-without-testcases.sh b/scripts/profiling/profiling-without-testcases.sh
new file mode 100644
index 0000000..be0b2cc
--- /dev/null
+++ b/scripts/profiling/profiling-without-testcases.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+
+# Profile every C/C++ source file found directly inside each subfolder of
+# BASE_DIR without test inputs: build it instrumented, run it once, merge the
+# raw profile and emit profile-guided LLVM IR. Files are processed with GNU
+# parallel, CORE_COUNT jobs at a time.
+#
+# Usage: profiling-without-testcases.sh BASE_DIR [CORE_COUNT]
+
+# Define the directory containing subfolders with source files
+BASE_DIR="$1"
+CORE_COUNT="${2:-1}" # Number of parallel jobs (default: 1)
+
+# An empty BASE_DIR would make the loop below expand to /*/ and run "rm -rf"
+# inside every top-level directory, so refuse to start without a real one.
+if [[ -z "$BASE_DIR" || ! -d "$BASE_DIR" ]]; then
+    echo "Usage: $0 BASE_DIR [CORE_COUNT]" >&2
+    exit 1
+fi
+if ! [[ "$CORE_COUNT" =~ ^[0-9]+$ ]]; then
+    echo "Error: CORE_COUNT must be a non-negative integer, got '$CORE_COUNT'." >&2
+    exit 1
+fi
+
+# Compiler and flags for instrumentation
+COMPILER=/home/cs24mtech02001/LLVM/llvm-project/build-llvm17/bin/clang++
+# STD="-std=c++17"
+PROFILING_FLAGS="-fprofile-generate"
+OPT="-O1"
+OPTIMIZED_FLAGS="-fprofile-instr-use"
+
+# Function to handle individual source files
+# Arguments: $1 = subfolder, $2 = source file, $3 = executables directory,
+#            $4 = profiles directory, $5 = LLVM IR output directory
+process_source_file()
+{
+    local SUBDIR="$1"
+    local SRC="$2"
+    local OUT_DIR="$3"
+    local PROF_DIR="$4"
+    local LL_DIR="$5"
+
+    # Source file name without its directory and last extension
+    local BASE_NAME
+    BASE_NAME=$(basename "$SRC")
+    BASE_NAME="${BASE_NAME%.*}"
+    local EXECUTABLE="$OUT_DIR/${BASE_NAME}.out"
+
+    echo
+    echo "Compiling $SRC with profiling flags..."
+    # $COMPILER $STD $PROFILING_FLAGS "$SRC" -o "$EXECUTABLE"
+    # Flag variables are left unquoted on purpose so they split into words
+    if ! $COMPILER $OPT $PROFILING_FLAGS "$SRC" -o "$EXECUTABLE"; then
+        echo "Compilation failed for $SRC"
+        return
+    fi
+
+    # Run the executable to generate the .profraw file
+    local SRC_PROF_DIR="$PROF_DIR/$BASE_NAME"
+    mkdir -p "$SRC_PROF_DIR"
+    local PROFILE_FILE="$SRC_PROF_DIR/${BASE_NAME}.profraw"
+
+    echo
+    echo "Running $EXECUTABLE to generate profile data..."
+    # The program is started directly instead of through a "bash -c" string,
+    # so paths containing quotes or "$" cannot alter the command that is run.
+    LLVM_PROFILE_FILE="$PROFILE_FILE" timeout 5s "$EXECUTABLE" > /dev/null 2>&1
+    local EXIT_CODE=$?
+
+    if [[ $EXIT_CODE -eq 124 ]]; then
+        echo "Skipping the file (execution time exceeded 5 seconds)."
+        return
+    elif [[ $EXIT_CODE -ne 0 ]]; then
+        # No input file exists in this script, so none is named here
+        echo "Execution failed for $EXECUTABLE (exit code $EXIT_CODE)"
+        return
+    fi
+
+    # Merge the raw profile into a single profile data file
+    local MERGED_PROFILE="$SRC_PROF_DIR/${BASE_NAME}.profdata"
+    echo
+    echo "Merging raw profile for $BASE_NAME..."
+    if ! llvm-profdata-17 merge -output="$MERGED_PROFILE" "$SRC_PROF_DIR"/*.profraw; then
+        echo "Failed to merge profiles for $BASE_NAME."
+        return
+    fi
+
+    # Generate profiled LLVM IR files
+    local PROFILED_LL_FILE="$LL_DIR/${BASE_NAME}.ll"
+    echo "Generating profiled LLVM IR for $BASE_NAME..."
+    # $COMPILER $STD $OPT $OPTIMIZED_FLAGS="$MERGED_PROFILE" "$SRC" -S -emit-llvm -o "$PROFILED_LL_FILE"
+    if ! $COMPILER $OPT $OPTIMIZED_FLAGS="$MERGED_PROFILE" "$SRC" -S -emit-llvm -o "$PROFILED_LL_FILE"; then
+        echo "Failed to generate LLVM IR for $BASE_NAME."
+        return
+    fi
+}
+
+# Export the function for parallel execution
+export -f process_source_file
+# export COMPILER STD OPT PROFILING_FLAGS OPTIMIZED_FLAGS
+export COMPILER OPT PROFILING_FLAGS OPTIMIZED_FLAGS
+
+# Step 1: Iterate through subdirectories and process source files in parallel
+for SUBDIR in "$BASE_DIR"/*/; do
+    # Without any subfolder the pattern stays unexpanded; do not act on it
+    [[ -d "$SUBDIR" ]] || continue
+
+    echo "*****************************"
+    echo "Processing directory: $SUBDIR"
+    echo "*****************************"
+
+    # Create directories for outputs and profiles
+    OUT_DIR="$SUBDIR/executables"
+    PROF_DIR="$SUBDIR/profiles"
+    LL_DIR="$SUBDIR/profiled-ll-files"
+
+    # Delete the existing subdirectories if already present
+    rm -rf "$OUT_DIR" "$PROF_DIR" "$LL_DIR"
+
+    mkdir -p "$OUT_DIR" "$PROF_DIR" "$LL_DIR"
+
+    # Find source files and process them in parallel using parallel
+    find "$SUBDIR" -maxdepth 1 -type f \( -name "*.c" -o -name "*.cpp" \) | \
+        parallel -j "$CORE_COUNT" process_source_file "$SUBDIR" {} "$OUT_DIR" "$PROF_DIR" "$LL_DIR"
+
+done
+
+echo "All operations completed successfully."
\ No newline at end of file
diff --git a/scripts/read-npz.files.py b/scripts/read-npz.files.py
new file mode 100644
index 0000000..db5fa71
--- /dev/null
+++ b/scripts/read-npz.files.py
@@ -0,0 +1,21 @@
+"""Print the name and contents of every array stored in an .npz archive."""
+import sys
+
+from numpy import load
+
+# Archive inspected when no path is given on the command line.
+DEFAULT_NPZ_PATH = '/home/aayusphere/Embeddings/poj/milepost/trainO0/1/1-14.npz'
+
+
+def main(path=DEFAULT_NPZ_PATH):
+    """Print each array name in the archive at *path*, followed by the array."""
+    # load() returns an NpzFile that keeps the archive open; the context
+    # manager closes it once every array has been printed.
+    with load(path) as data:
+        for item in data.files:
+            print(item)
+            print(data[item])
+
+
+if __name__ == "__main__":
+    main(*sys.argv[1:2])
\ No newline at end of file
From a6e408a08a468fe7986e0a188f2d8e5318bf7f6c Mon Sep 17 00:00:00 2001
From: iamaayushrivastava
Date: Fri, 10 Jan 2025 17:45:34 +0530
Subject: [PATCH 2/3] FlowAware.cpp

---
 code/FlowAware.cpp | 2320 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 2320 insertions(+)
 create mode 100755 code/FlowAware.cpp

diff --git a/code/FlowAware.cpp b/code/FlowAware.cpp
new file mode 100755
index 0000000..fdd9f14
--- /dev/null
+++ b/code/FlowAware.cpp
@@ -0,0 +1,2320 @@
+#include "FlowAware.h"
+#include "VectorSolver.h"
+
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CallGraph.h"
+
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Passes/PassPlugin.h"
+#include "llvm/InitializePasses.h"
+// #include "llvm/Support/BranchProbability.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/ADT/MapVector.h"
+
+
+#include // for transform
+
+#include
+#include
+#include
+
+using namespace llvm;
+using namespace std;
+using namespace IR2Vec;
+
+// Returns the BranchProbabilityInfo cached in bpiMap for F. On a cache miss
+// the result is requested from FAM, stored in bpiMap and then returned.
+// NOTE(review): bpiMap keeps the address of a result obtained from FAM, so the
+// cached pointer is presumably only valid while that FAM is alive -- confirm
+// that callers keep the manager around for as long as bpiMap is consulted.
+BranchProbabilityInfo *IR2Vec_FA::getBPI(Function *F, FunctionAnalysisManager &FAM) {
+  auto It = bpiMap.find(F);
+  if (It != bpiMap.end())
+  {
+    return It->second;
+  }
+  // BranchProbabilityInfo &BPI = getAnalysis(*F).getBPI();
+  // BranchProbabilityInfo &BPI = &FAM.getResult(F).getBPI();
+  // bpiMap[F] = &FAM.getResult(*F).getBPI();
+  // Get new BPI analysis result
+  BranchProbabilityInfo *BPI = &FAM.getResult(*F);
+  bpiMap[F] = BPI;
+  return bpiMap[F];
+}
+
+// Scales a vector by multiplying each element by a factor
+// The vector is modified in place.
+void IR2Vec_FA::scaleVector(SmallVector &vec, float factor) {
+  for (unsigned i = 0; i < vec.size(); i++) {
+    vec[i] = vec[i] * factor;
+  }
+}
+
+// Replaces the vector stored for I in instVecMap with val, marks I as not live
+// in livelinessMap and then hands val to transitiveKillAndUpdate (avg = false).
+// Does nothing when I is null. I must already have an entry in both maps;
+// this is only checked by the asserts below.
+void IR2Vec_FA::killAndUpdate(Instruction *I, SmallVector val) {
+  // LLVM_DEBUG(dbgs() << "kill and update: \n");
+  // LLVM_DEBUG(I->dump());
+  if (I == nullptr)
+    return;
+  auto It1 = instVecMap.find(I);
+  assert(It1 != instVecMap.end() && "Instruction should be defined in map");
+  It1->second = val;
+
+  auto It2 = livelinessMap.find(I);
+  assert(It2 != livelinessMap.end() &&
+         "Instruction should be in livelinessMap");
+  It2->second = false;
+
+  transitiveKillAndUpdate(I, val, false);
+}
+
+// Ensures that the vector updates propagate through all related memory operations.
+// Walks from I to the instruction behind its memory operand (parentI), updates
+// parentI's entry in instVecMap, marks parentI as not live in livelinessMap and
+// recurses on parentI. The walk ends when I is not a memory-access operation
+// (per isMemOp with memAccessOps) or when the operand cast yields null.
+//   I   - instruction to start from; must not be null (asserted).
+//   val - vector to propagate; the same val is passed on at every step.
+//   avg - when true, val is added element-wise to parentI's stored vector and
+//         the sum is scaled by WT; when false, the stored vector is replaced
+//         by val. It is forced to true once parentI is a getelementptr and is
+//         then passed down the rest of the chain.
+void IR2Vec_FA::transitiveKillAndUpdate(Instruction *I,
+                                        SmallVector val,
+                                        bool avg) {
+  assert(I != nullptr);
+  // LLVM_DEBUG(dbgs() << "I: ");
+  // LLVM_DEBUG(I->dump());
+  // operandNum is not initialised here; presumably isMemOp fills it in. It is
+  // only read after isMemOp has returned true.
+  unsigned operandNum;
+  bool isMemAccess = isMemOp(I->getOpcodeName(), operandNum, memAccessOps);
+  if (!isMemAccess)
+    return;
+
+  auto parentI = dyn_cast(I->getOperand(operandNum));
+  if (parentI == nullptr)
+    return;
+  // assert(parentI != nullptr);
+  // LLVM_DEBUG(dbgs() << "\n parentI: ");
+  // LLVM_DEBUG(parentI->dump());
+
+  // From a getelementptr parent onwards, accumulate instead of overwriting
+  if (strcmp(parentI->getOpcodeName(), "getelementptr") == 0)
+    avg = true;
+
+  // LLVM_DEBUG(dbgs() << "\nVal : "; for (auto i : val) { dbgs() << i << " "; });
+  auto It1 = instVecMap.find(parentI);
+  assert(It1 != instVecMap.end() && "Instruction should be defined in map");
+
+  // LLVM_DEBUG(dbgs() << "\nIt.second = : ";
+  //            for (auto i
+  //                 : It1->second) { dbgs() << i << " "; });
+
+  if (avg) {
+    // stored = (stored + val) * WT
+    std::transform(It1->second.begin(), It1->second.end(), val.begin(),
+                   It1->second.begin(), std::plus());
+    scaleVector(It1->second, WT);
+  } else {
+    It1->second = val;
+  }
+  // LLVM_DEBUG(dbgs() << "\nafter transforming : ";
+  //            for (auto i
+  //                 : It1->second) { dbgs() << i << " "; });
+  auto It2 = livelinessMap.find(parentI);
+  assert(It2 != livelinessMap.end() &&
+         "Instruction should be in livelinessMap");
+  It2->second = false;
+
+  transitiveKillAndUpdate(parentI, val, avg);
+}
+
+// void IR2Vec_FA::collectData() {
+//   static bool wasExecuted = false;
+//   if (!wasExecuted) {
+//     errs() << "Reading from " + fname + "\n";
+//     std::ifstream i(fname);
+//     std::string delimiter = ":";
+//     for (std::string line; getline(i, line);) {
+//       std::string token = line.substr(0, line.find(delimiter));
+//       SmallVector rep;
+//       std::string vec = line.substr(line.find(delimiter) + 1, line.length());
+//       std::string val = vec.substr(vec.find("[") + 1, vec.find(", ") - 1);
+//       rep.push_back(stod(val));
+//       int pos = vec.find(", ");
+//       vec = vec.substr(pos + 1);
+//       for (int i = 1; 
i < DIM - 1; i++) { +// val = vec.substr(1, vec.find(", ") - 1); +// rep.push_back(stod(val)); +// pos = vec.find(", "); +// vec = vec.substr(pos + 1); +// } +// val = vec.substr(1, vec.find("]") - 1); +// rep.push_back(stod(val)); +// opcMap[token] = rep; +// } +// wasExecuted = true; +// } +// } + +// Performs recursive analysis of how instructions are used +// Recursively analyzes transitive uses of memory operations +void IR2Vec_FA::getTransitiveUse( + const Instruction *root, const Instruction *def, + SmallVector &visitedList, + SmallVector toAppend) { + unsigned operandNum = 0; + visitedList.push_back(def); + + for (auto U : def->users()) { + if (auto use = dyn_cast(U)) { + if (std::find(visitedList.begin(), visitedList.end(), use) == + visitedList.end()) { + IR2VEC_DEBUG(outs() << "\nDef " << /* def << */ " "; + def->print(outs(), true); outs() << "\n";); + IR2VEC_DEBUG(outs() << "Use " << /* use << */ " "; + use->print(outs(), true); outs() << "\n";); + if (isMemOp(use->getOpcodeName(), operandNum, memWriteOps) && + use->getOperand(operandNum) == def) { + writeDefsMap[root].push_back(use); + } + // If it's a memory access operation, continue the transitive analysis + else if (isMemOp(use->getOpcodeName(), operandNum, memAccessOps) && + use->getOperand(operandNum) == def) { + getTransitiveUse(root, use, visitedList, toAppend); + } + } + } + } + return; +} +// Connects root instructions to their dependent write operations +void IR2Vec_FA::collectWriteDefsMap(Module &M) { + SmallVector visitedList; + for (auto &F : M) { + if (!F.isDeclaration()) { + EliminateUnreachableBlocks(F); + for (auto &BB : F) { + for (auto &I : BB) { + unsigned operandNum = 0; + if ((isMemOp(I.getOpcodeName(), operandNum, memAccessOps) || + isMemOp(I.getOpcodeName(), operandNum, memWriteOps) || + strcmp(I.getOpcodeName(), "alloca") == 0) && + std::find(visitedList.begin(), visitedList.end(), &I) == + visitedList.end()) { + if (I.getNumOperands() > 0) { + // 
IR2VEC_DEBUG(I.print(outs()); outs() << "\n"); + // IR2VEC_DEBUG(outs() << "operandnum = " << operandNum << "\n"); + if (auto parent = + dyn_cast(I.getOperand(operandNum))) { + if (std::find(visitedList.begin(), visitedList.end(), parent) == + visitedList.end()) { + visitedList.push_back(parent); + getTransitiveUse(parent, parent, visitedList); + } + } + } + } + } + } + } + } +} + +Vector IR2Vec_FA::getValue(std::string key) { + // printf("entering get value"); + Vector vec; + if (opcMap.find(key) == opcMap.end()) { + IR2VEC_DEBUG(errs() << "cannot find key in map : " << key << "\n"); + dataMissCounter++; + } else + vec = opcMap[key]; + // for(auto x: opcMap){ + // cout<< "x.first : "<()); + } + + scaleVector(calleeVector, WA); + auto tmpParent = funcVecMap[function]; + std::transform(calleeVector.begin(), calleeVector.end(), tmpParent.begin(), + tmpParent.begin(), std::plus()); + funcVecMap[function] = tmpParent; + } +} + +void IR2Vec_FA::generateFlowAwareEncodings(std::ostream *o, + std::ostream *missCount, + std::ostream *cyclicCount) { + + // collectWriteDefsMap(M); + cout<<"it reaches generateFlow encodings right?"<<"\n"; + int noOfFunc = 0; + + llvm::FunctionAnalysisManager FAM; + // FAM.add(new BranchProbabilityAnalysis()); + + // FAM.addPass(BranchProbabilityAnalysis()); + + llvm::PassBuilder PB; + PB.registerFunctionAnalyses(FAM); + + // FAM.registerPass([] { return llvm::BranchProbabilityAnalysis(); }); + + // better to run bpi for all the functions at the start itself i guess, then no issues here and there + for (auto &f : M) { + if (!f.isDeclaration()) { + getBPI(&f,FAM); + } + } + + for (auto &f : M) { + if (!f.isDeclaration()) { + + // BranchProbabilityInfo *BPI = &FAM.getResult(f); + + SmallVector funcStack; + // auto x = getBPI(&f, BPI); + // if(x != nullptr){ + // cout<<"atleast stuff is not empty" << "\n"; + // } + // for(auto entry : bpiMap){ + // Function *func = entry.first; + // BranchProbabilityInfo *bpi = entry.second; + + // outs() << 
func->getName() << "\n"; + // } + cout<<"tmp gets filled here and func2Vec gets called here, right?"<<"\n"; + auto tmp = func2Vec (f, funcStack, getBPI(&f, FAM)); + // auto tmp = func2Vec(f, funcStack, BPI); + funcVecMap[&f] = tmp; + } + } + + // printing the bpiMap over here, should contain the entire list of functions and their bpi + cout<<"printing the contents of bpiMap over here :"<<"\n"; + for (auto &entry : bpiMap) { + llvm::Function *func = entry.first; + llvm::BranchProbabilityInfo *bpi = entry.second; + + // Print the addresses of the Function and BranchProbabilityInfo pointers + std::cout << "Function pointer: " << func << "\n"; + std::cout << "BranchProbabilityInfo pointer: " << bpi << "\n"; + std::cout << "-------------------------\n"; + } + + // for (auto funcit : funcVecMap) { + // updateFuncVecMapWithCallee(funcit.first); + // } + + for (auto &f : M) { + if (!f.isDeclaration()) { + Vector tmp; + SmallVector funcStack; + tmp = funcVecMap[&f]; + + if (level == 'f') { + res += updatedRes(tmp, &f, &M); + res += "\n"; + noOfFunc++; + } + + // else if (level == 'p') { + std::transform(pgmVector.begin(), pgmVector.end(), tmp.begin(), + pgmVector.begin(), std::plus()); + // } + } + } + + if (level == 'p') { + if (cls != -1) + res += std::to_string(cls) + "\t"; + + for (auto i : pgmVector) { + if ((i <= 0.0001 && i > 0) || (i < 0 && i >= -0.0001)) { + i = 0; + } + res += std::to_string(i) + "\t"; + } + res += "\n"; + } + + if (o) + *o << res; + + if (missCount) { + std::string missEntry = + (M.getSourceFileName() + "\t" + std::to_string(dataMissCounter) + "\n"); + *missCount << missEntry; + } + + if (cyclicCount) + *cyclicCount << (M.getSourceFileName() + "\t" + + std::to_string(cyclicCounter) + "\n"); +} + +// This function will update funcVecMap by doing DFS starting from parent +// function +void IR2Vec_FA::updateFuncVecMap( + llvm::Function *function, + llvm::SmallSet &visitedFunctions) { + visitedFunctions.insert(function); + SmallVector funcStack; + 
funcStack.clear(); + auto tmpParent = func2Vec(*function, funcStack, bpiMap[function]); + // funcVecMap is updated with vectors returned by func2Vec + funcVecMap[function] = tmpParent; + auto calledFunctions = funcCallMap[function]; + for (auto &calledFunction : calledFunctions) { + if (calledFunction && !calledFunction->isDeclaration() && + visitedFunctions.count(calledFunction) == 0) { + // doing casting since calledFunctions is of type of const + // llvm::Function* and we need llvm::Function* as argument + auto *callee = const_cast(calledFunction); + // This function is called recursively to update funcVecMap + updateFuncVecMap(callee, visitedFunctions); + } + } +} + +void IR2Vec_FA::generateFlowAwareEncodingsForFunction( + std::ostream *o, std::string name, std::ostream *missCount, + std::ostream *cyclicCount) { + + int noOfFunc = 0; + for (auto &f : M) { + + auto Result = getActualName(&f); + if (!f.isDeclaration() && Result == name) { + // If funcName is matched with one of the functions in module, we + // will update funcVecMap of it and it's child functions recursively + llvm::SmallSet visitedFunctions; + updateFuncVecMap(&f, visitedFunctions); + } + } + // iterating over all functions in module instead of funcVecMap to preserve + // order + for (auto &f : M) { + if (funcVecMap.find(&f) != funcVecMap.end()) { + auto *function = const_cast(&f); + updateFuncVecMapWithCallee(function); + } + } + + for (auto &f : M) { + auto Result = getActualName(&f); + if (!f.isDeclaration() && Result == name) { + Vector tmp; + SmallVector funcStack; + tmp = funcVecMap[&f]; + + if (level == 'f') { + res += updatedRes(tmp, &f, &M); + res += "\n"; + noOfFunc++; + } + } + } + + if (o) + *o << res; + + if (missCount) { + std::string missEntry = + (M.getSourceFileName() + "\t" + std::to_string(dataMissCounter) + "\n"); + *missCount << missEntry; + } + + if (cyclicCount) + *cyclicCount << (M.getSourceFileName() + "\t" + + std::to_string(cyclicCounter) + "\n"); +} + +void 
IR2Vec_FA::topoDFS(int vertex, std::vector &Visited, + std::vector &visitStack) { + + Visited[vertex] = true; + + auto list = SCCAdjList[vertex]; + + for (auto nodes : list) { + if (Visited[nodes] == false) + topoDFS(nodes, Visited, visitStack); + } + + visitStack.push_back(vertex); +} + +std::vector IR2Vec_FA::topoOrder(int size) { + std::vector Visited(size, false); + std::vector visitStack; + + for (auto &nodes : SCCAdjList) { + if (Visited[nodes.first] == false) { + topoDFS(nodes.first, Visited, visitStack); + } + } + + return visitStack; +} + +void IR2Vec_FA::TransitiveReads(SmallVector &Killlist, + Instruction *Inst, BasicBlock *ParentBB) { + assert(Inst != nullptr); + unsigned operandNum; + bool isMemAccess = isMemOp(Inst->getOpcodeName(), operandNum, memAccessOps); + + if (!isMemAccess) + return; + auto parentI = dyn_cast(Inst->getOperand(operandNum)); + if (parentI == nullptr) + return; + if (ParentBB == parentI->getParent()) + Killlist.push_back(parentI); + TransitiveReads(Killlist, parentI, ParentBB); +} + +SmallVector +IR2Vec_FA::createKilllist(Instruction *Arg, Instruction *writeInst) { + + SmallVector KillList; + SmallVector tempList; + BasicBlock *ParentBB = writeInst->getParent(); + + unsigned opnum; + + for (User *U : Arg->users()) { + if (Instruction *UseInst = dyn_cast(U)) { + if (isMemOp(UseInst->getOpcodeName(), opnum, memWriteOps)) { + Instruction *OpInst = dyn_cast(UseInst->getOperand(opnum)); + if (OpInst && OpInst == Arg) + tempList.push_back(UseInst); + } + } + } + + for (auto I = tempList.rbegin(); I != tempList.rend(); I++) { + if (*I == writeInst) + break; + if (ParentBB == (*I)->getParent()) + KillList.push_back(*I); + } + + return KillList; +} + +// Vector IR2Vec_FA::func2Vec(Function &F, SmallVector &funcStack, BranchProbabilityInfo *bpi){ +Vector IR2Vec_FA::func2Vec(Function &F, + SmallVector &funcStack, + BranchProbabilityInfo *bpi) { + auto It = funcVecMap.find(&F); + if (It != funcVecMap.end()) { + return It->second; + } + + 
funcStack.push_back(&F); + + // instReachingDefsMap.clear(); + // allSCCs.clear(); + // reverseReachingDefsMap.clear(); + // SCCAdjList.clear(); + + Vector funcVector(DIM, 0); // Initialize zero vector + + MapVector> succMap; + MapVector cumulativeScore; + + if(bpi) { + // MapVector> succMap; + // MapVector cumulativeScore; + + for (auto &b : F) { + MapVector succs; + for (auto it = succ_begin(&b), et = succ_end(&b); it != et; ++it) { + BasicBlock *t = *it; + auto bp = bpi->getEdgeProbability(&b, t); + double prob = double(bp.getNumerator()) / double(bp.getDenominator()); + std::cout << "Probability : " << prob << "\n"; + succs[*it] = prob; + } + succMap[&b] = succs; + cumulativeScore[&b] = 0; + } + } + + ReversePostOrderTraversal RPOT(&F); + + bool isHeader = true; + if(bpi){ + for (auto *b : RPOT) { + if (isHeader) + cumulativeScore[b] = 1; + if (succMap.find(b) != succMap.end()) { + for (auto element : succMap[b]) { + auto currentPtr = cumulativeScore[b]; + cumulativeScore[element.first] = + (currentPtr * element.second) + cumulativeScore[element.first]; + } + } + isHeader = false; + } + + // cout<< "cumulative score here : " << "\n"; + // for(auto x : cumulativeScore){ + // cout<<"x.first : " << x.first<< "\n"; + // cout<<"x.second : "<< x.second<< "\n"; + // } + } + + // for (auto *b : RPOT) { + // unsigned opnum; + // SmallVector lists; + // for (auto &I : *b) { + // lists.clear(); + // if (isMemOp(I.getOpcodeName(), opnum, memWriteOps) && + // dyn_cast(I.getOperand(opnum))) { + // Instruction *argI = cast(I.getOperand(opnum)); + // lists = createKilllist(argI, &I); + // TransitiveReads(lists, argI, I.getParent()); + // if (argI->getParent() == I.getParent()) + // lists.push_back(argI); + // killMap[&I] = lists; + // } + // } + // } + + // for (auto *b : RPOT) { + // for (auto &I : *b) { + // for (int i = 0; i < I.getNumOperands(); i++) { + // if (isa(I.getOperand(i))) { + // auto RD = getReachingDefs(&I, i); + // if (instReachingDefsMap.find(&I) == 
instReachingDefsMap.end()) { + // instReachingDefsMap[&I] = RD; + // } else { + // auto RDList = instReachingDefsMap[&I]; + // RDList.insert(RDList.end(), RD.begin(), RD.end()); + // instReachingDefsMap[&I] = RDList; + // } + // } + // } + // } + // } + + // IR2VEC_DEBUG(for (auto &Inst + // : instReachingDefsMap) { + // auto RD = Inst.second; + // outs() << "(" << Inst.first << ")"; + // Inst.first->print(outs()); + // outs() << "\n RD : "; + // for (auto defs : RD) { + // defs->print(outs()); + // outs() << "(" << defs << ") "; + // } + // outs() << "\n"; + // }); + + // // one time Reversing instReachingDefsMap to be used to calculate SCCs + // for (auto &I : instReachingDefsMap) { + // auto RD = I.second; + // for (auto defs : RD) { + // if (reverseReachingDefsMap.find(defs) == reverseReachingDefsMap.end()) { + // llvm::SmallVector revDefs; + // revDefs.push_back(I.first); + // reverseReachingDefsMap[defs] = revDefs; + // } else { + // auto defVector = reverseReachingDefsMap[defs]; + // defVector.push_back(I.first); + // reverseReachingDefsMap[defs] = defVector; + // } + // } + // } + + // getAllSCC(); + + // std::sort(allSCCs.begin(), allSCCs.end(), + // [](llvm::SmallVector &a, + // llvm::SmallVector &b) { + // return a.size() < b.size(); + // }); + + // IR2VEC_DEBUG(int i = 0; for (auto &sets + // : allSCCs) { + // outs() << "set: " << i << "\n"; + // for (auto insts : sets) { + // insts->print(outs()); + // outs() << " " << insts << " "; + // } + // outs() << "\n"; + // i++; + // }); + + // for (int i = 0; i < allSCCs.size(); i++) { + // auto set = allSCCs[i]; + // for (int j = 0; j < set.size(); j++) { + // auto RD = instReachingDefsMap[set[j]]; + // if (!RD.empty()) { + // for (auto defs : RD) { + // for (int k = 0; k < allSCCs.size(); k++) { + // if (k == i) + // continue; + // auto sccSet = allSCCs[k]; + // if (std::find(sccSet.begin(), sccSet.end(), defs) != sccSet.end()) { + // // outs() << i << " depends on " << k << "\n"; + // if (SCCAdjList.find(k) 
== SCCAdjList.end()) { + // std::vector temp; + // temp.push_back(i); + // SCCAdjList[k] = temp; + // } else { + // auto temp = SCCAdjList[k]; + // if (std::find(temp.begin(), temp.end(), i) == temp.end()) + // temp.push_back(i); + // SCCAdjList[k] = temp; + // } + // } + // } + // } + // } + // } + // } + + // IR2VEC_DEBUG(outs() << "\nAdjList:\n"; for (auto &nodes + // : SCCAdjList) { + // outs() << "Adjlist for: " << nodes.first << "\n"; + // for (auto components : nodes.second) { + // outs() << components << " "; + // } + // outs() << "\n"; + // }); + + // std::vector stack; + + // stack = topoOrder(allSCCs.size()); + + // for (int i = 0; i < allSCCs.size(); i++) { + // if (std::find(stack.begin(), stack.end(), i) == stack.end()) { + // stack.insert(stack.begin(), i); + // } + // } + + // IR2VEC_DEBUG(outs() << "New topo order: \n"; for (auto sets + // : stack) { + // outs() << sets << " "; + // } outs() << "\n";); + + // SmallVector prevVec; + // Instruction *argToKill = nullptr; + + // while (stack.size() != 0) { + // int idx = stack.back(); + // stack.pop_back(); + // auto component = allSCCs[idx]; + // SmallMapVector partialInstValMap; + // if (component.size() == 1) { + // auto defs = component[0]; + // partialInstValMap[defs] = {}; + // getPartialVec(*defs, partialInstValMap); + // solveSingleComponent(*defs, partialInstValMap, funcStack); + // partialInstValMap.erase(defs); + // } else { + // cyclicCounter++; // for components with length more than 1 will + // // represent cycles + // for (auto defs : component) { + // partialInstValMap[defs] = {}; + // getPartialVec(*defs, partialInstValMap); + // } + + // if (!partialInstValMap.empty()) + // solveInsts(partialInstValMap, funcStack); + // } + // } + + for (auto *b : RPOT) { + bb2Vec(*b, funcStack); + Vector bbVector(DIM, 0); + // IR2VEC_DEBUG(outs() << "-------------------------------------------\n"); + for (auto &I : *b) { + auto It1 = livelinessMap.find(&I); + if (It1->second == true) { + // 
IR2VEC_DEBUG(I.print(outs()); outs() << "\n"); + auto vec = instVecMap.find(&I)->second; + // IR2VEC_DEBUG(outs() << vec[0] << "\n\n"); + std::transform(bbVector.begin(), bbVector.end(), vec.begin(), + bbVector.begin(), std::plus()); + } + } + + // IR2VEC_DEBUG(outs() << "-------------------------------------------\n"); + for (auto i : bbVector) { + if ((i <= 0.0001 && i > 0) || (i < 0 && i >= -0.0001)) { + i = 0; + } + } + + if(bpi){ + auto prob = cumulativeScore[b]; + Vector weightedBBVector; + + // main thing changes here + for(auto p : bbVector){ + // cout<< "value of p here : " << p<< "\n"; + weightedBBVector.push_back(prob * p); + } + + // cout << "weightedBBVector here : " << "\n"; + // for(auto x : weightedBBVector){ + // cout<()); + } + else{ + std::transform(funcVector.begin(), funcVector.end(), bbVector.begin(), + funcVector.begin(), std::plus()); + } + } + + // cout<< "funcVector here : "<getLoopFor(BB); + if (L) { + while (const Loop *Parent = L->getParentLoop()) + L = Parent; + } + return L; +} + +double IR2Vec_FA::getRDProb(const Instruction *src, const Instruction *tgt, + llvm::SmallVector writeSet) { + // if(bprob == 0) + // return 1; + // assert(instVecMap.find(src)!=instVecMap.end() && "Vector of the instruction + // should be available at this point"); + // if (bprob == 0) + // return 1; + // LLVM_DEBUG(errs() << "YOLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLOOO\n"); + // LLVM_DEBUG(src->dump()); + // LLVM_DEBUG(tgt->dump()); + // LLVM_DEBUG(errs() << "yooooooodoooaaaaaaaaaaaaaaawwwwwwwwwwwggg\n"); + auto srcParent = src->getParent(); + auto tgtParent = tgt->getParent(); + + SmallPtrSet writingBB; + + for (auto I : writeSet) + { + writingBB.insert(I->getParent()); + llvm::errs() << "Writing Basic Block: " << I->getParent()->getName() << "\n"; + } + + if (srcParent == tgtParent) { + // auto It1 = instVecMap.find(src); + // assert (It1 != instVecMap.end() && "Instruction should be defined in + // map"); return It1->second; + llvm::errs() << "Source and 
Target are in the same BasicBlock\n"; + return 1; + } + + SmallVector stack; + // SmallDenseMap visited; + // SmallDenseMap last_seen; + SmallMapVector visited; + SmallMapVector last_seen; + + auto curNode = srcParent; + auto curNodeTerminatorInst = curNode->getTerminator(); + bool flag = false; + double prob = 1; + llvm::errs() << "Starting traversal from: " << srcParent->getName() << "\n"; + do { + visited[curNode] = true; + if (flag) { + stack.pop_back(); + if (stack.empty()) + break; + curNode = stack.back(); + curNodeTerminatorInst = curNode->getTerminator(); + } else { + stack.push_back(curNode); + } + flag = true; + if (!last_seen[curNodeTerminatorInst]) { + last_seen[curNodeTerminatorInst] = 0; + } + for (unsigned i = last_seen[curNodeTerminatorInst]; + i < curNodeTerminatorInst->getNumSuccessors(); i++) { + last_seen[curNodeTerminatorInst]++; + auto succ = curNodeTerminatorInst->getSuccessor(i); + if (succ == tgtParent) { + // issues can happen here ? + + // auto bpi = bpiMap[(const_cast(stack.front())->getParent())]; + // MAKING CHANGES HERE: + Function* parent = (const_cast(stack.front())->getParent()); + auto it = bpiMap.find(parent); + cout<<"parent here : "<getName() << "\n"; + // auto bpi; + BranchProbabilityInfo *bpi; + if(it!=bpiMap.end()){ + cout<<"MEANS IT IS NOT EMPTY HERE"<<"\n"; + bpi = bpiMap[parent]; + } + else{ + cout<<"HOW IS IT COMING AS EMPTY ?"<< "\n"; + llvm::FunctionAnalysisManager FAM; + // FAM.add(new BranchProbabilityAnalysis()); + + // FAM.addPass(BranchProbabilityAnalysis()); + + llvm::PassBuilder PB; + PB.registerFunctionAnalyses(FAM); + bpiMap[parent]=getBPI(parent, FAM); + bpi = bpiMap[parent]; + } + cout<<"value of bpi :"<getEdgeProbability(prev, BB); + cout<<"is bp coming correctly :"<<&bp<<"\n"; + llvm::errs() << "Edge Probability " << prev->getName() << " -> " << BB->getName() << " : " << double(bp.getNumerator()) / bp.getDenominator() << "\n"; + prob = prob * double(bp.getNumerator()) / double(bp.getDenominator()); + prev 
= BB; + // LLVM_DEBUG(BB->dump()); + } + auto bp = bpi->getEdgeProbability(prev, succ); + llvm::errs() << "Final Edge Probability " << prev->getName() << " -> " << succ->getName() << " : " << double(bp.getNumerator()) / bp.getDenominator() << "\n"; + prob = prob * double(bp.getNumerator()) / double(bp.getDenominator()); + // LLVM_DEBUG(succ->dump()); + // LLVM_DEBUG(errs() << "alllllllllgoooooooooooooooodddddddddd\n"); + curNode = succ; + curNodeTerminatorInst = curNode->getTerminator(); + flag = false; + break; + } else if (!visited[succ] && writingBB.find(succ) == writingBB.end()) { + llvm::errs() << "Traversing to successor BasicBlock: " << succ->getName() << "\n"; + curNode = succ; + curNodeTerminatorInst = curNode->getTerminator(); + flag = false; + break; + } + } + } while (!stack.empty()); + + // LLVM_DEBUG(dbgs() << "Returning from RD Value\n"); + llvm::errs() << "Computed Probability: " << prob << "\n"; + cout<<"value of prob here , going out successfully :" << prob <<"\n"; + return prob; +} + +bool isPotentiallyReachableFromMany( + SmallVectorImpl &Worklist, BasicBlock *StopBB, + const SmallPtrSetImpl *ExclusionSet, + const DominatorTree *DT, const LoopInfo *LI) { + // When the stop block is unreachable, it's dominated from everywhere, + // regardless of whether there's a path between the two blocks. + if (DT && !DT->isReachableFromEntry(StopBB)) + DT = nullptr; + + // We can't skip directly from a block that dominates the stop block if the + // exclusion block is potentially in between. + if (ExclusionSet && !ExclusionSet->empty()) + DT = nullptr; + + // Normally any block in a loop is reachable from any other block in a loop, + // however excluded blocks might partition the body of a loop to make that + // untrue. + + SmallPtrSet LoopsWithHoles; + if (LI && ExclusionSet) { + for (auto BB : *ExclusionSet) { + if (const Loop *L = getOutermostLoop(LI, BB)) + LoopsWithHoles.insert(L); + } + } + + const Loop *StopLoop = LI ? 
getOutermostLoop(LI, StopBB) : nullptr; + + // Limit the number of blocks we visit. The goal is to avoid run-away + // compile times on large CFGs without hampering sensible code. Arbitrarily + // chosen. + unsigned Limit = 32; + + SmallPtrSet Visited; + do { + BasicBlock *BB = Worklist.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + if (BB == StopBB) + return true; + if (ExclusionSet && ExclusionSet->count(BB)) + continue; + if (DT && DT->dominates(BB, StopBB)) + return true; + + const Loop *Outer = nullptr; + if (LI) { + Outer = getOutermostLoop(LI, BB); + // If we're in a loop with a hole, not all blocks in the loop are + // reachable from all other blocks. That implies we can't simply + // jump to the loop's exit blocks, as that exit might need to pass + // through an excluded block. Clear Outer so we process BB's + // successors. + if (LoopsWithHoles.count(Outer)) + Outer = nullptr; + if (StopLoop && Outer == StopLoop) + return true; + } + + if (!--Limit) { + // We haven't been able to prove it one way or the other. + // Conservatively answer true -- that there is potentially a path. + return true; + } + + if (Outer) { + // All blocks in a single loop are reachable from all other blocks. + // From any of these blocks, we can skip directly to the exits of + // the loop, ignoring any other blocks inside the loop body. + Outer->getExitBlocks(Worklist); + } else { + Worklist.append(succ_begin(BB), succ_end(BB)); + } + } while (!Worklist.empty()); + + // We have exhausted all possible paths and are certain that 'To' can not be + // reached from 'From'. 
+ return false; +} + +bool isPotentiallyReachable( + const Instruction *A, const Instruction *B, + const SmallPtrSetImpl *ExclusionSet, + const DominatorTree *DT, const LoopInfo *LI) { + assert(A->getParent()->getParent() == B->getParent()->getParent() && + "This analysis is function-local!"); + + SmallVector Worklist; + + if (A->getParent() == B->getParent()) { + // The same block case is special because it's the only time we're + // looking within a single block to see which instruction comes first. + // Once we start looking at multiple blocks, the first instruction of + // the block is reachable, so we only need to determine reachability + // between whole blocks. + BasicBlock *BB = const_cast(A->getParent()); + + // If the block is in a loop then we can reach any instruction in the + // block from any other instruction in the block by going around a + // backedge. + if (LI && LI->getLoopFor(BB) != nullptr) + return true; + + // Linear scan, start at 'A', see whether we hit 'B' or the end first. + for (BasicBlock::const_iterator I = A->getIterator(), E = BB->end(); I != E; + ++I) { + if (&*I == B) + return true; + } + + // Can't be in a loop if it's the entry block -- the entry block may not + // have predecessors. + if (BB == &BB->getParent()->getEntryBlock()) + return false; + + // Otherwise, continue doing the normal per-BB CFG walk. + Worklist.append(succ_begin(BB), succ_end(BB)); + + if (Worklist.empty()) { + // We've proven that there's no path! 
+ return false; + } + } else { + Worklist.push_back(const_cast(A->getParent())); + } + + if (DT) { + if (DT->isReachableFromEntry(A->getParent()) && + !DT->isReachableFromEntry(B->getParent())) + return false; + if (!ExclusionSet || ExclusionSet->empty()) { + if (A->getParent() == &A->getParent()->getParent()->getEntryBlock() && + DT->isReachableFromEntry(B->getParent())) + return true; + if (B->getParent() == &A->getParent()->getParent()->getEntryBlock() && + DT->isReachableFromEntry(A->getParent())) + return false; + } + } + + return isPotentiallyReachableFromMany( + Worklist, const_cast(B->getParent()), ExclusionSet, DT, LI); +} + +SmallVector +IR2Vec_FA::getReachingDefs(const Instruction *I, unsigned loc) { + IR2VEC_DEBUG( + outs() + << "Call to getReachingDefs Started****************************\n"); + auto parent = dyn_cast(I->getOperand(loc)); + if (!parent) + return {}; + SmallVector RD; + SmallVector probableRD; + IR2VEC_DEBUG(outs() << "Inside RD for : "); + IR2VEC_DEBUG(I->print(outs()); outs() << "\n"); + + if (writeDefsMap[parent].empty()) { + RD.push_back(parent); + return RD; + } + + if (writeDefsMap[parent].size() >= 1) { + SmallMapVector, 16> + bbInstMap; + // Remove definitions which don't reach I + for (auto it : writeDefsMap[parent]) { + if (it != I && isPotentiallyReachable(it, I)) { + + probableRD.push_back(it); + } + } + probableRD.push_back(parent); + IR2VEC_DEBUG(outs() << "----PROBABLE RD---" + << "\n"); + for (auto i : probableRD) { + IR2VEC_DEBUG(i->print(outs()); outs() << "\n"); + bbInstMap[i->getParent()].push_back(i); + } + + IR2VEC_DEBUG(outs() << "contents of bbinstmap:\n"; for (auto i + : bbInstMap) { + for (auto j : i.second) { + j->print(outs()); + outs() << "\n"; + } + outs() << "+++++++++++++++++++++++++\n"; + }); + + // If there is a reachable write within I's basic block only that defn + // would reach always If there are more than one defn, take the + // immediate defn before I + if (!bbInstMap[I->getParent()].empty()) { + 
IR2VEC_DEBUG(outs() << "--------Within BB--------\n"); + IR2VEC_DEBUG(I->print(outs()); outs() << "\n"); + auto orderedVec = bbInstMap[I->getParent()]; + const Instruction *probableRD = nullptr; + for (auto &i : *(I->getParent())) { + if (&i == I) + break; + else { + if (std::find(orderedVec.begin(), orderedVec.end(), &i) != + orderedVec.end()) + probableRD = &i; + } + } + + if (probableRD != nullptr) { + IR2VEC_DEBUG(outs() << "Returning: "); + IR2VEC_DEBUG(probableRD->print(outs()); outs() << "\n"); + RD.push_back(probableRD); + return RD; + } + } + + IR2VEC_DEBUG(outs() << "--------Across BB--------\n"); + SmallVector toDelete; + for (auto it : bbInstMap) { + IR2VEC_DEBUG(outs() << "--------INSTMAP BEGIN--------\n"; + it.first->print(outs()); outs() << "\n"); + bool first = true; + for (auto it1 : bbInstMap[it.first]) { + if (first) { + first = false; + continue; + } + toDelete.push_back(it1); + IR2VEC_DEBUG(it1->print(outs()); outs() << "\n"); + } + IR2VEC_DEBUG(outs() << "--------INSTMAP END--------\n"); + } + auto tmp = probableRD; + probableRD = {}; + for (auto i : tmp) { + if (std::find(toDelete.begin(), toDelete.end(), i) == toDelete.end()) + probableRD.push_back(i); + } + + IR2VEC_DEBUG(I->print(outs()); outs() << "\n"; outs() << "probableRD: \n"; + for (auto i + : probableRD) i->print(outs()); + outs() << "\n"; outs() << "-----------------\n"); + + SmallPtrSet bbSet; + SmallMapVector refBBInstMap; + + for (auto i : probableRD) { + bbSet.insert(i->getParent()); + refBBInstMap[i->getParent()] = i; + IR2VEC_DEBUG(outs() << i->getParent()->getName().str() << "\n"); + } + for (auto i : bbSet) { + IR2VEC_DEBUG(i->print(outs()); outs() << "\n"); + auto exclusionSet = bbSet; + exclusionSet.erase(i); + if (isPotentiallyReachable(refBBInstMap[i], I, &exclusionSet, nullptr, + nullptr)) { + RD.push_back(refBBInstMap[i]); + IR2VEC_DEBUG(outs() << "refBBInstMap : "; + refBBInstMap[i]->print(outs()); outs() << "\n"); + } + } + IR2VEC_DEBUG( + outs() << 
"****************************\n"; + outs() << "Reaching defn for "; I->print(outs()); outs() << "\n"; + for (auto i + : RD) i->print(outs()); + outs() << "\n"; + outs() + << "Call to getReachingDefs Ended****************************\n"); + return RD; + } + + llvm_unreachable("unreachable"); + return {}; +} + +bool IR2Vec_FA::isMemOp(StringRef opcode, unsigned &operand, + SmallDenseMap map) { + bool isMemOperand = false; + auto It = map.find(opcode); + if (It != map.end()) { + isMemOperand = true; + operand = It->second; + } + return isMemOperand; +} + +/*---------------------------------------------------------------------------------- + Function to get Partial Vector of an instruction + ---------------------------------------------------------------------------------- +*/ +void IR2Vec_FA::getPartialVec( + const Instruction &I, + SmallMapVector &partialInstValMap) { + + if (instVecMap.find(&I) != instVecMap.end()) { + IR2VEC_DEBUG(outs() << "Returning from inst2Vec() I found in Map\n"); + return; + } + + Vector instVector(DIM, 0); + StringRef opcodeName = I.getOpcodeName(); + auto vec = getValue(opcodeName.str()); + IR2VEC_DEBUG(I.print(outs()); outs() << "\n"); + std::transform(instVector.begin(), instVector.end(), vec.begin(), + instVector.begin(), std::plus()); + partialInstValMap[&I] = instVector; + + IR2VEC_DEBUG(outs() << "contents of partialInstValMap:\n"; + for (auto i + : partialInstValMap) { + i.first->print(outs()); + outs() << "\n"; + }); + auto type = I.getType(); + + if (type->isVoidTy()) { + vec = getValue("voidTy"); + } else if (type->isFloatingPointTy()) { + vec = getValue("floatTy"); + } else if (type->isIntegerTy()) { + vec = getValue("integerTy"); + } else if (type->isFunctionTy()) { + vec = getValue("functionTy"); + } else if (type->isStructTy()) { + vec = getValue("structTy"); + } else if (type->isArrayTy()) { + vec = getValue("arrayTy"); + } else if (type->isPointerTy()) { + vec = getValue("pointerTy"); + } else if (type->isVectorTy()) { + 
vec = getValue("vectorTy"); + } else if (type->isEmptyTy()) { + vec = getValue("emptyTy"); + } else if (type->isLabelTy()) { + vec = getValue("labelTy"); + } else if (type->isTokenTy()) { + vec = getValue("tokenTy"); + } else if (type->isMetadataTy()) { + vec = getValue("metadataTy"); + } else { + vec = getValue("unknownTy"); + } + + scaleVector(vec, WT); + std::transform(instVector.begin(), instVector.end(), vec.begin(), + instVector.begin(), std::plus()); + + partialInstValMap[&I] = instVector; +} +/*---------------------------------------------------------------------------------- + Function to solve circular dependencies in Instructions + ---------------------------------------------------------------------------------- +*/ +void IR2Vec_FA::solveInsts( + llvm::SmallMapVector + &partialInstValMap, SmallVector &funcStack) { + std::map xI; + std::map Ix; + std::vector> A, B; + SmallMapVector, 16> + RDValMap; + unsigned pos = 0; + for (auto It : partialInstValMap) { + auto inst = It.first; + if (instVecMap.find(inst) == instVecMap.end()) { + Ix[inst] = pos; + xI[pos++] = inst; + std::vector tmp; + for (auto i : It.second) { + tmp.push_back((int)(i * 10) / 10.0); + } + B.push_back(tmp); + for (unsigned i = 0; i < inst->getNumOperands(); i++) { + if (isa(inst->getOperand(i))) { + auto f = getValue("function"); + if (isa(inst)) { + auto ci = dyn_cast(inst); + Function *func = ci->getCalledFunction(); + if (func) { + if (!func->isDeclaration() && std::find(funcStack.begin(), funcStack.end(), func) == + funcStack.end()) { + // Will be dealt with later + // change might be needed here, don't know for sure + Vector tempCall(DIM, 0); + // f = tempCall; + f = func2Vec(*func, funcStack, bpiMap[func]); + } + } + } + auto svtmp = f; + scaleVector(svtmp, WA); + std::vector vtmp(svtmp.begin(), svtmp.end()); + std::vector vec = B.back(); + IR2VEC_DEBUG(outs() << vec.back() << "\n"); + IR2VEC_DEBUG(outs() << vtmp.back() << "\n"); + B.pop_back(); + std::transform(vtmp.begin(), 
vtmp.end(), vec.begin(), vec.begin(), + std::plus()); + IR2VEC_DEBUG(outs() << vec.back() << "\n"); + B.push_back(vec); + } else if (isa(inst->getOperand(i)) && + !isa(inst->getOperand(i)->getType())) { + auto c = getValue("constant"); + auto svtmp = c; + scaleVector(svtmp, WA); + std::vector vtmp(svtmp.begin(), svtmp.end()); + std::vector vec = B.back(); + IR2VEC_DEBUG(outs() << vec.back() << "\n"); + IR2VEC_DEBUG(outs() << vtmp.back() << "\n"); + B.pop_back(); + std::transform(vtmp.begin(), vtmp.end(), vec.begin(), vec.begin(), + std::plus()); + IR2VEC_DEBUG(outs() << vec.back() << "\n"); + B.push_back(vec); + } else if (isa(inst->getOperand(i))) { + auto l = getValue("label"); + auto svtmp = l; + scaleVector(svtmp, WA); + std::vector vtmp(svtmp.begin(), svtmp.end()); + std::vector vec = B.back(); + IR2VEC_DEBUG(outs() << vec.back() << "\n"); + IR2VEC_DEBUG(outs() << vtmp.back() << "\n"); + B.pop_back(); + std::transform(vtmp.begin(), vtmp.end(), vec.begin(), vec.begin(), + std::plus()); + IR2VEC_DEBUG(outs() << vec.back() << "\n"); + B.push_back(vec); + } else { + /* + if (isa(inst->getOperand(i))) { + auto RD = getReachingDefs(inst, i); + for (auto i : RD) { + // Check if value of RD is precomputed + if (instVecMap.find(i) == instVecMap.end()) { + if (partialInstValMap.find(i) == partialInstValMap.end()) { + assert(partialInstValMap.find(i) != partialInstValMap.end() && + "Should not reach"); + } + if (RDValMap.find(inst) == RDValMap.end()) { + SmallMapVector tmp; + // change needed over here + tmp[i] = WA; + RDValMap[inst] = tmp; + } else { + RDValMap[inst][i] = WA; + } + } else { + auto svtmp = instVecMap[i]; + scaleVector(svtmp, WA); + std::vector vtmp(svtmp.begin(), svtmp.end()); + std::vector vec = B.back(); + IR2VEC_DEBUG(outs() << vec.back() << "\n"); + IR2VEC_DEBUG(outs() << vtmp.back() << "\n"); + B.pop_back(); + std::transform(vtmp.begin(), vtmp.end(), vec.begin(), + vec.begin(), std::plus()); + IR2VEC_DEBUG(outs() << vec.back() << "\n"); + 
B.push_back(vec); + } + } + } else if (isa(inst->getOperand(i)->getType())) { + auto l = getValue("pointer"); + auto svtmp = l; + scaleVector(svtmp, WA); + std::vector vtmp(svtmp.begin(), svtmp.end()); + std::vector vec = B.back(); + IR2VEC_DEBUG(outs() << vec.back() << "\n"); + IR2VEC_DEBUG(outs() << vtmp.back() << "\n"); + B.pop_back(); + std::transform(vtmp.begin(), vtmp.end(), vec.begin(), vec.begin(), + std::plus()); + IR2VEC_DEBUG(outs() << vec.back() << "\n"); + B.push_back(vec); + } else { + auto l = getValue("variable"); + auto svtmp = l; + scaleVector(svtmp, WA); + std::vector vtmp(svtmp.begin(), svtmp.end()); + std::vector vec = B.back(); + IR2VEC_DEBUG(outs() << vec.back() << "\n"); + IR2VEC_DEBUG(outs() << vtmp.back() << "\n"); + B.pop_back(); + std::transform(vtmp.begin(), vtmp.end(), vec.begin(), vec.begin(), + std::plus()); + IR2VEC_DEBUG(outs() << vec.back() << "\n"); + B.push_back(vec); + } + + */ + auto RD = getReachingDefs(inst, i); + for (auto i : RD) { + // Check if value of RD is precomputed + if (instVecMap.find(i) == instVecMap.end()) { + if (partialInstValMap.find(i) == partialInstValMap.end()) { + llvm_unreachable("Should not reach"); + } + if (RDValMap.find(inst) == RDValMap.end()) { + // SmallDenseMap tmp; + SmallMapVector tmp; + tmp[i] = WA * getRDProb(i, inst, RD); + RDValMap[inst] = tmp; + } else { + RDValMap[inst][i] = WA * getRDProb(i, inst, RD); + } + } else { + auto prob = getRDProb(i, inst, RD); + auto svtmp = instVecMap[i]; + scaleVector(svtmp, prob * WA); + std::vector vtmp(svtmp.begin(), svtmp.end()); + std::vector vec = B.back(); + // LLVM_DEBUG(dbgs() << vec.back() << "\n"); + // LLVM_DEBUG(dbgs() << vtmp.back() << "\n"); + B.pop_back(); + std::transform(vtmp.begin(), vtmp.end(), vec.begin(), + vec.begin(), std::plus()); + // LLVM_DEBUG(dbgs() << vec.back() << "\n"); + B.push_back(vec); + } + } + } + } + } + } + + for (unsigned i = 0; i < xI.size(); i++) { + std::vector tmp(xI.size(), 0); + A.push_back(tmp); + } + + for 
(unsigned i = 0; i < xI.size(); i++) { + A[i][i] = 1; + auto tmp = A[i]; + auto instRDVal = RDValMap[xI[i]]; + for (auto j : instRDVal) { + A[i][Ix[j.first]] = (int)((A[i][Ix[j.first]] - j.second) * 10) / 10.0; + } + } + + for (unsigned i = 0; i < B.size(); i++) { + auto Bvec = B[i]; + for (unsigned j = 0; j < B[i].size(); j++) { + B[i][j] = (int)(B[i][j] * 10) / 10.0; + } + } + + auto C = solve(A, B); + SmallMapVector, 16> + bbInstMap; + + for (unsigned i = 0; i < C.size(); i++) { + Vector tmp(C[i].begin(), C[i].end()); + IR2VEC_DEBUG(outs() << "inst:" + << "\t"; + xI[i]->print(outs()); outs() << "\nVAL: " << tmp[0] << "\n"); + + instVecMap[xI[i]] = tmp; + livelinessMap.try_emplace(xI[i], true); + + instSolvedBySolver.push_back(xI[i]); + bbInstMap[xI[i]->getParent()].push_back(xI[i]); + } + + for (auto BB : bbInstMap) { + unsigned opnum; + auto orderedInstVec = BB.second; + for (auto I : orderedInstVec) { + if (killMap.find(I) != killMap.end()) { + auto list = killMap[I]; + for (auto defs : list) { + auto It2 = livelinessMap.find(defs); + if (It2 == livelinessMap.end()) + livelinessMap.try_emplace(defs, false); + else + It2->second = false; + } + } + } + } +} + +/*---------------------------------------------------------------------------------- + Function to solve a single instruction usually forming a SCC + ---------------------------------------------------------------------------------- +*/ + +void IR2Vec_FA::solveSingleComponent( + const Instruction &I, + SmallMapVector &partialInstValMap, SmallVector &funcStack) { + + if (instVecMap.find(&I) != instVecMap.end()) { + IR2VEC_DEBUG(outs() << "Returning from inst2Vec() I found in Map\n"); + return; + } + + Vector instVector(DIM, 0); + StringRef opcodeName = I.getOpcodeName(); + + instVector = partialInstValMap[&I]; + + unsigned operandNum; + bool isMemWrite = isMemOp(opcodeName, operandNum, memWriteOps); + bool isCyclic = false; + Vector VecArgs(DIM, 0); + + SmallVector RDList; + RDList.clear(); + + for 
(unsigned i = 0; i < I.getNumOperands() /*&& !isCyclic*/; i++) { + Vector vecOp(DIM, 0); + if (isa(I.getOperand(i))) { + vecOp = getValue("function"); + if (isa(I)) { + auto ci = dyn_cast(&I); + Function *func = ci->getCalledFunction(); + if (func) { + if (!func->isDeclaration() && std::find(funcStack.begin(), funcStack.end(), func) == + funcStack.end()) { + // Will be dealt with later + // probably over here as well change ? + Vector tempCall(DIM, 0); + // vecOp = tempCall; + vecOp = func2Vec(*func, funcStack, bpiMap[func]); + + } + } + } + } + // Checking that the argument is not of pointer type because some + // non-numeric/alphabetic constants are also caught as pointer types + else if (isa(I.getOperand(i)) && + !isa(I.getOperand(i)->getType())) { + vecOp = getValue("constant"); + } else if (isa(I.getOperand(i))) { + vecOp = getValue("label"); + } else { + if (isa(I.getOperand(i))) { + auto RD = getReachingDefs(&I, i); + + if (!RD.empty()) { + vecOp = SmallVector(DIM, 0); + for (auto i : RD) { + // Check if value of RD is precomputed + if (instVecMap.find(i) == instVecMap.end()) { + if (partialInstValMap.find(i) == partialInstValMap.end()) { + partialInstValMap[i] = {}; + inst2Vec(*i, funcStack, partialInstValMap); + partialInstValMap.erase(i); + + if (std::find(instSolvedBySolver.begin(), + instSolvedBySolver.end(), + &I) != instSolvedBySolver.end()) + return; + + auto prob = getRDProb(i, &I, RD); + auto tmp = instVecMap[i]; + scaleVector(tmp, prob); + std::transform(tmp.begin(), tmp.end(), vecOp.begin(), vecOp.begin(), + std::plus()); + + } else { + isCyclic = true; + break; + } + } else { + auto prob = getRDProb(i, &I, RD); + auto tmp = instVecMap[i]; + scaleVector(tmp, prob); + std::transform(tmp.begin(), tmp.end(), vecOp.begin(), vecOp.begin(), + std::plus()); + } + } + } + + RDList.insert(RDList.end(), RD.begin(), RD.end()); + } else if (isa(I.getOperand(i)->getType())) { + vecOp = getValue("pointer"); + } else + vecOp = getValue("variable"); + } + + 
std::transform(VecArgs.begin(), VecArgs.end(), vecOp.begin(), + VecArgs.begin(), std::plus()); + // } + + Vector vecInst = Vector(DIM, 0); + + // if (!RDList.empty()) { + // for (auto i : RDList) { + // // Check if value of RD is precomputed + // if (instVecMap.find(i) == instVecMap.end()) { + + // /*Some phi instructions reach themselves and hence may not be in + // the instVecMap but should be in the partialInstValMap*/ + + // if (partialInstValMap.find(i) == partialInstValMap.end()) { + // assert(partialInstValMap.find(i) != partialInstValMap.end() && + // "Should have been in instvecmap or partialmap"); + // } + // } else { + // std::transform(instVecMap[i].begin(), instVecMap[i].end(), + // vecInst.begin(), vecInst.begin(), std::plus()); + // } + // } + // } + + if (!isCyclic) { + std::transform(VecArgs.begin(), VecArgs.end(), vecInst.begin(), + VecArgs.begin(), std::plus()); + + IR2VEC_DEBUG(outs() << VecArgs[0]); + + scaleVector(VecArgs, WA); + IR2VEC_DEBUG(outs() << VecArgs.front()); + // std::transform(instVector.begin(), instVector.end(), VecArgs.begin(), + // instVector.begin(), std::plus()); + std::transform(instVector.begin(), instVector.end(), vecOp.begin(), + instVector.begin(), std::plus()); + IR2VEC_DEBUG(outs() << instVector.front()); + + instVecMap[&I] = instVector; + livelinessMap.try_emplace(&I, true); + + if (killMap.find(&I) != killMap.end()) { + auto list = killMap[&I]; + for (auto defs : list) { + auto It2 = livelinessMap.find(defs); + if (It2 == livelinessMap.end()) + livelinessMap.try_emplace(defs, false); + else + It2->second = false; + } + } + } + assert(isCyclic == false && "A Single Component should not have a cycle!"); + } +} + +/*---------------------------------------------------------------------------------- + Function to solve left over instructions after all dependencies are solved + ---------------------------------------------------------------------------------- +*/ + +void IR2Vec_FA::inst2Vec( + const Instruction &I, 
SmallVector &funcStack, + SmallMapVector &partialInstValMap) { + + if (instVecMap.find(&I) != instVecMap.end()) { + IR2VEC_DEBUG(outs() << "Returning from inst2Vec() I found in Map\n"); + return; + } + // cout<<"ENTERING INST2VEC"<<"\n"; + + Vector instVector(DIM, 0); + StringRef opcodeName = I.getOpcodeName(); + auto vec = getValue(opcodeName.str()); + IR2VEC_DEBUG(I.print(outs()); outs() << "\n"); + std::transform(instVector.begin(), instVector.end(), vec.begin(), + instVector.begin(), std::plus()); + partialInstValMap[&I] = instVector; + + IR2VEC_DEBUG(outs() << "contents of partialInstValMap:\n"; + for (auto i + : partialInstValMap) { + i.first->print(outs()); + outs() << "\n"; + }); + + auto type = I.getType(); + + if (type->isVoidTy()) { + vec = getValue("voidTy"); + } else if (type->isFloatingPointTy()) { + vec = getValue("floatTy"); + } else if (type->isIntegerTy()) { + vec = getValue("integerTy"); + } else if (type->isFunctionTy()) { + vec = getValue("functionTy"); + } else if (type->isStructTy()) { + vec = getValue("structTy"); + } else if (type->isArrayTy()) { + vec = getValue("arrayTy"); + } else if (type->isPointerTy()) { + vec = getValue("pointerTy"); + } else if (type->isVectorTy()) { + vec = getValue("vectorTy"); + } else if (type->isEmptyTy()) { + vec = getValue("emptyTy"); + } else if (type->isLabelTy()) { + vec = getValue("labelTy"); + } else if (type->isTokenTy()) { + vec = getValue("tokenTy"); + } else if (type->isMetadataTy()) { + vec = getValue("metadataTy"); + } else { + vec = getValue("unknownTy"); + } + scaleVector(vec, WT); + std::transform(instVector.begin(), instVector.end(), vec.begin(), + instVector.begin(), std::plus()); + partialInstValMap[&I] = instVector; + + unsigned operandNum; + bool isMemWrite = isMemOp(opcodeName, operandNum, memWriteOps); + bool isCyclic = false; + Vector VecArgs(DIM, 0); + + SmallVector RDList; + RDList.clear(); + + for (unsigned i = 0; i < I.getNumOperands() /*&& !isCyclic*/; i++) { + Vector vecOp(DIM, 0); 
+ if (isa(I.getOperand(i))) { + vecOp = getValue("function"); + if (isa(I)) { + auto ci = dyn_cast(&I); + Function *func = ci->getCalledFunction(); + if (func) { + // if (!func->isDeclaration()) { + if (!func->isDeclaration() && std::find(funcStack.begin(), funcStack.end(), func) == + funcStack.end()) { + // Will be dealt with later + Vector tempCall(DIM, 0); + // vecOp = tempCall; + cout<<"NOT ABLE TO FIND FUNC SOMEHOW ?"<<"\n"; + vecOp = func2Vec(*func, funcStack, bpiMap[func]); + } + } + } + } + + // old code : + + else if (isa(I.getOperand(i)) && + !isa(I.getOperand(i)->getType())) { + // out << " constant "; + vec = getValue("constant"); + } else if (isa(I.getOperand(i))) { + // out << " label "; + vec = getValue("label"); + } else { + // out << " variable "; + if (isa(I.getOperand(i)->getType())) + vec = getValue("pointer"); + else + vec = getValue("variable"); + if (isa(I.getOperand(i))) { + auto RD = getReachingDefs(&I, i); + // For every RD, get its contribution to the final vector + if (!RD.empty()) { + vec = SmallVector(DIM, 0); + for (auto i : RD) { + // Check if value of RD is precomputed + if (instVecMap.find(i) == instVecMap.end()) { + if (partialInstValMap.find(i) == partialInstValMap.end()) { + partialInstValMap[i] = {}; + inst2Vec(*i, funcStack, partialInstValMap); + partialInstValMap.erase(i); + + if (std::find(instSolvedBySolver.begin(), + instSolvedBySolver.end(), + &I) != instSolvedBySolver.end()) + return; + + auto prob = getRDProb(i, &I, RD); + auto tmp = instVecMap[i]; + scaleVector(tmp, prob); + std::transform(tmp.begin(), tmp.end(), vec.begin(), vec.begin(), + std::plus()); + + } else { + isCyclic = true; + break; + } + } else { + auto prob = getRDProb(i, &I, RD); + auto tmp = instVecMap[i]; + scaleVector(tmp, prob); + std::transform(tmp.begin(), tmp.end(), vec.begin(), vec.begin(), + std::plus()); + } + } + } + // if(!isCyclic) + // vec = lookupOrInsertIntoMap(inst, vec); + } + } + + if (!isCyclic) { + // LLVM_DEBUG(dbgs() << vec[0]); + 
scaleVector(vec, WA); + // LLVM_DEBUG(dbgs() << vec.front()); + std::transform(instVector.begin(), instVector.end(), vec.begin(), + instVector.begin(), std::plus()); + // LLVM_DEBUG(dbgs() << instVector.front()); + + partialInstValMap[&I] = instVector; + } + } + + if (isCyclic) { + // LLVM_DEBUG(dbgs() << "XX------------Cyclic dependncy in the " + // "IRs---------------------XX \n"); + cyclicCounter++; + // There is a chance that all operands of an instruction has not been + // processed. In such a case for a cyclic dependencies, process all unseen + // operands now. + const auto tmp = partialInstValMap; + for (auto It : tmp) { + auto inst = It.first; + for (unsigned i = 0; i < inst->getNumOperands(); i++) { + if (isa(inst->getOperand(i)) || + isa(inst->getOperand(i)) || + isa(inst->getOperand(i))) + continue; + + else { + auto RD = getReachingDefs(inst, i); + for (auto i : RD) { + // Check if value of RD is precomputed + if (instVecMap.find(i) == instVecMap.end()) { + if (partialInstValMap.find(i) == partialInstValMap.end()) { + partialInstValMap[i] = {}; + inst2Vec(*i, funcStack, partialInstValMap); + partialInstValMap.erase(i); + + if (std::find(instSolvedBySolver.begin(), + instSolvedBySolver.end(), + &I) != instSolvedBySolver.end()) + return; + } + } + } + } + } + } + std::map xI; + std::map Ix; + std::vector> A, B; + /* SmallDenseMap> + RDValMap; */ + SmallMapVector, 16> + RDValMap; + unsigned pos = 0; + for (auto It : partialInstValMap) { + auto inst = It.first; + if (instVecMap.find(inst) == instVecMap.end()) { + Ix[inst] = pos; + xI[pos++] = inst; + std::vector tmp; + for (auto i : It.second) { + tmp.push_back((int)(i * 10) / 10.0); + // tmp.push_back(i); + } + B.push_back(tmp); + for (unsigned i = 0; i < inst->getNumOperands(); i++) { + if (isa(inst->getOperand(i))) { + // out << " function "; + auto f = getValue("function"); + if (isa(inst)) { + auto ci = dyn_cast(inst); + Function *func = ci->getCalledFunction(); + if (func) { + if 
(!func->isDeclaration() && + std::find(funcStack.begin(), funcStack.end(), func) == + funcStack.end()) { + // issues may be arising here ? + cout<<"SECOND TIME IN INST2VEC, SOMEHOW FUNC IS EMPTY"<<"\n"; + f = func2Vec(*func, funcStack, bpiMap[func]); + } + } + } + auto svtmp = f; + scaleVector(svtmp, WA); + std::vector vtmp(svtmp.begin(), svtmp.end()); + std::vector vec = B.back(); + // LLVM_DEBUG(dbgs() << vec.back() << "\n"); + // LLVM_DEBUG(dbgs() << vtmp.back() << "\n"); + B.pop_back(); + std::transform(vtmp.begin(), vtmp.end(), vec.begin(), vec.begin(), + std::plus()); + // LLVM_DEBUG(dbgs() << vec.back() << "\n"); + B.push_back(vec); + } else if (isa(inst->getOperand(i))) { + // out << " constant "; + auto c = getValue("constant"); + auto svtmp = c; + scaleVector(svtmp, WA); + std::vector vtmp(svtmp.begin(), svtmp.end()); + std::vector vec = B.back(); + // LLVM_DEBUG(dbgs() << vec.back() << "\n"); + // LLVM_DEBUG(dbgs() << vtmp.back() << "\n"); + B.pop_back(); + std::transform(vtmp.begin(), vtmp.end(), vec.begin(), vec.begin(), + std::plus()); + // LLVM_DEBUG(dbgs() << vec.back() << "\n"); + B.push_back(vec); + } else if (isa(inst->getOperand(i))) { + // out << " label "; + auto l = getValue("label"); + + auto svtmp = l; + scaleVector(svtmp, WA); + std::vector vtmp(svtmp.begin(), svtmp.end()); + std::vector vec = B.back(); + // LLVM_DEBUG(dbgs() << vec.back() << "\n"); + // LLVM_DEBUG(dbgs() << vtmp.back() << "\n"); + B.pop_back(); + std::transform(vtmp.begin(), vtmp.end(), vec.begin(), vec.begin(), + std::plus()); + // LLVM_DEBUG(dbgs() << vec.back() << "\n"); + B.push_back(vec); + } else { + auto RD = getReachingDefs(inst, i); + for (auto i : RD) { + // Check if value of RD is precomputed + if (instVecMap.find(i) == instVecMap.end()) { + if (partialInstValMap.find(i) == partialInstValMap.end()) { + llvm_unreachable("Should not reach"); + } + if (RDValMap.find(inst) == RDValMap.end()) { + // SmallDenseMap tmp; + SmallMapVector tmp; + tmp[i] = WA * 
getRDProb(i, inst, RD); + RDValMap[inst] = tmp; + } else { + RDValMap[inst][i] = WA * getRDProb(i, inst, RD); + } + } else { + auto prob = getRDProb(i, inst, RD); + auto svtmp = instVecMap[i]; + scaleVector(svtmp, prob * WA); + std::vector vtmp(svtmp.begin(), svtmp.end()); + std::vector vec = B.back(); + // LLVM_DEBUG(dbgs() << vec.back() << "\n"); + // LLVM_DEBUG(dbgs() << vtmp.back() << "\n"); + B.pop_back(); + std::transform(vtmp.begin(), vtmp.end(), vec.begin(), + vec.begin(), std::plus()); + // LLVM_DEBUG(dbgs() << vec.back() << "\n"); + B.push_back(vec); + } + } + } + } + } + } + + for (unsigned i = 0; i < xI.size(); i++) { + std::vector tmp(xI.size(), 0); + A.push_back(tmp); + } + + for (unsigned i = 0; i < xI.size(); i++) { + A[i][i] = 1; + auto tmp = A[i]; + auto instRDVal = RDValMap[xI[i]]; + for (auto j : instRDVal) { + // To-Do: If j.first not found in Ix? + A[i][Ix[j.first]] = (int)((A[i][Ix[j.first]] - j.second) * 10) / 10.0; + // A[i][Ix[j.first]] = A[i][Ix[j.first]] - j.second; + } + } + + for (unsigned i = 0; i < B.size(); i++) { + auto Bvec = B[i]; + for (unsigned j = 0; j < B[i].size(); j++) { + B[i][j] = (int)(B[i][j] * 10) / 10.0; + } + } + + auto C = solve(A, B); + // SmallDenseMap> + // bbInstMap; + SmallMapVector, 16> + bbInstMap; + for (unsigned i = 0; i < C.size(); i++) { + SmallVector tmp(C[i].begin(), C[i].end()); + // LLVM_DEBUG(dbgs() << "inst:" + // << "\t"; + // xI[i]->dump(); dbgs() << "VAL: " << tmp[0] << "\n"); + + instVecMap.try_emplace(xI[i], tmp); + // instVecMap.insert(std::make_pair(xI, std::move(tmp))); + livelinessMap.try_emplace(xI[i], true); + + instSolvedBySolver.push_back(xI[i]); + bbInstMap[xI[i]->getParent()].push_back(xI[i]); + } + + for (auto BB : bbInstMap) { + unsigned opnum; + auto orderedInstVec = BB.second; + // Sorting not needed? 
+ // sort(orderedInstVec.begin(), orderedInstVec.end()); + for (auto I : orderedInstVec) { + if (isMemOp(I->getOpcodeName(), opnum, memWriteOps) && + dyn_cast(I->getOperand(opnum))) { + // LLVM_DEBUG(dbgs() << I->getParent()->getParent()->getName() << "\n"); + // LLVM_DEBUG(I->dump()); + killAndUpdate(dyn_cast(I->getOperand(opnum)), + instVecMap[I]); + } + } + } + // LLVM_DEBUG(dbgs() << "\nYY------------Cyclic dependncy in the " + // "IRs---------------------YY\n"); + } + + else { + instVecMap.try_emplace(&I, instVector); + livelinessMap.try_emplace(&I, true); + + // kill and update + if (isMemWrite && dyn_cast(I.getOperand(operandNum))) { + // LLVM_DEBUG(I.dump()); + killAndUpdate(dyn_cast(I.getOperand(operandNum)), + instVector); + } + } + // Checking that the argument is not of pointer type because some + // non-numeric/alphabetic constants are also caught as pointer types + // else if (isa(I.getOperand(i)) && + // !isa(I.getOperand(i)->getType())) { + // vecOp = getValue("constant"); + // } else if (isa(I.getOperand(i))) { + // vecOp = getValue("label"); + // } else { + // if (isa(I.getOperand(i))) { + // // over here, a lot of stuff was happening previously + // auto RD = getReachingDefs(&I, i); + // // let's see how it goes + // if (!RD.empty()) { + // vecOp = SmallVector(DIM, 0); + // for (auto i : RD) { + // // Check if value of RD is precomputed + // if (instVecMap.find(i) == instVecMap.end()) { + // if (partialInstValMap.find(i) == partialInstValMap.end()) { + // partialInstValMap[i] = {}; + // inst2Vec(*i, funcStack, partialInstValMap); + // partialInstValMap.erase(i); + + // if (std::find(instSolvedBySolver.begin(), + // instSolvedBySolver.end(), + // &I) != instSolvedBySolver.end()) + // return; + + // auto prob = getRDProb(i, &I, RD); + // auto tmp = instVecMap[i]; + // scaleVector(tmp, prob); + // std::transform(tmp.begin(), tmp.end(), vecOp.begin(), vecOp.begin(), + // std::plus()); + + // } else { + // isCyclic = true; + // break; + // } + // } 
else { + // auto prob = getRDProb(i, &I, RD); + // auto tmp = instVecMap[i]; + // scaleVector(tmp, prob); + // std::transform(tmp.begin(), tmp.end(), vecOp.begin(), vecOp.begin(), + // std::plus()); + // } + // } + // } + + // RDList.insert(RDList.end(), RD.begin(), RD.end()); + + // } else if (isa(I.getOperand(i)->getType())) + // vecOp = getValue("pointer"); + // else + // vecOp = getValue("variable"); + // } + + // std::transform(VecArgs.begin(), VecArgs.end(), vecOp.begin(), + // VecArgs.begin(), std::plus()); + // // } // moving this bracket to keep the !isCyclic inside the loop body + + // Vector vecInst = Vector(DIM, 0); + + // if (!RDList.empty()) { + // for (auto i : RDList) { + // // changes might be needed over here + // // Check if value of RD is precomputed + // if (instVecMap.find(i) == instVecMap.end()) { + // assert(instVecMap.find(i) != instVecMap.end() && + // "All RDs should have been solved by Topo Order!"); + // } else { + // std::transform(instVecMap[i].begin(), instVecMap[i].end(), + // vecInst.begin(), vecInst.begin(), std::plus()); + // } + // } + // } + + // if (!isCyclic) { + // std::transform(VecArgs.begin(), VecArgs.end(), vecInst.begin(), + // VecArgs.begin(), std::plus()); + + // IR2VEC_DEBUG(outs() << VecArgs[0]); + + // scaleVector(VecArgs, WA); + // IR2VEC_DEBUG(outs() << VecArgs.front()); + // // std::transform(instVector.begin(), instVector.end(), VecArgs.begin(), + // // instVector.begin(), std::plus()); + // // making change here to make it similar to IR2vec-Rd + // std::transform(instVector.begin(), instVector.end(), vecOp.begin(), + // instVector.begin(), std::plus()); + // IR2VEC_DEBUG(outs() << instVector.front()); + // instVecMap[&I] = instVector; + // livelinessMap.try_emplace(&I, true); + + // if (killMap.find(&I) != killMap.end()) { + // auto list = killMap[&I]; + // for (auto defs : list) { + // auto It2 = livelinessMap.find(defs); + // if (It2 == livelinessMap.end()) + // livelinessMap.try_emplace(defs, false); + // 
else + // It2->second = false; + // } + // } + // } + // assert(isCyclic == false && "All dependencies should have been solved!"); + } + +/*---------------------------------------------------------------------------------- + Utility function : Traverses Reaching definitions + ---------------------------------------------------------------------------------- +*/ + +void IR2Vec_FA::traverseRD( + const llvm::Instruction *inst, + std::unordered_map &Visited, + llvm::SmallVector &timeStack) { + + auto RDit = instReachingDefsMap.find(inst); + + Visited[inst] = true; + + if (RDit != instReachingDefsMap.end()) { + + auto RD = RDit->second; + + for (auto defs : RD) { + if (Visited.find(defs) == Visited.end()) + traverseRD(defs, Visited, timeStack); + } + } + // All the children (RDs) of current node is done push to timeStack + timeStack.push_back(inst); +} + +void IR2Vec_FA::DFSUtil( + const llvm::Instruction *inst, + std::unordered_map &Visited, + llvm::SmallVector &set) { + + Visited[inst] = true; + auto RD = reverseReachingDefsMap[inst]; + + for (auto defs : RD) { + if (Visited.find(defs) == Visited.end()) { + set.push_back(defs); + DFSUtil(defs, Visited, set); + } + } +} + +/*---------------------------------------------------------------------------------- + Utility function : Creates and returns all SCCs + ---------------------------------------------------------------------------------- +*/ + +void IR2Vec_FA::getAllSCC() { + + std::unordered_map Visited; + + llvm::SmallVector timeStack; + + for (auto &I : instReachingDefsMap) { + if (Visited.find(I.first) == Visited.end()) { + traverseRD(I.first, Visited, timeStack); + } + } + + IR2VEC_DEBUG(for (auto &defs : timeStack) { outs() << defs << "\n"; }); + + Visited.clear(); + + // Second pass getting SCCs + while (timeStack.size() != 0) { + auto inst = timeStack.back(); + timeStack.pop_back(); + if (Visited.find(inst) == Visited.end()) { + llvm::SmallVector set; + set.push_back(inst); + DFSUtil(inst, Visited, set); + if 
(set.size() != 0) + allSCCs.push_back(set); + } + } +} + +void IR2Vec_FA::bb2Vec(BasicBlock &B, SmallVector &funcStack) { + SmallMapVector partialInstValMap; + + for (auto &I : B) { + + partialInstValMap[&I] = {}; + IR2VEC_DEBUG(outs() << "XX------------ Call from bb2vec function " + "Started---------------------XX\n"); + inst2Vec(I, funcStack, partialInstValMap); + IR2VEC_DEBUG(outs() << "YY------------Call from bb2vec function " + "Ended---------------------YY\n"); + partialInstValMap.erase(&I); + } +} + +// INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) + +// void IR2Vec_FA::getAnalysisUsage(AnalysisUsage &AU) const{ +// AU.addRequired(); +// AU.addRequired(); +// AU.addRequired(); +// AU.setPreservesAll(); +// } + +// extern "C" LLVM_ATTRIBUTE_WEAK ::llvm::PassPluginLibraryInfo llvmGetPassPluginInfo() { +// return { +// LLVM_PLUGIN_API_VERSION, "IR2Vec_FA", LLVM_VERSION_STRING, +// [](PassBuilder &PB) { +// PB.registerPipelineParsingCallback( +// [](StringRef Name, ModulePassManager &MPM, ArrayRef) { +// if (Name == "IR2Vec_FA") { +// // FPM.addPass(createFunctionToLoopPassAdaptor(LoopRotatePass(true,true))); +// MPM.addPass(IR2Vec_FA()); + +// return true; +// } +// return false; +// }); +// } + + +// }; +// } + +// extern "C" LLVM_ATTRIBUTE_WEAK ::llvm::PassPluginLibraryInfo llvmGetPassPluginInfo() { +// return { +// LLVM_PLUGIN_API_VERSION, "MyPassPlugin", LLVM_VERSION_STRING, +// [](PassBuilder &PB) { +// // Register your analysis pass +// PB.registerFunctionAnalyses([](FunctionAnalysisManager &FAM) { +// FAM.registerPass([] { return BranchProbabilityAnalysis(); }); +// }); + +// // If you have a new pass, register it like this +// // PB.registerPipelineParsingCallback( +// // [](StringRef Name, FunctionPassManager &FPM, ArrayRef) { +// // if (Name == "my-new-pass") { +// // FPM.addPass(MyNewPass()); +// // return true; +// // } +// // return false; +// // }); +// } +// }; +// } + + From 75dd8b64713bba487c416a39326d3e78e0d9a045 Mon Sep 17 
00:00:00 2001 From: iamaayushrivastava Date: Tue, 25 Feb 2025 16:26:20 +0530 Subject: [PATCH 3/3] Added models for hyperparameter tuning and inference --- .../hypertuning/mlp_model.py | 44 +++ .../hypertuning/model_tuner.py | 345 ++++++++++++++++++ .../models/histogram_model.py | 228 ++++++++++++ .../models/ir2vec_fa_model.py | 273 ++++++++++++++ .../models/ir2vec_static_model.py | 270 ++++++++++++++ .../models/ir2vec_sym_model.py | 267 ++++++++++++++ .../models/milepost_model.py | 311 ++++++++++++++++ 7 files changed, 1738 insertions(+) create mode 100644 hyperparameter-tuning/hypertuning/mlp_model.py create mode 100644 hyperparameter-tuning/hypertuning/model_tuner.py create mode 100644 hyperparameter-tuning/models/histogram_model.py create mode 100644 hyperparameter-tuning/models/ir2vec_fa_model.py create mode 100644 hyperparameter-tuning/models/ir2vec_static_model.py create mode 100644 hyperparameter-tuning/models/ir2vec_sym_model.py create mode 100644 hyperparameter-tuning/models/milepost_model.py diff --git a/hyperparameter-tuning/hypertuning/mlp_model.py b/hyperparameter-tuning/hypertuning/mlp_model.py new file mode 100644 index 0000000..309c4dc --- /dev/null +++ b/hyperparameter-tuning/hypertuning/mlp_model.py @@ -0,0 +1,44 @@ +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import DataLoader, TensorDataset, Dataset +import ray +from ray import tune +from ray.tune.schedulers import ASHAScheduler +from ray.tune.search.optuna import OptunaSearch +import pandas as pd +import logging +import numpy as np + + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S" +) +logger = logging.getLogger(__name__) + +class MLP(nn.Module): + def __init__(self, input_dim, num_classes, num_layers, units_per_layer, dropout, normalize_input, activation): + super(MLP, self).__init__() + + logger.info("Initializing MLP model...") + + layers = [] + for i in 
range(num_layers): + in_features = input_dim if i == 0 else units_per_layer[i - 1] + out_features = units_per_layer[i] + layers.append(nn.Linear(in_features, out_features)) + layers.append(nn.BatchNorm1d(out_features)) # Always use BatchNorm + layers.append(activation) + if dropout > 0: + layers.append(nn.Dropout(dropout)) + layers.append(nn.Linear(units_per_layer[-1], num_classes)) + self.net = nn.Sequential(*layers) + self.normalize_input = normalize_input + logger.info("MLP model initialized.") + + def forward(self, x): + if self.normalize_input: + x = nn.functional.normalize(x, p=2, dim=1) # L2 Normalization + return self.net(x) \ No newline at end of file diff --git a/hyperparameter-tuning/hypertuning/model_tuner.py b/hyperparameter-tuning/hypertuning/model_tuner.py new file mode 100644 index 0000000..5838856 --- /dev/null +++ b/hyperparameter-tuning/hypertuning/model_tuner.py @@ -0,0 +1,345 @@ +import ray.train +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import DataLoader, TensorDataset, Dataset +import ray +from ray import tune +from ray.tune.schedulers import ASHAScheduler +from ray.tune.search.optuna import OptunaSearch +import pandas as pd +import logging +import json +import os +import numpy as np +import random +import tempfile +from ray import train, tune +import sys +# sys.path.append("/home/intern24009/IR2Vec-Classification/tune-ir2vec/") +sys.path.append("/home/cs24mtech02001/Program-Classification/ir2vec-model-tuning/") +from mlp_model import MLP +from datetime import datetime + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S" +) +logger = logging.getLogger(__name__) + +class CSVDataset(Dataset): + def __init__(self, file_path): + print(f"Loading dataset from: {file_path}") + + try: + self.data = pd.read_csv(file_path, delimiter='\t', header=None) + # print(f"First 5 rows of the dataset:\n{self.data.head()}") + except 
Exception as e: + print(f"Error reading CSV: {e}") + return + + try: + self.labels = torch.tensor(self.data.iloc[:, 0].values, dtype=torch.long) + self.features = torch.tensor(self.data.iloc[:, 1:].values, dtype=torch.float32) + except Exception as e: + print(f"Error processing data: {e}") + return + + # print(f"Column data types:\n{self.data.dtypes}") + + if not pd.api.types.is_numeric_dtype(self.data.iloc[:, 0]): + print("Error: Non-numeric labels detected in the first column.") + return + + # Adjust labels to be 0-based (subtract 1 for 1-based labels) + self.labels = self.labels - 1 # Make labels 0-based + + print("Dataset loaded successfully.") + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.features[idx], self.labels[idx] + + +# Define the MLP model +# class MLP(nn.Module): +# def __init__(self, input_dim, num_classes, num_layers, units_per_layer, dropout, normalize_input, activation): +# super(MLP, self).__init__() + +# logger.info("Initializing MLP model...") + +# layers = [] +# for i in range(num_layers): +# in_features = input_dim if i == 0 else units_per_layer +# layers.append(nn.Linear(in_features, units_per_layer)) +# layers.append(nn.BatchNorm1d(units_per_layer)) # Always use BatchNorm +# layers.append(activation) +# if dropout > 0: +# layers.append(nn.Dropout(dropout)) +# layers.append(nn.Linear(units_per_layer, num_classes)) +# self.net = nn.Sequential(*layers) +# self.normalize_input = normalize_input +# logger.info("MLP model initialized.") + +# def forward(self, x): +# if self.normalize_input: +# x = nn.functional.normalize(x, p=2, dim=1) # L2 Normalization +# return self.net(x) + +# Training function +def train_model(config, checkpoint_dir=None): + # Simulated dataset (replace with your dataset) + logger.info(f"Trial Config: num_layers={config['num_layers']}, units_per_layer={config['units_per_layer']}") + + logger.info("Starting training process...") + input_dim = 300 # For IR2Vec, DIM=300 + 
num_classes = 342 + + train_dataset_path="/Pramana/IR2Vec/Codeforces-Profiled-Dataset/profile-aware-embeddings/O0/training.csv" + test_dataset_path="/Pramana/IR2Vec/Codeforces-Profiled-Dataset/profile-aware-embeddings/O0/testing.csv" + val_dataset_path="/Pramana/IR2Vec/Codeforces-Profiled-Dataset/profile-aware-embeddings/O0/val.csv" + + train_dataset = CSVDataset(train_dataset_path) + val_dataset = CSVDataset(val_dataset_path) + test_dataset = CSVDataset(test_dataset_path) + + train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True) + val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=False) + test_loader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=False) + + logger.info("Datasets and DataLoaders prepared for codeforces-ir2vec-fa-dynamic-O0-model, gpu cuda:0") + + # Initialize model + model = MLP( + input_dim=input_dim, + num_classes=num_classes, + num_layers=config["num_layers"], + units_per_layer=config["units_per_layer"], + dropout=config["dropout"], + normalize_input=config["normalize_input"], + activation=config["activation"] + ) + + device = "cuda" if torch.cuda.is_available() else "cpu" + # print(f"Using device: {device}") + logger.info("This is cuda:0") + + model.to(device) + # print(f"Model moved to {device}") + + # Define loss and optimizer + criterion = nn.CrossEntropyLoss() + optimizer = getattr(optim, config["optimizer"])( + model.parameters(), lr=config["lr"] + ) + + best_val_accuracy = 0.0 + + # Training loop + logger.info("Starting training loop...") + for epoch in range(config["epochs"]): + model.train() + running_loss = 0.0 + correct_train = 0 + total_train = 0 + + # Train the model + for batch in train_loader: + inputs, labels = batch + inputs, labels = inputs.to(device), labels.to(device) + + # logger.info(f"Labels range: min={labels.min()}, max={labels.max()}") + + optimizer.zero_grad() + outputs = model(inputs) + loss = criterion(outputs, labels) + 
loss.backward() + optimizer.step() + + running_loss += loss.item() + + # Calculate train accuracy + _, predicted = torch.max(outputs, 1) + total_train += labels.size(0) + correct_train += (predicted == labels).sum().item() + + train_loss = running_loss / len(train_loader) + train_accuracy = correct_train / total_train + + # Evaluate on validation data + model.eval() + running_val_loss = 0.0 + correct_val = 0 + total_val = 0 + + with torch.no_grad(): + for batch in val_loader: + inputs, labels = batch + inputs, labels = inputs.to(device), labels.to(device) + + outputs = model(inputs) + loss = criterion(outputs, labels) + running_val_loss += loss.item() + + # Calculate validation accuracy + _, predicted = torch.max(outputs, 1) + total_val += labels.size(0) + correct_val += (predicted == labels).sum().item() + + val_loss = running_val_loss / len(val_loader) + val_accuracy = correct_val / total_val + + logger.info(f"Epoch [{epoch+1}/{config['epochs']}]: Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, " + f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}") + + # if val_accuracy>best_val_accuracy: + # best_val_accuracy = val_accuracy + with tune.checkpoint_dir(step=epoch) as checkpoint_dir: + model_path = os.path.join(checkpoint_dir, "model_checkpoint.model") + torch.save(model, model_path) + print(f"Model checkpoint saved at {model_path}") + + tune.report(train_loss=train_loss, val_loss=val_loss, train_accuracy=train_accuracy, val_accuracy=val_accuracy) + +def custom_serializer(obj): + if isinstance(obj, torch.Tensor): + return obj.tolist() + return str(obj) + +# Main function to run Ray Tune +def main(): + input_dim = 300 # Example input dimension + num_classes = 342 # Example number of classes # POJ-104 + epochs = 2000 + # # Hyperparameter search space + # config = { + # "input_dim": input_dim, + # "num_classes": num_classes, + # "num_layers": tune.randint(1, 5), + # "units_per_layer": tune.choice([64, 128, 256, 512]), + # "dropout": 
tune.uniform(0.0, 0.2), + # "normalize_input": tune.choice([True, False]), + # "activation": tune.choice([nn.ReLU(), nn.LeakyReLU(), nn.Tanh(), nn.SiLU()]), + # "optimizer": tune.choice(["Adam", "SGD"]), + # "lr": tune.loguniform(1e-4, 1e-1), + # "batch_size": tune.choice([16, 32, 64, 128, 256, 512, 1024]), + # "epochs": 5000, + # } + + config = { + "input_dim": input_dim, + "num_classes": num_classes, + "num_layers": tune.randint(3, 8), + # "units_per_layer": tune.choice([64, 128, 256, 512]), + # "units_per_layer": tune.sample_from(lambda spec : np.random.randint(64, high=2048, size=spec.config.num_layers)), + "units_per_layer": tune.sample_from(lambda spec: [ random.choice([64, 128, 256, 512]) for _ in range(spec.config["num_layers"])]), + # "dropout": tune.sample_from(lambda spec : np.random.uniform(0, high=0.3, size=spec.config.num_layers)), + # "units_per_layer": tune.sample_from(lambda spec: generate_units_per_layer({"num_layers": spec.config["num_layers"]})), + # "units_per_layer": tune.sample_from(lambda spec: [random.choice([64, 128, 256, 512]) for _ in range(4)]), + "dropout": tune.uniform(0.0, 0.3), + "normalize_input": tune.choice([True, False]), + "activation": tune.choice([nn.ReLU(), nn.LeakyReLU(), nn.Tanh(), nn.SiLU()]), + "optimizer": tune.choice(["Adam"]), #tune.choice(["Adam", "SGD"]), + "lr": tune.loguniform(1e-4, 1e-2), + "batch_size": tune.choice([32, 64, 128, 256, 512, 1024]), + "epochs": epochs, + } + + # Define scheduler and search algorithm + scheduler = ASHAScheduler( + # metric="val_accuracy", # Use validation loss for early stopping + # mode="max", + max_t=epochs, + grace_period=25, + reduction_factor=2 + ) + + # search_alg = OptunaSearch(metric="val_accuracy", mode="max") + + # # Run Ray Tune + # ray.init() + # analysis = tune.run( + # train_model, + # config=config, + # metric="val_accuracy", + # mode="max", + # scheduler=scheduler, + # search_alg=search_alg, + # num_samples=1000, + # max_concurrent_trials=4, + # 
resources_per_trial={"cpu": 10, "gpu": 0.25} + # ) + ray.init(_temp_dir="/Pramana/IR2Vec/ir2vec_tuned_models") + analysis = tune.run( + train_model, + config=config, + metric="val_accuracy", + mode="max", + keep_checkpoints_num=5, + # checkpoint_score_attr="val_accuracy", + scheduler=scheduler, + # search_alg=search_alg, + num_samples=1000, + max_concurrent_trials=4, + resources_per_trial={"cpu": 10, "gpu": 0.125}, + local_dir="/Pramana/IR2Vec/ir2vec_tuned_models/tmp/ray_results" + ) + + best_trial = analysis.get_best_trial(metric="val_accuracy", mode="max", scope="all") + best_checkpoint = analysis.get_best_checkpoint(best_trial, metric="val_accuracy", mode="max") + print(f"Best checkpoint saved at: {best_checkpoint}") + + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + + # Print the best result + # logger.info("Best hyperparameters found were:") + # logger.info(analysis.best_config) + + best_config = analysis.best_config + logger.info("Best hyperparameters found were:") + logger.info(best_config) + + best_trial = analysis.get_best_trial(metric="val_accuracy", mode="max", scope="all") + best_results = best_trial.last_result + logger.info(f"Best results: {best_results}") + + results = { + "best_config": best_config, + "best_results": best_results, + "input_csv_paths": { + "train": "/Pramana/IR2Vec/Codeforces-Profiled-Dataset/profile-aware-embeddings/O0/training.csv", + "val": "/Pramana/IR2Vec/Codeforces-Profiled-Dataset/profile-aware-embeddings/O0/val.csv", + "test": "/Pramana/IR2Vec/Codeforces-Profiled-Dataset/profile-aware-embeddings/O0/testing.csv", + }, + } + trials_data = [] + for trial in analysis.trials: + trial_data = trial.config + trial_data.update(trial.last_result) + trials_data.append(trial_data) + + trials_df = pd.DataFrame(trials_data) + + trials_table_path = os.path.join("results", f"{timestamp}_ir2vec_O0_dynamic_codeforces_hyperparameter_tuning_results_sample_1000_epoch_2000.csv") + os.makedirs("results", exist_ok=True) + + 
trials_df.to_csv(trials_table_path, index=False) + + results["all_trials"] = trials_data + + output_dir = "results" + os.makedirs(output_dir, exist_ok=True) + + # Save the results to a JSON file + result_file_path = os.path.join(output_dir, f"{timestamp}_ir2vec_O0_dynamic_codeforces_tune_results_sample_1000_epoch_2000.json") + with open(result_file_path, "w") as f: + json.dump(results, f, indent=4, default=custom_serializer) + + logger.info(f"Results saved to {result_file_path}") + logger.info(f"Trials table saved to {trials_table_path}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/hyperparameter-tuning/models/histogram_model.py b/hyperparameter-tuning/models/histogram_model.py new file mode 100644 index 0000000..458cfd4 --- /dev/null +++ b/hyperparameter-tuning/models/histogram_model.py @@ -0,0 +1,228 @@ +import os +import numpy as np +from sklearn.model_selection import train_test_split +from sklearn.metrics import classification_report, confusion_matrix, accuracy_score + +# Import TensorFlow Keras +from tensorflow import keras +from tensorflow.keras import optimizers +from tensorflow.keras.utils import to_categorical +from tensorflow.keras.layers import (Activation, Dense, Dropout, BatchNormalization) +from tensorflow.keras.models import Sequential +from tensorflow.keras.activations import swish as SiLU +from tensorflow.keras.activations import relu +from tensorflow.keras.models import load_model +import argparse +import pickle + +# Model definition + +# config': {'input_dim': 65, 'num_classes': 98, 'num_layers': 4, 'units_per_layer': [512, 128, 512, 256], 'dropout': 0.27774903408254686, 'normalize_input': False, 'activation': SiLU(), 'optimizer': 'Adam', 'lr': 0.0009554640387111394, 'batch_size': 1024, 'epochs': 2000} + +# Histogram-O0 +# def getModel(input_dim, output_dim): +# model = Sequential() + +# model.add(Dense(512, input_shape=(input_dim,), kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# # 
model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.27774903408254686)) + +# model.add(Dense(128, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# # model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.27774903408254686)) + +# model.add(Dense(512, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# # model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.27774903408254686)) + +# model.add(Dense(256, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# # model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.27774903408254686)) + +# model.add(Dense(output_dim, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation('softmax')) + +# opt = keras.optimizers.Adam(learning_rate=0.0009554640387111394) +# model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy']) +# model.summary() + +# return model + +# config': {'input_dim': 65, 'num_classes': 98, 'num_layers': 4, 'units_per_layer': [512, 512, 256, 256], 'dropout': 0.22272317313484666, 'normalize_input': False, 'activation': ReLU(), 'optimizer': 'Adam', 'lr': 0.0004475656736901494, 'batch_size': 128, 'epochs': 2000} + +# Histogram-O3 +def getModel(input_dim, output_dim): + model = Sequential() + + model.add(Dense(512, input_shape=(input_dim,), kernel_initializer=keras.initializers.glorot_normal(seed=None))) + # model.add(BatchNormalization()) + model.add(Activation(relu)) + model.add(Dropout(0.22272317313484666)) + + model.add(Dense(512, kernel_initializer=keras.initializers.glorot_normal(seed=None))) + # model.add(BatchNormalization()) + model.add(Activation(relu)) + model.add(Dropout(0.22272317313484666)) + + model.add(Dense(256, kernel_initializer=keras.initializers.glorot_normal(seed=None))) + # model.add(BatchNormalization()) + 
model.add(Activation(relu)) + model.add(Dropout(0.22272317313484666)) + + model.add(Dense(256, kernel_initializer=keras.initializers.glorot_normal(seed=None))) + # model.add(BatchNormalization()) + model.add(Activation(relu)) + model.add(Dropout(0.22272317313484666)) + + model.add(Dense(output_dim, kernel_initializer=keras.initializers.glorot_normal(seed=None))) + model.add(BatchNormalization()) + model.add(Activation('softmax')) + + opt = keras.optimizers.Adam(learning_rate=0.0004475656736901494) + model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy']) + model.summary() + + return model + +# Load data from directory +def load_data_from_directory(directory): + data = [] + labels = [] + classes = sorted(os.listdir(directory)) # Ensure consistent label mapping + class_to_label = {cls: idx for idx, cls in enumerate(classes)} + + for cls in classes: + class_path = os.path.join(directory, cls) + if os.path.isdir(class_path): + for file_name in os.listdir(class_path): + if file_name.endswith(".npz"): + file_path = os.path.join(class_path, file_name) + try: + loaded = np.load(file_path)["values"] + data.append(loaded.flatten()) + labels.append(class_to_label[cls]) + except Exception as e: + print(f"Failed to load {file_path}: {e}") + + return np.array(data), np.array(labels) + +# Prepare train and test data +# def prepare_data(train_dir, test_dir): +# X_train, y_train = load_data_from_directory(train_dir) +# X_test, y_test = load_data_from_directory(test_dir) + +# return X_train, y_train, X_test, y_test + +def prepare_data(train_dir, test_dir, val_dir=None): + X_train, y_train = load_data_from_directory(train_dir) + X_test, y_test = load_data_from_directory(test_dir) + X_val, y_val = None, None + + if val_dir: + X_val, y_val = load_data_from_directory(val_dir) + + return X_train, y_train, X_test, y_test, X_val, y_val + +# Main function +def main(): + # Paths to the train and test directories + train_dir = 
"/Pramana/IR2Vec/Yali-Embeddings/histogram/O3/codeforces/train/codeforcestrainO3" + test_dir = "/Pramana/IR2Vec/Yali-Embeddings/histogram/O3/codeforces/test/codeforcestestO3" + val_dir="/Pramana/IR2Vec/Yali-Embeddings/histogram/O3/codeforces/val/codeforcesvalO3" + + # train_dir = "/Pramana/IR2Vec/Yali-Embeddings/histogram/O0/codeforces/train/codeforcestrainO0" + # test_dir = "/Pramana/IR2Vec/Yali-Embeddings/histogram/O0/codeforces/test/codeforcestestO0" + # val_dir="/Pramana/IR2Vec/Yali-Embeddings/histogram/O0/codeforces/val/codeforcesvalO0" + + # train_dir = "/Pramana/IR2Vec/Yali-Embeddings/histogram/O3/codejam/codejamtrainO3" + # test_dir = "/Pramana/IR2Vec/Yali-Embeddings/histogram/O3/codejam/codejamtestO3" + # val_dir="/Pramana/IR2Vec/Yali-Embeddings/histogram/O3/codejam/codejamvalO3" + + # # Prepare data + # X_train, y_train, X_test, y_test = prepare_data(train_dir, test_dir) + + # # Check data shapes + # print(f"Training data shape: {X_train.shape}") + # print(f"Training labels shape: {y_train.shape}") + # print(f"Testing data shape: {X_test.shape}") + # print(f"Testing labels shape: {y_test.shape}") + + # # One-hot encode labels + # num_classes = len(np.unique(y_train)) + # y_train = to_categorical(y_train, num_classes) + # y_test = to_categorical(y_test, num_classes) + + # # No train-test split for validation, using all X_train and y_train for training + # model = getModel(X_train.shape[1], num_classes) + + # Prepare data + X_train, y_train, X_test, y_test, X_val, y_val = prepare_data(train_dir, test_dir, val_dir) + + # Check data shapes + print(f"Training data shape: {X_train.shape}") + print(f"Training labels shape: {y_train.shape}") + print(f"Testing data shape: {X_test.shape}") + print(f"Testing labels shape: {y_test.shape}") + if X_val is not None and y_val is not None: + print(f"Validation data shape: {X_val.shape}") + print(f"Validation labels shape: {y_val.shape}") + + # One-hot encode labels + num_classes = len(np.unique(y_train)) + y_train = 
to_categorical(y_train, num_classes) + y_test = to_categorical(y_test, num_classes) + if X_val is not None and y_val is not None: + y_val = to_categorical(y_val, num_classes) + + # No train-test split for validation, using X_val and y_val for validation + model = getModel(X_train.shape[1], num_classes) + + # mc = keras.callbacks.ModelCheckpoint( + # filepath='/home/cs24mtech02001/IR2Vec-Classification/weights/milepost-O0/codeforces/weights_epoch_{epoch:08d}.weights.keras', + # save_weights_only=True, + # save_freq=500 + # ) + + # mc = keras.callbacks.ModelCheckpoint( + # filepath='/home/cs24mtech02001/Program-Classification/ir2vec-model-weights-O3/codeforces/histogram/weights_epoch_{epoch:08d}.weights.h5', + # save_weights_only=True, + # save_freq=500) + + mc = keras.callbacks.ModelCheckpoint( + filepath="/home/cs24mtech02001/IR2Vec-Classification/weights/histogram-O3/codeforces/weights_epoch_{epoch:08d}.weights.h5", + save_weights_only=True, + save_best_only=True, + monitor="val_loss", + mode="min" + ) + + # Train the model with validation data + model.fit( + X_train, + y_train, + validation_data=(X_val, y_val) if X_val is not None and y_val is not None else None, + batch_size=128, + epochs=2000, + verbose=1, + callbacks=[mc] + ) + + # Evaluate model + y_pred = np.argmax(model.predict(X_test), axis=1) + y_true = np.argmax(y_test, axis=1) + print(f"Accuracy: {accuracy_score(y_true, y_pred):.13f}") + + # Save the trained model + model.save("codeforces-O3-histogram-ir2vec-hypertuned-model.h5") + print("Saved model to disk as 'Kodanda-codeforces-O3-histogram-ir2vec-hypertuned-model.keras'.") + + return model + +# Execute the script +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/hyperparameter-tuning/models/ir2vec_fa_model.py b/hyperparameter-tuning/models/ir2vec_fa_model.py new file mode 100644 index 0000000..5982b39 --- /dev/null +++ b/hyperparameter-tuning/models/ir2vec_fa_model.py @@ -0,0 +1,273 @@ +import numpy as np +import pandas 
as pd +from sklearn.decomposition import IncrementalPCA +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split + +# Import TensorFlow Keras +from tensorflow import keras +from tensorflow.keras import optimizers +from tensorflow.keras.layers import (Activation, Dense, Dropout, BatchNormalization) +from tensorflow.keras.activations import swish as SiLU +from tensorflow.keras.activations import tanh as Tanh +from tensorflow.keras.models import Sequential +from tensorflow.keras.models import load_model +import argparse +import pickle + +# Flowaware - O0 + +# {'input_dim': 300, 'num_classes': 98, 'num_layers': 5, 'units_per_layer': [512, 128, 512, 128, 512], 'dropout': 0.28877129358258796, 'normalize_input': True, 'activation': SiLU(), 'optimizer': 'Adam', 'lr': 0.00014769411188154336, 'batch_size': 32, 'epochs': 5000} + + +# def getModel(input_dim, output_dim): +# model = Sequential() + +# # Input Layer +# model.add(Dense(512, input_shape=(input_dim,), kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.28877129358258796)) + +# # Hidden Layer 2 +# model.add(Dense(128, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.28877129358258796)) + +# # Hidden Layer 3 +# model.add(Dense(512, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.28877129358258796)) + +# # Hidden Layer 4 +# model.add(Dense(128, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.28877129358258796)) + +# # Hidden Layer 5 +# model.add(Dense(512, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# 
# Flowaware - O3
# Tuned configuration this script reproduces:
# {'input_dim': 300, 'num_classes': 98, 'num_layers': 3, 'units_per_layer': [128, 256, 512], 'dropout': 0.21644468951221385, 'normalize_input': True, 'activation': Tanh(), 'optimizer': 'Adam', 'lr': 0.0001302138918461736, 'batch_size': 64, 'epochs': 5000}


def _scale_features(x, x_min, x_max):
    """Min-max scale *x* with training-set statistics, tolerating constant columns."""
    lo = np.asarray(x_min, dtype=float)
    span = np.asarray(x_max, dtype=float) - lo
    # A constant column has max == min; dividing by 0 would yield NaN and
    # make IncrementalPCA reject the whole matrix.
    span = np.where(span == 0, 1.0, span)
    return (np.asarray(x, dtype=float) - lo) / span


def _encode_labels(y, num_classes):
    """Shift 1-based labels to 0-based and one-hot encode them."""
    shifted = np.asarray(y) - 1
    # Negative indices would silently wrap to the last class inside
    # to_categorical, so reject out-of-range labels explicitly.
    if shifted.size and (shifted.min() < 0 or shifted.max() >= num_classes):
        raise ValueError(
            f"labels must lie in 1..{num_classes}; "
            f"got range {shifted.min() + 1}..{shifted.max() + 1}")
    return keras.utils.to_categorical(shifted, num_classes)


def _load_split(csv_path, title, tag):
    """Read a tab-separated split: column 0 is the label, the rest are features."""
    frame = pd.read_csv(csv_path, sep='\t', header=None)
    labels = frame.loc[:, 0]
    features = frame.loc[:, 1:]
    features.columns = range(features.shape[1])

    print(f"{title}:")
    print(f"X_{tag} shape: {features.shape}")
    print(f"y_{tag} unique counts: \n{labels.value_counts()}")
    return features, labels


def getModel(input_dim, output_dim):
    """Build and compile the tuned flow-aware (O3) MLP classifier.

    Args:
        input_dim: number of input features (after PCA).
        output_dim: number of target classes.

    Returns:
        A compiled ``Sequential`` model (Adam, categorical cross-entropy).
    """
    hidden_units = (128, 256, 512)
    dropout_rate = 0.21644468951221385

    model = Sequential()
    for depth, units in enumerate(hidden_units):
        # Only the first Dense layer declares the input shape.
        extra = {'input_shape': (input_dim,)} if depth == 0 else {}
        model.add(Dense(units, kernel_initializer=keras.initializers.glorot_normal(seed=None), **extra))
        model.add(BatchNormalization())
        model.add(Activation(Tanh))
        model.add(Dropout(dropout_rate))

    # Output layer
    model.add(Dense(output_dim, kernel_initializer=keras.initializers.glorot_normal(seed=None)))
    model.add(BatchNormalization())
    model.add(Activation('softmax'))

    opt = keras.optimizers.Adam(learning_rate=0.0001302138918461736)
    model.compile(loss=keras.losses.categorical_crossentropy, optimizer=opt, metrics=['accuracy'])

    model.summary()
    return model


# train the model on the given data
def train(x_train, y_train, x_test, y_test, x_val, y_val, epochs, batch_size, model):
    """Fit the classifier and persist the model plus its preprocessing state.

    Features are min-max scaled with training statistics and projected with
    IncrementalPCA; labels are expected to be 1-based integers.

    Args:
        x_train, y_train: training features (DataFrame) and labels.
        x_test, y_test: optional test split, evaluated after training.
        x_val, y_val: optional validation split passed to ``fit``.
        epochs, batch_size: forwarded to ``model.fit``.
        model: existing Keras model to continue training, or None to build one.

    Raises:
        ValueError: if any label falls outside ``1..num_classes``.
    """
    X_min = x_train.min()
    X_max = x_train.max()
    num_classes = np.unique(y_train).shape[0]
    print(f" Number of classes: {num_classes}")

    x_train = _scale_features(x_train, X_min, X_max)
    shifted = np.array(y_train) - 1
    print(f"\nAfter subtracting 1 from labels: {shifted}")
    print(f"\nUnique labels after subtracting 1: {np.unique(shifted).shape[0]}")
    y_train = _encode_labels(y_train, num_classes)

    # PCA transformation; n_components may not exceed the number of
    # features or samples, so cap the tuned value of 300.
    n_components = min(300, x_train.shape[1], x_train.shape[0])
    ipca = IncrementalPCA(n_components=n_components)
    ipca.fit(x_train)
    x_train = ipca.transform(x_train)

    # Persist preprocessing state before training so that every checkpoint
    # written during fit() has a matching dictionary.pkl, even if the run
    # is interrupted.
    with open('dictionary.pkl', 'wb') as f:
        pickle.dump(num_classes, f)
        pickle.dump(X_min, f)
        pickle.dump(X_max, f)
        pickle.dump(ipca, f)

    val_tuple = None
    if x_val is not None:
        x_val = ipca.transform(_scale_features(x_val, X_min, X_max))
        val_tuple = (x_val, _encode_labels(y_val, num_classes))

    mc = keras.callbacks.ModelCheckpoint(filepath='/home/cs24mtech02001/IR2Vec-Classification/weights/fa-O3/codejam-O3/weights{epoch:08d}.keras', save_weights_only=False, save_freq='epoch')

    if model is None:
        model = getModel(x_train.shape[1], num_classes)

    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              validation_data=val_tuple, callbacks=[mc])

    model_path = "codejam-O3-fa-hypertuned-ir2vec-model.keras"
    model.save(model_path)
    # Report the path that was actually written.
    print(f"Saved model to disk --> {model_path}")

    if x_test is not None:
        x_test = ipca.transform(_scale_features(x_test, X_min, X_max))
        y_test = _encode_labels(y_test, num_classes)
        score = model.evaluate(x_test, y_test, verbose=0)
        print('Test Accuracy (After Training) : {acc:.13f}%'.format(acc=score[1]*100))


# test the learnt model on the data
def test(X, targetLabel, model):
    """Evaluate *model* on (X, targetLabel) using the state saved by ``train``.

    Reads ``dictionary.pkl`` from the working directory.
    """
    # NOTE: pickle.load can execute arbitrary code; only load a
    # dictionary.pkl that was produced by train() on a trusted machine.
    with open('dictionary.pkl', 'rb') as f:
        num_classes = pickle.load(f)
        X_min = pickle.load(f)
        X_max = pickle.load(f)
        ipca = pickle.load(f)

    X = ipca.transform(_scale_features(X, X_min, X_max))
    targetLabel = _encode_labels(targetLabel, num_classes)

    score = model.evaluate(X, targetLabel, verbose=0)
    print('Test accuracy : {acc:.13f}%'.format(acc=score[1]*100))


# Entry Point of the program
if __name__ == '__main__':

    train_file = '/Pramana/IR2Vec/O3/A-ir2vec-17.x/codejam/embeddings/fa/training.csv'
    test_file = '/Pramana/IR2Vec/O3/A-ir2vec-17.x/codejam/embeddings/fa/testing.csv'
    val_file = '/Pramana/IR2Vec/O3/A-ir2vec-17.x/codejam/embeddings/fa/val.csv'

    epochs = 2000
    batch_size = 64

    model = None  # No pre-trained model is being loaded

    if test_file is None and train_file is None:
        raise SystemExit("Enter training or testing data")

    X_test = None
    y_test = None
    if test_file is not None:
        X_test, y_test = _load_split(test_file, "Test Set", "test")

    if train_file is not None:
        X, Y = _load_split(train_file, "Train Set", "train")

        X_val = None
        y_val = None
        if val_file is not None:
            X_val, y_val = _load_split(val_file, "Validation Set", "val")

        train(X, Y, X_test, y_test, X_val, y_val, epochs, batch_size, model)

    elif test_file is not None:
        # A trained/learnt model is required for the testing phase.
        if model is None:
            raise SystemExit('***********************Model is not passed in the testing**************')

        test(X_test, y_test, model)
b/hyperparameter-tuning/models/ir2vec_static_model.py new file mode 100644 index 0000000..b64db35 --- /dev/null +++ b/hyperparameter-tuning/models/ir2vec_static_model.py @@ -0,0 +1,270 @@ +import numpy as np +import pandas as pd +from sklearn.decomposition import IncrementalPCA +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split + +# Import TensorFlow Keras +from tensorflow import keras +from tensorflow.keras import optimizers +from tensorflow.keras.layers import (Activation, Dense, Dropout, BatchNormalization) +from tensorflow.keras.activations import swish as SiLU +from tensorflow.keras.models import Sequential +from tensorflow.keras.models import load_model +import argparse +import pickle + +# Static - O0 +# def getModel(input_dim, output_dim): +# model = Sequential() + +# # Input Layer +# model.add(Dense(128, input_shape=(input_dim,), kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.14414245564202546)) + +# # Hidden Layer 2 +# model.add(Dense(256, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.14414245564202546)) + +# # Hidden Layer 3 +# model.add(Dense(128, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.14414245564202546)) + +# # Hidden Layer 4 +# model.add(Dense(128, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.14414245564202546)) + +# # Hidden Layer 5 +# model.add(Dense(256, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.14414245564202546)) + +# # Output Layer +# 
# Static - O3
# Tuned configuration this script reproduces:
# {'input_dim': 300, 'num_classes': 98, 'num_layers': 3, 'units_per_layer': [512, 128, 512], 'dropout': 0.26164844577753404, 'normalize_input': True, 'activation': SiLU(), 'optimizer': 'Adam', 'lr': 0.00021151571457278203, 'batch_size': 1024, 'epochs': 2000}


def _scale_features(x, x_min, x_max):
    """Min-max scale *x* with training-set statistics, tolerating constant columns."""
    lo = np.asarray(x_min, dtype=float)
    span = np.asarray(x_max, dtype=float) - lo
    # A constant column has max == min; dividing by 0 would yield NaN and
    # make IncrementalPCA reject the whole matrix.
    span = np.where(span == 0, 1.0, span)
    return (np.asarray(x, dtype=float) - lo) / span


def _encode_labels(y, num_classes):
    """Shift 1-based labels to 0-based and one-hot encode them."""
    shifted = np.asarray(y) - 1
    # Negative indices would silently wrap to the last class inside
    # to_categorical, so reject out-of-range labels explicitly.
    if shifted.size and (shifted.min() < 0 or shifted.max() >= num_classes):
        raise ValueError(
            f"labels must lie in 1..{num_classes}; "
            f"got range {shifted.min() + 1}..{shifted.max() + 1}")
    return keras.utils.to_categorical(shifted, num_classes)


def _load_split(csv_path, title, tag):
    """Read a tab-separated split: column 0 is the label, the rest are features."""
    frame = pd.read_csv(csv_path, sep='\t', header=None)
    labels = frame.loc[:, 0]
    features = frame.loc[:, 1:]
    features.columns = range(features.shape[1])

    print(f"{title}:")
    print(f"X_{tag} shape: {features.shape}")
    print(f"y_{tag} unique counts: \n{labels.value_counts()}")
    return features, labels


def getModel(input_dim, output_dim):
    """Build and compile the tuned static (O3) MLP classifier.

    Args:
        input_dim: number of input features (after PCA).
        output_dim: number of target classes.

    Returns:
        A compiled ``Sequential`` model (Adam, categorical cross-entropy).
    """
    hidden_units = (512, 128, 512)
    dropout_rate = 0.26164844577753404

    model = Sequential()
    for depth, units in enumerate(hidden_units):
        # Only the first Dense layer declares the input shape.
        extra = {'input_shape': (input_dim,)} if depth == 0 else {}
        model.add(Dense(units, kernel_initializer=keras.initializers.glorot_normal(seed=None), **extra))
        model.add(BatchNormalization())
        model.add(Activation(SiLU))
        model.add(Dropout(dropout_rate))

    # Output layer
    model.add(Dense(output_dim, kernel_initializer=keras.initializers.glorot_normal(seed=None)))
    model.add(BatchNormalization())
    model.add(Activation('softmax'))

    opt = keras.optimizers.Adam(learning_rate=0.00021151571457278203)
    model.compile(loss=keras.losses.categorical_crossentropy, optimizer=opt, metrics=['accuracy'])

    model.summary()
    return model


# train the model on the given data
def train(x_train, y_train, x_test, y_test, x_val, y_val, epochs, batch_size, model):
    """Fit the classifier and persist the model plus its preprocessing state.

    Features are min-max scaled with training statistics and projected with
    IncrementalPCA; labels are expected to be 1-based integers.

    Args:
        x_train, y_train: training features (DataFrame) and labels.
        x_test, y_test: optional test split, evaluated after training.
        x_val, y_val: optional validation split passed to ``fit``.
        epochs, batch_size: forwarded to ``model.fit``.
        model: existing Keras model to continue training, or None to build one.

    Raises:
        ValueError: if any label falls outside ``1..num_classes``.
    """
    X_min = x_train.min()
    X_max = x_train.max()
    num_classes = np.unique(y_train).shape[0]
    print(f" Number of classes: {num_classes}")

    x_train = _scale_features(x_train, X_min, X_max)
    shifted = np.array(y_train) - 1
    print(f"\nAfter subtracting 1 from labels: {shifted}")
    print(f"\nUnique labels after subtracting 1: {np.unique(shifted).shape[0]}")
    y_train = _encode_labels(y_train, num_classes)

    # PCA transformation; n_components may not exceed the number of
    # features or samples, so cap the tuned value of 300.
    n_components = min(300, x_train.shape[1], x_train.shape[0])
    ipca = IncrementalPCA(n_components=n_components)
    ipca.fit(x_train)
    x_train = ipca.transform(x_train)

    # Persist preprocessing state before training so that every checkpoint
    # written during fit() has a matching dictionary.pkl, even if the run
    # is interrupted.
    with open('dictionary.pkl', 'wb') as f:
        pickle.dump(num_classes, f)
        pickle.dump(X_min, f)
        pickle.dump(X_max, f)
        pickle.dump(ipca, f)

    val_tuple = None
    if x_val is not None:
        x_val = ipca.transform(_scale_features(x_val, X_min, X_max))
        val_tuple = (x_val, _encode_labels(y_val, num_classes))

    mc = keras.callbacks.ModelCheckpoint(filepath='/home/cs24mtech02001/IR2Vec-Classification/weights/static-O3/codejam-O3/weights{epoch:08d}.keras', save_weights_only=False, save_freq='epoch')

    if model is None:
        model = getModel(x_train.shape[1], num_classes)

    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              validation_data=val_tuple, callbacks=[mc])

    model.save("/home/cs24mtech02001/IR2Vec-Classification/codejam-O3-fa-static-hypertuned-ir2vec-model.h5")
    print("Saved model to disk")

    if x_test is not None:
        x_test = ipca.transform(_scale_features(x_test, X_min, X_max))
        y_test = _encode_labels(y_test, num_classes)
        score = model.evaluate(x_test, y_test, verbose=0)
        print('Test Accuracy (After Training) : {acc:.13f}%'.format(acc=score[1]*100))


# test the learnt model on the data
def test(X, targetLabel, model):
    """Evaluate *model* on (X, targetLabel) using the state saved by ``train``.

    Reads ``dictionary.pkl`` from the working directory.
    """
    # NOTE: pickle.load can execute arbitrary code; only load a
    # dictionary.pkl that was produced by train() on a trusted machine.
    with open('dictionary.pkl', 'rb') as f:
        num_classes = pickle.load(f)
        X_min = pickle.load(f)
        X_max = pickle.load(f)
        ipca = pickle.load(f)

    X = ipca.transform(_scale_features(X, X_min, X_max))
    targetLabel = _encode_labels(targetLabel, num_classes)

    score = model.evaluate(X, targetLabel, verbose=0)
    print('Test accuracy : {acc:.13f}%'.format(acc=score[1]*100))


# Entry Point of the program
if __name__ == '__main__':

    train_file = '/Pramana/IR2Vec/IR2Vec-Embeddings/O3/Codejam/csv/training.csv'
    test_file = '/Pramana/IR2Vec/IR2Vec-Embeddings/O3/Codejam/csv/testing.csv'
    val_file = '/Pramana/IR2Vec/IR2Vec-Embeddings/O3/Codejam/csv/val.csv'

    epochs = 2000
    batch_size = 1024

    model = None  # No pre-trained model is being loaded

    if test_file is None and train_file is None:
        raise SystemExit("Enter training or testing data")

    X_test = None
    y_test = None
    if test_file is not None:
        X_test, y_test = _load_split(test_file, "Test Set", "test")

    if train_file is not None:
        X, Y = _load_split(train_file, "Train Set", "train")

        X_val = None
        y_val = None
        if val_file is not None:
            X_val, y_val = _load_split(val_file, "Validation Set", "val")

        train(X, Y, X_test, y_test, X_val, y_val, epochs, batch_size, model)

    elif test_file is not None:
        # A trained/learnt model is required for the testing phase.
        if model is None:
            raise SystemExit('***********************Model is not passed in the testing**************')

        test(X_test, y_test, model)
index 0000000..b92c52c --- /dev/null +++ b/hyperparameter-tuning/models/ir2vec_sym_model.py @@ -0,0 +1,267 @@ +import numpy as np +import pandas as pd +from sklearn.decomposition import IncrementalPCA +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split + +# Import TensorFlow Keras +from tensorflow import keras +from tensorflow.keras import optimizers +from tensorflow.keras.layers import (Activation, Dense, Dropout, BatchNormalization) +from tensorflow.keras.activations import swish as SiLU +from tensorflow.keras.models import Sequential +import argparse +import pickle + +# {'input_dim': 300, 'num_classes': 98, 'num_layers': 3, 'units_per_layer': [256, 512, 128], 'dropout': 0.26394223847024845, 'normalize_input': True, 'activation': SiLU(), 'optimizer': 'Adam', 'lr': 0.00012758233973417935, 'batch_size': 64, 'epochs': 5000} + +# Symbolic - O0 +# def getModel(input_dim, output_dim): +# model = Sequential() + +# # Input Layer +# model.add(Dense(256, input_shape=(input_dim,), kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.26394223847024845)) + +# # Hidden Layer 2 +# model.add(Dense(512, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.26394223847024845)) + +# # Hidden Layer 3 +# model.add(Dense(128, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.26394223847024845)) + +# # Output Layer +# model.add(Dense(output_dim, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation('softmax')) + +# # Optimizer +# opt = keras.optimizers.Adam(learning_rate=0.00012758233973417935) +# model.compile(loss=keras.losses.categorical_crossentropy, optimizer=opt,
# Symbolic - O3
# Tuned configuration this script reproduces:
# {'input_dim': 300, 'num_classes': 98, 'num_layers': 5, 'units_per_layer': [256, 512, 256, 128, 512], 'dropout': 0.1743626588566297, 'normalize_input': True, 'activation': SiLU(), 'optimizer': 'Adam', 'lr': 0.0001028291067528109, 'batch_size': 32, 'epochs': 5000}


def _scale_features(x, x_min, x_max):
    """Min-max scale *x* with training-set statistics, tolerating constant columns."""
    lo = np.asarray(x_min, dtype=float)
    span = np.asarray(x_max, dtype=float) - lo
    # A constant column has max == min; dividing by 0 would yield NaN and
    # make IncrementalPCA reject the whole matrix.
    span = np.where(span == 0, 1.0, span)
    return (np.asarray(x, dtype=float) - lo) / span


def _encode_labels(y, num_classes):
    """Shift 1-based labels to 0-based and one-hot encode them."""
    shifted = np.asarray(y) - 1
    # Negative indices would silently wrap to the last class inside
    # to_categorical, so reject out-of-range labels explicitly.
    if shifted.size and (shifted.min() < 0 or shifted.max() >= num_classes):
        raise ValueError(
            f"labels must lie in 1..{num_classes}; "
            f"got range {shifted.min() + 1}..{shifted.max() + 1}")
    return keras.utils.to_categorical(shifted, num_classes)


def _load_split(csv_path, title, tag):
    """Read a tab-separated split: column 0 is the label, the rest are features."""
    frame = pd.read_csv(csv_path, sep='\t', header=None)
    labels = frame.loc[:, 0]
    features = frame.loc[:, 1:]
    features.columns = range(features.shape[1])

    print(f"{title}:")
    print(f"X_{tag} shape: {features.shape}")
    print(f"y_{tag} unique counts: \n{labels.value_counts()}")
    return features, labels


def getModel(input_dim, output_dim):
    """Build and compile the tuned symbolic (O3) MLP classifier.

    Args:
        input_dim: number of input features (after PCA).
        output_dim: number of target classes.

    Returns:
        A compiled ``Sequential`` model (Adam, categorical cross-entropy).
    """
    hidden_units = (256, 512, 256, 128, 512)
    dropout_rate = 0.1743626588566297

    model = Sequential()
    for depth, units in enumerate(hidden_units):
        # Only the first Dense layer declares the input shape.
        extra = {'input_shape': (input_dim,)} if depth == 0 else {}
        model.add(Dense(units, kernel_initializer=keras.initializers.glorot_normal(seed=None), **extra))
        model.add(BatchNormalization())
        model.add(Activation(SiLU))
        model.add(Dropout(dropout_rate))

    # Output layer
    model.add(Dense(output_dim, kernel_initializer=keras.initializers.glorot_normal(seed=None)))
    model.add(BatchNormalization())
    model.add(Activation('softmax'))

    opt = keras.optimizers.Adam(learning_rate=0.0001028291067528109)
    model.compile(loss=keras.losses.categorical_crossentropy, optimizer=opt, metrics=['accuracy'])

    model.summary()
    return model


# train the model on the given data
def train(x_train, y_train, x_test, y_test, x_val, y_val, epochs, batch_size, model):
    """Fit the classifier and persist the model plus its preprocessing state.

    Features are min-max scaled with training statistics and projected with
    IncrementalPCA; labels are expected to be 1-based integers.

    Args:
        x_train, y_train: training features (DataFrame) and labels.
        x_test, y_test: optional test split, evaluated after training.
        x_val, y_val: optional validation split passed to ``fit``.
        epochs, batch_size: forwarded to ``model.fit``.
        model: existing Keras model to continue training, or None to build one.

    Raises:
        ValueError: if any label falls outside ``1..num_classes``.
    """
    X_min = x_train.min()
    X_max = x_train.max()
    num_classes = np.unique(y_train).shape[0]
    print(f" Number of classes: {num_classes}")

    x_train = _scale_features(x_train, X_min, X_max)
    shifted = np.array(y_train) - 1
    print(f"\nAfter subtracting 1 from labels: {shifted}")
    print(f"\nUnique labels after subtracting 1: {np.unique(shifted).shape[0]}")
    y_train = _encode_labels(y_train, num_classes)

    # PCA transformation; n_components may not exceed the number of
    # features or samples, so cap the tuned value of 300.
    n_components = min(300, x_train.shape[1], x_train.shape[0])
    ipca = IncrementalPCA(n_components=n_components)
    ipca.fit(x_train)
    x_train = ipca.transform(x_train)

    # Persist preprocessing state before training so that every checkpoint
    # written during fit() has a matching dictionary.pkl, even if the run
    # is interrupted.
    with open('dictionary.pkl', 'wb') as f:
        pickle.dump(num_classes, f)
        pickle.dump(X_min, f)
        pickle.dump(X_max, f)
        pickle.dump(ipca, f)

    val_tuple = None
    if x_val is not None:
        x_val = ipca.transform(_scale_features(x_val, X_min, X_max))
        val_tuple = (x_val, _encode_labels(y_val, num_classes))

    mc = keras.callbacks.ModelCheckpoint(
        filepath='/home/cs24mtech02001/IR2Vec-Classification/weights/sym-O3/codejam/weights{epoch:08d}.weights.h5',
        save_weights_only=True,
        save_best_only=True,
        # val_loss only exists when validation data is supplied; without it
        # the callback would never save, so fall back to the training loss.
        monitor="val_loss" if val_tuple is not None else "loss",
        mode="min"
    )

    if model is None:
        model = getModel(x_train.shape[1], num_classes)

    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              validation_data=val_tuple, callbacks=[mc])

    model_path = "codejam-O3-sym-hypertuned-ir2vec-model.h5"
    model.save(model_path)
    # Report the path that was actually written.
    print(f"Saved model to disk --> {model_path}")

    if x_test is not None:
        x_test = ipca.transform(_scale_features(x_test, X_min, X_max))
        y_test = _encode_labels(y_test, num_classes)
        score = model.evaluate(x_test, y_test, verbose=0)
        print('Test Accuracy (After Training) : {acc:.13f}%'.format(acc=score[1]*100))


# test the learnt model on the data
def test(X, targetLabel, model):
    """Evaluate *model* on (X, targetLabel) using the state saved by ``train``.

    Reads ``dictionary.pkl`` from the working directory.
    """
    # NOTE: pickle.load can execute arbitrary code; only load a
    # dictionary.pkl that was produced by train() on a trusted machine.
    with open('dictionary.pkl', 'rb') as f:
        num_classes = pickle.load(f)
        X_min = pickle.load(f)
        X_max = pickle.load(f)
        ipca = pickle.load(f)

    X = ipca.transform(_scale_features(X, X_min, X_max))
    targetLabel = _encode_labels(targetLabel, num_classes)

    score = model.evaluate(X, targetLabel, verbose=0)
    print('Test accuracy : {acc:.13f}%'.format(acc=score[1]*100))


# Entry Point of the program
if __name__ == '__main__':

    # Codejam-O3
    train_file = '/Pramana/IR2Vec/O3/A-ir2vec-17.x/codejam/embeddings/sym/training.csv'
    test_file = '/Pramana/IR2Vec/O3/A-ir2vec-17.x/codejam/embeddings/sym/testing.csv'
    val_file = '/Pramana/IR2Vec/O3/A-ir2vec-17.x/codejam/embeddings/sym/val.csv'

    epochs = 2000
    batch_size = 32

    model = None  # No pre-trained model is being loaded

    if test_file is None and train_file is None:
        raise SystemExit("Enter training or testing data")

    X_test = None
    y_test = None
    if test_file is not None:
        X_test, y_test = _load_split(test_file, "Test Set", "test")

    if train_file is not None:
        X, Y = _load_split(train_file, "Train Set", "train")

        X_val = None
        y_val = None
        if val_file is not None:
            X_val, y_val = _load_split(val_file, "Validation Set", "val")

        train(X, Y, X_test, y_test, X_val, y_val, epochs, batch_size, model)

    elif test_file is not None:
        # A trained/learnt model is required for the testing phase.
        if model is None:
            raise SystemExit('***********************Model is not passed in the testing**************')

        test(X_test, y_test, model)
tensorflow.keras.utils import to_categorical +from tensorflow.keras.layers import (Activation, Dense, Dropout, BatchNormalization) +from tensorflow.keras.models import Sequential +from tensorflow.keras.activations import swish as SiLU +from tensorflow.keras.models import load_model +import argparse +import pickle + +# Model definition + +# 'config': {'input_dim': 56, 'num_classes': 98, 'num_layers': 4, 'units_per_layer': [256, 128, 512, 512], 'dropout': 0.26338369031159503, 'normalize_input': True, 'activation': SiLU(), 'optimizer': 'Adam', 'lr': 0.0001271463116097739, 'batch_size': 128, 'epochs': 2000} + +# Milepost-O0 +# def getModel(input_dim, output_dim): +# model = Sequential() + +# # Input Layer +# model.add(Dense(256, input_shape=(input_dim,), kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.26338369031159503)) + +# # Hidden Layer 2 +# model.add(Dense(128, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.26338369031159503)) + +# # Hidden Layer 3 +# model.add(Dense(512, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.26338369031159503)) + +# # Hidden Layer 4 +# model.add(Dense(512, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation(SiLU)) +# model.add(Dropout(0.26338369031159503)) + +# model.add(Dense(output_dim, kernel_initializer=keras.initializers.glorot_normal(seed=None))) +# model.add(BatchNormalization()) +# model.add(Activation('softmax')) + +# opt = keras.optimizers.Adam(learning_rate=0.0001271463116097739) +# model.compile(loss=keras.losses.categorical_crossentropy, optimizer=opt, metrics=['accuracy']) +# model.summary() + +# return model + +# config': {'input_dim': 
# Milepost-O3
def getModel(input_dim, output_dim):
    """Assemble and compile the Milepost (O3) feed-forward classifier.

    The network stacks five Dense -> BatchNormalization -> SiLU -> Dropout
    stages and ends in a batch-normalised softmax layer of ``output_dim``
    units. It is compiled with Adam and categorical cross-entropy.
    """
    layer_widths = [256, 256, 512, 128, 256]
    drop_prob = 0.20077533375677442

    net = Sequential()

    # The first stage is the only one that declares the input shape.
    net.add(Dense(layer_widths[0], input_shape=(input_dim,), kernel_initializer=keras.initializers.glorot_normal(seed=None)))
    net.add(BatchNormalization())
    net.add(Activation(SiLU))
    net.add(Dropout(drop_prob))

    # Remaining hidden stages share the same structure.
    for width in layer_widths[1:]:
        net.add(Dense(width, kernel_initializer=keras.initializers.glorot_normal(seed=None)))
        net.add(BatchNormalization())
        net.add(Activation(SiLU))
        net.add(Dropout(drop_prob))

    # Classification head.
    net.add(Dense(output_dim, kernel_initializer=keras.initializers.glorot_normal(seed=None)))
    net.add(BatchNormalization())
    net.add(Activation('softmax'))

    adam = keras.optimizers.Adam(learning_rate=0.0009488463996149118)
    net.compile(loss=keras.losses.categorical_crossentropy, optimizer=adam, metrics=['accuracy'])
    net.summary()

    return net
def load_data_from_directory(directory, num_features=56):
    """Load per-class ``.npz`` feature vectors from a directory tree.

    Every sub-directory of *directory* is one class; its label is the index
    of the sub-directory name in sorted order. Each ``*.npz`` file inside a
    class directory must hold an array under the key ``"values"``, which is
    flattened and then truncated or zero-padded to *num_features* entries.
    Files that cannot be read are reported and skipped.

    Args:
        directory: root directory containing one sub-directory per class.
        num_features: length every feature vector is forced to (default 56).

    Returns:
        ``(data, labels)`` where ``data`` has shape
        ``(n_samples, num_features)`` and ``labels`` has shape ``(n_samples,)``.
    """
    # Only real sub-directories define classes. Counting stray files would
    # shift the label indices and make train/test mappings disagree.
    classes = sorted(
        entry for entry in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, entry))
    )

    data = []
    labels = []
    for label, cls in enumerate(classes):
        class_path = os.path.join(directory, cls)
        # Sorted so the sample order is reproducible across runs/filesystems.
        for file_name in sorted(os.listdir(class_path)):
            if not file_name.endswith(".npz"):
                continue
            file_path = os.path.join(class_path, file_name)
            try:
                # Context manager closes the underlying zip file handle.
                with np.load(file_path) as archive:
                    values = archive["values"].flatten()
            except Exception as e:  # deliberate best-effort: report and skip
                print(f"Failed to load {file_path}: {e}")
                continue

            # Truncate or zero-pad to a fixed width; empty arrays become all zeros.
            fixed = np.zeros(num_features, dtype=values.dtype if values.size else float)
            count = min(values.size, num_features)
            fixed[:count] = values[:count]
            data.append(fixed)
            labels.append(label)

    # Debugging information
    unique_lengths = set(len(x) for x in data)
    print(f"Total unique data shapes after fix: {len(unique_lengths)}")
    print(f"Unique lengths: {unique_lengths}")

    if not data:
        # Keep a 2-D shape even when nothing was loaded.
        return np.empty((0, num_features)), np.array(labels)
    return np.array(data), np.array(labels)
# Prepare train, test, and validation data
def prepare_data(train_dir, test_dir, val_dir=None):
    """Load the train and test splits, plus the validation split when given.

    Returns a 6-tuple ``(X_train, y_train, X_test, y_test, X_val, y_val)``;
    the last two entries are ``None`` when *val_dir* is falsy.
    """
    train_split = load_data_from_directory(train_dir)
    test_split = load_data_from_directory(test_dir)

    # Validation data is optional; keep the tuple shape stable either way.
    val_split = load_data_from_directory(val_dir) if val_dir else (None, None)

    return (*train_split, *test_split, *val_split)
# save_weights_only=True, + # save_freq=500 + # ) + + mc = keras.callbacks.ModelCheckpoint( + filepath="/home/cs24mtech02001/IR2Vec-Classification/weights/milepost-O3/codeforces/weights_epoch_{epoch:08d}.weights.h5", + save_weights_only=True, + save_best_only=True, + monitor="val_loss", + mode="min" + ) + + # Train the model with validation data + model.fit( + X_train, + y_train, + validation_data=(X_val, y_val) if X_val is not None and y_val is not None else None, + batch_size=32, + epochs=2000, + verbose=1, + callbacks=[mc] + ) + + # Evaluate model + y_pred = np.argmax(model.predict(X_test), axis=1) + y_true = np.argmax(y_test, axis=1) + print(f"Accuracy: {accuracy_score(y_true, y_pred):.13f}") + + # Save the trained model + model.save("new-codeforces-O3-milepost-ir2vec-hypertuned-model.h5") + print("Saved model to disk as 'Kodanda-new-codeforces-on-new-data-O3-milepost-ir2vec-hypertuned-model.keras'.") + + return model + +# Execute the script +if __name__ == "__main__": + main() \ No newline at end of file