diff --git a/Download-licenses-Script/database-foss.py b/Download-licenses-Script/database-foss.py index f0d76f5b7..93365eb79 100644 --- a/Download-licenses-Script/database-foss.py +++ b/Download-licenses-Script/database-foss.py @@ -10,7 +10,7 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. @@ -20,6 +20,7 @@ import json import os + def main(): download = "..\\Original-DB-Foss-Dataset" os.makedirs(download, exist_ok=True) @@ -27,8 +28,9 @@ def main(): response = urlopen(url) data_json = json.loads(response.read()) for licenses in data_json: - with open(download+'\\'+licenses["rf_shortname"], 'w', encoding ='utf-8') as o1: - o1.write(licenses["rf_text"]) + with open(download+'\\'+licenses["rf_shortname"], 'w', encoding='utf-8') as o1: + o1.write(licenses["rf_text"]) + if __name__ == "__main__": main() diff --git a/Download-licenses-Script/exceptions.py b/Download-licenses-Script/exceptions.py index 4e3c7b94f..a30e144bf 100644 --- a/Download-licenses-Script/exceptions.py +++ b/Download-licenses-Script/exceptions.py @@ -10,7 +10,7 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
@@ -20,25 +20,29 @@ import json import os + def extract_exceptions(): - """ - There are 41 files of SPDX exception in licenseListVersion: 3.13 - url of latest SPDX Exception release = https://spdx.org/licenses/exceptions.json - """ - download = "..\\Original-SPDX-Dataset" - os.makedirs(download, exist_ok=True) - url = 'https://spdx.org/licenses/exceptions.json' - response = urlopen(url) - data_json = json.loads(response.read()) - - for license in data_json["exceptions"]: - license["reference"] = license["reference"].replace("./","https://spdx.org/licenses/",1) - url2 = license["reference"] - response2 = urlopen(url2) - data_json2 = json.loads(response2.read()) - - with open(download+'\\'+license["licenseExceptionId"], 'w') as o1: - o1.write(data_json2["licenseExceptionText"]) + """ + There are 41 files of SPDX exception in licenseListVersion: 3.13 + url of latest SPDX Exception release = https://spdx.org/licenses/exceptions.json + """ + + download = "..\\Original-SPDX-Dataset" + os.makedirs(download, exist_ok=True) + url = 'https://spdx.org/licenses/exceptions.json' + response = urlopen(url) + data_json = json.loads(response.read()) + + for license in data_json["exceptions"]: + license["reference"] = license["reference"].replace( + "./", "https://spdx.org/licenses/", 1) + url2 = license["reference"] + response2 = urlopen(url2) + data_json2 = json.loads(response2.read()) + + with open(download+'\\'+license["licenseExceptionId"], 'w') as o1: + o1.write(data_json2["licenseExceptionText"]) + if __name__ == "__main__": - extract_exceptions() \ No newline at end of file + extract_exceptions() diff --git a/Download-licenses-Script/spdx.py b/Download-licenses-Script/spdx.py index 0f94888ac..5e44baa10 100644 --- a/Download-licenses-Script/spdx.py +++ b/Download-licenses-Script/spdx.py @@ -10,7 +10,7 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
- + You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. @@ -20,24 +20,27 @@ import json import os + def extract_spdx(): - """ - There are 460 files of SPDX licenses in licenseListVersion: 3.13 - url of latest SPDX Exception release = https://spdx.org/licenses/licenses.json - """ - download = "..\\Original-SPDX-Dataset" - os.makedirs(download, exist_ok=True) - url = 'https://spdx.org/licenses/licenses.json' - response = urlopen(url) - data_json = json.loads(response.read()) - - for license in data_json["licenses"]: - url2 = 'https://spdx.org/licenses/licenses.json' - response2 = urlopen(url2) - data_json2 = json.loads(response2.read()) - - with open(download+'\\'+license["licenseId"], 'w', encoding='utf-8') as o1: - o1.write(data_json2["licenseText"]) + """ + There are 460 files of SPDX licenses in licenseListVersion: 3.13 + url of latest SPDX Exception release = https://spdx.org/licenses/licenses.json + """ + + download = "..\\Original-SPDX-Dataset" + os.makedirs(download, exist_ok=True) + url = 'https://spdx.org/licenses/licenses.json' + response = urlopen(url) + data_json = json.loads(response.read()) + + for license in data_json["licenses"]: + url2 = 'https://spdx.org/licenses/licenses.json' + response2 = urlopen(url2) + data_json2 = json.loads(response2.read()) + + with open(download+'\\'+license["licenseId"], 'w', encoding='utf-8') as o1: + o1.write(data_json2["licenseText"]) + if __name__ == "__main__": - extract_spdx() \ No newline at end of file + extract_spdx() diff --git a/README.md b/README.md index d455cf2c5..46f69f619 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -

Minerva Dataset Generation

+

Minerva Dataset Generation

Project Overview

@@ -84,4 +84,4 @@ Using Nomos to validate generated files. This is a base line regex-based And to use multiple cores to validate files (here I am using 3 cores) : ``` sudo nomos -J -d -n 3 -``` \ No newline at end of file +``` diff --git a/Script-Initial-Split/count.py b/Script-Initial-Split/count.py index 26dc1df3e..3065f4ded 100644 --- a/Script-Initial-Split/count.py +++ b/Script-Initial-Split/count.py @@ -10,7 +10,7 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. @@ -20,22 +20,34 @@ import pandas as pd import argparse + def main(path): file = [] count = [] for filename in os.listdir(path): file.append(filename) - lst = os.listdir(os.path.join(path,filename)) # dir is your directory path + # dir is your directory path + lst = os.listdir(os.path.join(path, filename)) number_files = len(lst) - count.append(number_files) + count.append(number_files) - data = pd.DataFrame({"files":file,"count":count}) + data = pd.DataFrame({"files": file, "count": count}) data.to_csv("Count.csv", index=False) + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('path', help='Pass a directory to find original licenses') + parser.add_argument( + 'path', help='Pass a directory to find original licenses') + args = parser.parse_args() - path = args.path + + try: + path = args.path + if not os.path.isdir(path): + raise TypeError + except TypeError: + print("Valid directory not provided") + main(path) diff --git a/Script-Initial-Split/initial_split.py b/Script-Initial-Split/initial_split.py index f1df64a08..ad5d41cef 100644 --- a/Script-Initial-Split/initial_split.py +++ b/Script-Initial-Split/initial_split.py @@ -10,86 +10,101 @@ 
but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """ + import re import os import argparse -def splitter(file,dirname): - + +def splitter(file, dirname): + history = [] - with open(file, 'r', encoding= 'unicode_escape') as f: + with open(file, 'r', encoding='unicode_escape') as f: content = f.readlines() - # you may also want to remove whitespace characters like `\n` at the end of each line + +# you may also want to remove whitespace characters +# like `\n` at the end of each line + text = " ".join(content) content = text.split(". ") content = [x.strip() for x in content] para = "" - for comb in range(1,len(content)): + for comb in range(1, len(content)): for i in range(0, len(content)-comb+1, comb): - if len(history)>1000: + if len(history) > 1000: history = list(set(history)) - if len(history)>1000: + if len(history) > 1000: break - para = para + " " + content[i] - para = re.sub("\s\s+" , " ", para) + para = para + " " + content[i] + para = re.sub("\s\s+", " ", para) para = para.strip() if para not in history: history.append(para) history = list(set(history)) - generate_files(file,history,dirname) + generate_files(file, history, dirname) - -def generate_files(file,history,dirname): + +def generate_files(file, history, dirname): counter = 0 os.makedirs(dirname, exist_ok=True) for texts in history: - counter+=1 - name = dirname + '-{}.txt'.format(counter) - with open(os.path.join(dirname,name), 'w', encoding= 'unicode_escape') as o1: - o1.write(texts) - naive_approach(file,dirname,counter) + counter += 1 + name = dirname + f"""-{counter}.txt""" + with open(os.path.join(dirname, name), + 'w', encoding='unicode_escape') as o1: + o1.write(texts) 
+ naive_approach(file, dirname, counter) -def naive_approach(file,dirname,counter): + +def naive_approach(file, dirname, counter): os.makedirs(dirname, exist_ok=True) - with open(file, 'r', encoding= 'unicode_escape') as f: + with open(file, 'r', encoding='unicode_escape') as f: para = sum(line.isspace() for line in f) + 1 - with open(file, 'r+', encoding= 'unicode_escape') as f: + with open(file, 'r+', encoding='unicode_escape') as f: contents = f.read() content = contents.split('\n\n') for i in range(para): counter += 1 - name = dirname + '-{}.txt'.format(counter) + name = dirname + f"""-{counter}.txt""" try: - with open(os.path.join(dirname,name), 'w', encoding= 'unicode_escape') as o1: + with open(os.path.join(dirname, name), + 'w', encoding='unicode_escape') as o1: o1.write(str(content[i])) - except: + + except EnvironmentError: break + def main(path): - for roots, dirs, files in os.walk(path,topdown=True): + for roots, dirs, files in os.walk(path, topdown=True): for name in files: dirname = os.path.splitext(name)[0] - file = os.path.join(path,name) - splitter(file,dirname) + file = os.path.join(path, name) + splitter(file, dirname) + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('path', help='Pass a directory to find original licenses') + parser.add_argument( + 'path', help='Pass a directory to find original licenses') + args = parser.parse_args() - path = args.path - - if path.isdir(): - main(path) - else: - print("Invalid directory") - + + try: + path = args.path + if not os.path.isdir(path): + raise TypeError + except TypeError: + print("Valid directory not provided") + + main(path) diff --git a/markov/__init__.py b/markov/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/markov/helper.py b/markov/helper.py index cb11ac369..9ae0be8f5 100644 --- a/markov/helper.py +++ b/markov/helper.py @@ -10,37 +10,53 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """ + import os -from os import walk -from os.path import splitext -from os.path import join import pandas as pd +import re +import string + + +def preprocessing_text(text): + text = re.sub(r'\w*\d\w*', '', text) + text = re.sub("[\n]+", "\n", text) + text = text.strip() + punctuationNoPeriod = "[" + "(" + ")" + "]" + text = re.sub(punctuationNoPeriod, "", text) + text = text.translate(str.maketrans('', '', string.punctuation)) + text = re.sub(r"\b[a-zA-Z]\b", "", text) + text = re.sub("[\s]+", " ", text) + text = text.replace('"', '') + return text + def read_directory(path): barlist = list() for root, dirs, files in os.walk(path): - for f in files: - if splitext(f)[1].lower() == ".txt": - barlist.append(os.path.join(root, f)) - #print(barlist) + for f in files: + if os.path.splitext(f)[1].lower() == ".txt": + barlist.append(os.path.join(root, f)) return barlist + def file_vocab(filename): - vfile = os.path.join("../Original-SPDX-Dataset",filename + '.txt') - with open(vfile, 'r', encoding = 'unicode_escape') as f: + vfile = os.path.join("../Original-SPDX-Dataset", filename + '.txt') + with open(vfile, 'r', encoding='unicode_escape') as f: vocab = f.read() return vocab + def file_regex(filepath, regexcsv): - licensename = os.path.sep.join(filepath.split(os.path.sep)[0:-1]).split(os.path.sep)[-1] + licensename = os.path.sep.join(filepath.split( + os.path.sep)[0:-1]).split(os.path.sep)[-1] df = pd.read_csv(regexcsv) - var = df.loc[df.Licenses==licensename,'Regex'] + var = df.loc[df.Licenses == licensename, 'Regex'] if var.shape[0] == 0: return "" else: - return var.values[0] \ No newline at end of file + return var.values[0] diff --git a/markov/markov.py b/markov/markov.py index 
28fcefdc5..c2ce750f0 100644 --- a/markov/markov.py +++ b/markov/markov.py @@ -10,74 +10,79 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """ + import string import re import intxeger import random from collections import defaultdict -#import sys -# sys.path.append('ngram/ngram') -def regex_expansion(prevsen,latersen,text): - result = [] - while(len(result)<100): - final_regex = "" - num = random.randint(1,4) - for i in range(num): - final_regex = final_regex + generate_sen(markov_chain(text),random.randint(1,32),text)+" " - fregex_ = prevsen + final_regex + latersen - ans = licensestatement_(fregex_) - for i in ans: - i = re.sub("[\s]+", " ",i) - if i not in result: - result.append(i) - return result + +def regex_expansion(prevsen, latersen, text): + result = [] + while(len(result) < 100): + final_regex = "" + num = random.randint(1, 4) + for i in range(num): + final_regex = final_regex + \ + generate_sen(markov_chain(text), + random.randint(1, 32), text)+" " + fregex_ = prevsen + final_regex + latersen + ans = licensestatement_(fregex_) + for i in ans: + i = re.sub("[\s]+", " ", i) + if i not in result: + result.append(i) + return result + def generate_sen(chain, count, text): - text = re.sub(r'\w*\d\w*', '', text) - words = text.split(' ') - words = [word.translate({ord(c): "" for c in "!@#$%^&*()[]{};:,./<>?\|`~-=_+"}) for word in words] - words = [word.lower() for word in words] - word1 = random.choice(words) - sentence = word1 - for i in range(count-2): - try: - word2 = random.choice(chain[word1]) - word1 = word2 - sentence += ' ' + word2 - except: - continue - return sentence + text = re.sub(r'\w*\d\w*', '', text) + words = 
text.split(' ') + words = [word.translate({ord(c): "" for c in "!@#$%^&*()[]{};:,./<>?\|`~-=_+"}) for word in words] + words = [word.lower() for word in words] + word1 = random.choice(words) + sentence = word1 + for i in range(count-2): + try: + word2 = random.choice(chain[word1]) + word1 = word2 + sentence += ' ' + word2 + except Exception: + continue + return sentence + def markov_chain(text): - words = text.split(' ') - words = [word.translate({ord(c): "" for c in "!@#$%^&*()[]{};:,./<>?\|`~-=_+"}) for word in words] - words = [word.lower() for word in words] - m_dict = defaultdict(list) - for current_word, next_word in zip(words[0:-1], words[1:]): - current_word = re.sub(r'\w*\d\w*', '', current_word) - m_dict[current_word].append(next_word) - m_dict = dict(m_dict) - return m_dict + words = text.split(' ') + words = [word.translate({ord(c): "" for c in "!@#$%^&*()[]{};:,./<>?\|`~-=_+"}) for word in words] + words = [word.lower() for word in words] + m_dict = defaultdict(list) + for current_word, next_word in zip(words[0:-1], words[1:]): + current_word = re.sub(r'\w*\d\w*', '', current_word) + m_dict[current_word].append(next_word) + m_dict = dict(m_dict) + return m_dict + def licensestatement_(regex_): - x = intxeger.build(regex_) - res=x.sample(N=1) - result = res - i=2 - while True: - try: - result = res - result = list(set(result)) - if len(result) >10: - return result - res = x.sample(N=i) - i+=1 - except: - break - return result + x = intxeger.build(regex_) + res = x.sample(N=1) + result = res + i = 2 + while True: + try: + result = res + result = list(set(result)) + if len(result) > 10: + return result + res = x.sample(N=i) + i += 1 + except Exception: + break + return result diff --git a/markov/markov_licenses.py b/markov/markov_licenses.py index fb36beb39..bb3c689b5 100644 --- a/markov/markov_licenses.py +++ b/markov/markov_licenses.py @@ -10,68 +10,62 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 
PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """ + import os -from os import walk -from os.path import splitext -from os.path import join -#import sys -#sys.path.append('regex/ngram') -from preprocess import * -from markov import * -from helper import read_directory, file_vocab, file_regex +from .preprocess import * +from .markov import * +from .helper import read_directory, file_vocab, file_regex import argparse -import pandas as pd -import random import pathlib import multiprocessing +import numpy as np -def chunkIt(seq, num): - avg = len(seq) / float(num) - out = [] - last = 0.0 - while last < len(seq): - out.append(seq[int(last):int(last + avg)]) - last += avg +def chunkIt(seq, num): + return np.array_split(seq, num) - return out def main(files, regexcsv): pathlib.Path("markovfiles").mkdir(parents=True, exist_ok=True) - # files = read_directory(path) for file in files: - filename = os.path.sep.join(file.split(os.path.sep)[0:-1]).split(os.path.sep)[-1] - with open(file, 'r', encoding = 'unicode_escape') as f: + filename = os.path.sep.join(file.split(os.path.sep)[ + 0:-1]).split(os.path.sep)[-1] + with open(file, 'r', encoding='unicode_escape') as f: content = f.read() - + vocabulary = file_vocab(filename) regex = file_regex(file, regexcsv) regex = regex.strip().replace('"', '') - - if len(regex)==0: + + if len(regex) == 0: continue - os.makedirs(os.path.join("markovfiles",filename), exist_ok=True) + os.makedirs(os.path.join("markovfiles", filename), exist_ok=True) preregex = regex.split("(.{1,32} (AND|OR)){1,4}")[0] secregex = regex.split("(.{1,32} (AND|OR)){1,4}")[-1] expansion = [] - expansion = regex_expansion(preregex,secregex,vocabulary) - lst = os.listdir(os.path.join("markovfiles",filename)) + expansion = 
regex_expansion(preregex, secregex, vocabulary) + lst = os.listdir(os.path.join("markovfiles", filename)) count = len(lst) - + for ind in range(len(expansion)): - count+=1 - with open(os.path.join(os.path.join(os.path.join("markovfiles",filename),'{}-{}.txt'.format(filename,count))), 'w', encoding = 'unicode_escape') as o1: + count += 1 + with open(os.path.join(os.path.join( + os.path.join("markovfiles", filename), + f"""{filename}-{count}.txt""")), 'w', + encoding='unicode_escape') as o1: + o1.write(content + '.' + expansion[ind]) + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -91,17 +85,33 @@ def main(files, regexcsv): ) args = parser.parse_args() - inputpath = args.inputpath - regexcsv = args.regexcsv - n = int(args.cores) + + try: + inputpath = args.inputpath + if not os.path.isdir(inputpath): + raise TypeError + except TypeError: + print("Valid License directory not provided") + + try: + regexcsv = args.regexcsv + if not os.path.isfile(regexcsv): + raise TypeError + if os.path.splitext(regexcsv)[1].lower() != ".csv": + raise TypeError + except TypeError: + print("Invalid File or path provided. 
Expected csv filepath") + + try: + n = int(args.cores) + if n <= 0: + raise ValueError + except ValueError: + print("Number of cores cannot be a string, zero or a negative number") samples = read_directory(inputpath) ls = chunkIt(samples, n) - list_data = [] - - for i in range(len(ls)): - list_data.append((ls[i], regexcsv)) + list_data = [(ls[i], regexcsv) for i in range(len(ls))] with multiprocessing.Pool(processes=n) as pool: pool.starmap(main, list_data) - diff --git a/markov/preprocess.py b/markov/preprocess.py deleted file mode 100644 index 95fb87372..000000000 --- a/markov/preprocess.py +++ /dev/null @@ -1,31 +0,0 @@ -""" - Copyright (C) 2021 Shreya Singh (shreya.out@gmail.com) - - SPDX-License-Identifier: GPL-2.0 - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License - version 2 as published by the Free Software Foundation. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
-""" -import re -import string - -def preprocessing_text(text): - text = re.sub(r'\w*\d\w*', '', text) - text = re.sub("[\n]+", "\n",text) - text = text.strip() - punctuationNoPeriod = "[" + "(" + ")" + "]" - text = re.sub(punctuationNoPeriod, "", text) - text = text.translate(str.maketrans('', '', string.punctuation)) - text = re.sub(r"\b[a-zA-Z]\b", "", text) - text = re.sub("[\s]+", " ",text) - text = text.replace('"', '') - return text diff --git a/ngram/licenses.py b/ngram/licenses.py index a53f1b701..00b919b19 100644 --- a/ngram/licenses.py +++ b/ngram/licenses.py @@ -10,99 +10,99 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """ + + import os -from os import walk -from os.path import splitext -from os.path import join -from preprocessing import * -from regex_handling import * -from ngram import * +from .preprocessing import * +from .regex_handling import * +from .ngram import * import argparse import pandas as pd import random import pathlib import multiprocessing +import numpy as np + def chunkIt(seq, num): - avg = len(seq) / float(num) - out = [] - last = 0.0 - - while last < len(seq): - out.append(seq[int(last):int(last + avg)]) - last += avg + return np.array_split(seq, num) - return out def read_directory(path): barlist = list() for root, dirs, files in os.walk(path): - for f in files: - if splitext(f)[1].lower() == ".txt": - barlist.append(os.path.join(root, f)) - #print(barlist) + for f in files: + if os.path.splitext(f)[1].lower() == ".txt": + barlist.append(os.path.join(root, f)) return barlist + def file_vocab(filename): - vfile = os.path.join("../Original-SPDX-Dataset",filename + '.txt') - # licensename = 
filepath.split('\\')[-1] - with open(vfile, 'r', encoding = 'unicode_escape') as f: + vfile = os.path.join("../Original-SPDX-Dataset", filename + '.txt') + with open(vfile, 'r', encoding='unicode_escape') as f: vocab = f.read() return vocab + def file_regex(filepath, regexcsv): - licensename = os.path.sep.join(filepath.split(os.path.sep)[0:-1]).split(os.path.sep)[-1] + licensename = os.path.sep.join(filepath.split( + os.path.sep)[0:-1]).split(os.path.sep)[-1] df = pd.read_csv(regexcsv) - var = df.loc[df.Licenses==licensename,'Regex'] + var = df.loc[df.Licenses == licensename, 'Regex'] if var.shape[0] == 0: return "" else: return var.values[0] + def main(files, regexcsv): pathlib.Path("ngramfiles").mkdir(parents=True, exist_ok=True) - # files = read_directory(path) - + for file in files: - filename = os.path.sep.join(file.split(os.path.sep)[0:-1]).split(os.path.sep)[-1] - with open(file, 'r', encoding = 'unicode_escape') as f: + filename = os.path.sep.join(file.split(os.path.sep)[ + 0:-1]).split(os.path.sep)[-1] + with open(file, 'r', encoding='unicode_escape') as f: content = f.read() vocabulary = file_vocab(filename) regex = file_regex(file, regexcsv) regex = regex.strip().replace('"', '') - if len(regex)==0: - # print('Regex not found for -> ', filename) + if len(regex) == 0: continue - os.makedirs(os.path.join("ngramfiles",filename), exist_ok=True) + os.makedirs(os.path.join("ngramfiles", filename), exist_ok=True) preregex = regex.split("(.{1,32} (AND|OR)){1,4}")[0] secregex = regex.split("(.{1,32} (AND|OR)){1,4}")[-1] expansion = [] - for ind in range(2,8): - m = create_ngram_model(ind,file) - for i in range(1,len(vocabulary)): + for ind in range(2, 8): + m = create_ngram_model(ind, file) + for i in range(1, len(vocabulary)): random.seed(i) - generated_text = m.generate_text(np.random.randint(6,31)) + generated_text = m.generate_text(np.random.randint(6, 31)) generated_text = preprocessing_text(generated_text).lower() expansion.append(generated_text) expansion 
= list(set(expansion)) - expansion_regex = regex_expansion(preregex,expansion,secregex) - lst = os.listdir(os.path.join("ngramfiles",filename)) + expansion_regex = regex_expansion(preregex, expansion, secregex) + lst = os.listdir(os.path.join("ngramfiles", filename)) count = len(lst) for ind in range(len(expansion_regex)): - count+=1 - with open(os.path.join(os.path.join("ngramfiles",filename),'{}-{}.txt'.format(filename,count)), 'w', encoding = 'unicode_escape') as o1: + count += 1 + with open(os.path.join( + os.path.join("ngramfiles", filename), + f"""{filename}-{count}.txt"""), + 'w', encoding='unicode_escape') as o1: + o1.write(content + '.' + expansion_regex[ind]) + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -123,19 +123,33 @@ def main(files, regexcsv): ) args = parser.parse_args() - inputpath = args.inputpath - regexcsv = args.regexcsv - n = int(args.cores) + + try: + inputpath = args.inputpath + if not os.path.isdir(inputpath): + raise TypeError + except TypeError: + print("Valid License directory not provided") + + try: + regexcsv = args.regexcsv + if not os.path.isfile(regexcsv): + raise TypeError + if os.path.splitext(regexcsv)[1].lower() != ".csv": + raise TypeError + except TypeError: + print("Invalid File or path provided. 
Expected csv filepath") + + try: + n = int(args.cores) + if n <= 0: + raise ValueError + except ValueError: + print("Number of cores cannot be a string, zero or a negative number") samples = read_directory(inputpath) ls = chunkIt(samples, n) - list_data = [] - - for i in range(len(ls)): - list_data.append((ls[i], regexcsv)) + list_data = [(ls[i], regexcsv) for i in range(len(ls))] with multiprocessing.Pool(processes=n) as pool: pool.starmap(main, list_data) - - # main(inputpath, regexcsv) - diff --git a/ngram/ngram.py b/ngram/ngram.py index 757c30f67..2ba574bbd 100644 --- a/ngram/ngram.py +++ b/ngram/ngram.py @@ -10,11 +10,13 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """ + + import string import re from collections import defaultdict @@ -22,10 +24,13 @@ from preprocessing import tokenize import random + def get_ngrams(n: int, tokens: list) -> list: tokens = (n-1)*['']+tokens - l = [(tuple([tokens[i-p-1] for p in reversed(range(n-1))]), tokens[i]) for i in range(n-1, len(tokens))] - return l + lr = [(tuple([tokens[i-p-1] for p in reversed(range(n-1))]), tokens[i]) + for i in range(n-1, len(tokens))] + return lr + class NgramModel(object): @@ -97,4 +102,4 @@ def create_ngram_model(n, path): # add back the fullstop sentence += '.' m.update(sentence) - return m \ No newline at end of file + return m diff --git a/ngram/preprocessing.py b/ngram/preprocessing.py index c4acee94d..d19c34a12 100644 --- a/ngram/preprocessing.py +++ b/ngram/preprocessing.py @@ -10,7 +10,7 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
- + You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. @@ -19,20 +19,22 @@ import string from typing import List + def preprocessing_text(text): text = re.sub(r'\w*\d\w*', '', text) - text = re.sub("[\n]+", "\n",text) + text = re.sub("[\n]+", "\n", text) text = text.strip() punctuationNoPeriod = "[" + "(" + ")" + "]" text = re.sub(punctuationNoPeriod, "", text) text = text.translate(str.maketrans('', '', string.punctuation)) - text = re.sub(r"\b[a-zA-Z]\b", "", text) - text = re.sub("[\s]+", " ",text) - text = text.replace('"', '') + text = re.sub(r"\b[a-zA-Z]\b", "", text) + text = re.sub("[\s]+", " ", text) + text = text.replace('"', '') return text + def tokenize(text: str) -> List[str]: for punct in string.punctuation: text = text.replace(punct, ' '+punct+' ') t = text.split() - return t \ No newline at end of file + return t diff --git a/ngram/regex_handling.py b/ngram/regex_handling.py index 0df8637ef..b0c36e60d 100644 --- a/ngram/regex_handling.py +++ b/ngram/regex_handling.py @@ -10,11 +10,12 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
""" + import string import re import intxeger @@ -22,45 +23,49 @@ import numpy as np import random + def licensestatement_(regex_): - x = intxeger.build(regex_) - res=x.sample(N=1) - result = res - i=2 - while True: - try: - result = res - result = list(set(result)) - if len(result) >10: - return result - res = x.sample(N=i) - i+=1 - except: - break - return result + x = intxeger.build(regex_) + res = x.sample(N=1) + result = res + i = 2 + while True: + try: + result = res + result = list(set(result)) + if len(result) > 10: + return result + res = x.sample(N=i) + i += 1 + except Exception: + break + return result + + +def regex_expansion(prevsen, input_list, latersen): + res_ = [] + while(len(res_) < 200): + final_regex = "" + num = random.randint(1, 4) + for i in range(num): + final_regex = final_regex + \ + np.random.choice(input_list)+" (and|or) " + fregex_ = prevsen + " " + final_regex + " " + latersen + ans = licensestatement_(fregex_) + for i in ans: + i = re.sub("[\s]+", " ", i) + res_.append(i) + return res_ -def regex_expansion(prevsen,input_list,latersen): - res_=[] - while(len(res_)<200): - final_regex = "" - num = random.randint(1,4) - for i in range(num): - final_regex = final_regex + np.random.choice(input_list)+" (and|or) " - fregex_ = prevsen + " " + final_regex + " " + latersen - ans = licensestatement_(fregex_) - for i in ans: - i = re.sub("[\s]+", " ",i) - res_.append(i) - return res_ def generate_statements(): - for i in range(2,20): - m = create_ngram_model(i,key) - - for i in range(1,len(text)): - random.seed(i) - generated_text = m.generate_text(np.random.randint(6,31)) - generated_text = clean_license(generated_text) - generated_text = generated_text.lower() - expansion.append(generated_text) - expansion = list(set(expansion)) \ No newline at end of file + for i in range(2, 20): + m = create_ngram_model(i, key) + + for i in range(1, len(text)): + random.seed(i) + generated_text = m.generate_text(np.random.randint(6, 31)) + generated_text = 
clean_license(generated_text) + generated_text = generated_text.lower() + expansion.append(generated_text) + expansion = list(set(expansion)) diff --git a/requirements.txt b/requirements.txt index 38a6919db..8c1654492 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -pandas==1.2.4 -numpy==1.18.2 +pandas>=1.2.4 +numpy>=1.18.2 intxeger==0.1.1