diff --git a/Download-licenses-Script/database-foss.py b/Download-licenses-Script/database-foss.py
index f0d76f5b7..93365eb79 100644
--- a/Download-licenses-Script/database-foss.py
+++ b/Download-licenses-Script/database-foss.py
@@ -10,7 +10,7 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-
+
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
@@ -20,6 +20,7 @@
import json
import os
+
def main():
download = "..\\Original-DB-Foss-Dataset"
os.makedirs(download, exist_ok=True)
@@ -27,8 +28,9 @@ def main():
response = urlopen(url)
data_json = json.loads(response.read())
for licenses in data_json:
- with open(download+'\\'+licenses["rf_shortname"], 'w', encoding ='utf-8') as o1:
- o1.write(licenses["rf_text"])
+ with open(download+'\\'+licenses["rf_shortname"], 'w', encoding='utf-8') as o1:
+ o1.write(licenses["rf_text"])
+
if __name__ == "__main__":
main()
diff --git a/Download-licenses-Script/exceptions.py b/Download-licenses-Script/exceptions.py
index 4e3c7b94f..a30e144bf 100644
--- a/Download-licenses-Script/exceptions.py
+++ b/Download-licenses-Script/exceptions.py
@@ -10,7 +10,7 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-
+
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
@@ -20,25 +20,29 @@
import json
import os
+
def extract_exceptions():
- """
- There are 41 files of SPDX exception in licenseListVersion: 3.13
- url of latest SPDX Exception release = https://spdx.org/licenses/exceptions.json
- """
- download = "..\\Original-SPDX-Dataset"
- os.makedirs(download, exist_ok=True)
- url = 'https://spdx.org/licenses/exceptions.json'
- response = urlopen(url)
- data_json = json.loads(response.read())
-
- for license in data_json["exceptions"]:
- license["reference"] = license["reference"].replace("./","https://spdx.org/licenses/",1)
- url2 = license["reference"]
- response2 = urlopen(url2)
- data_json2 = json.loads(response2.read())
-
- with open(download+'\\'+license["licenseExceptionId"], 'w') as o1:
- o1.write(data_json2["licenseExceptionText"])
+ """
+ There are 41 files of SPDX exception in licenseListVersion: 3.13
+ url of latest SPDX Exception release = https://spdx.org/licenses/exceptions.json
+ """
+
+ download = "..\\Original-SPDX-Dataset"
+ os.makedirs(download, exist_ok=True)
+ url = 'https://spdx.org/licenses/exceptions.json'
+ response = urlopen(url)
+ data_json = json.loads(response.read())
+
+    for license in data_json["exceptions"]:
+        license["reference"] = license["reference"].replace(
+            "./", "https://spdx.org/licenses/", 1)
+        url2 = license["reference"]
+        response2 = urlopen(url2)
+        data_json2 = json.loads(response2.read())
+        # Write as UTF-8 explicitly (as the sibling download scripts do)
+        # rather than relying on the platform's default text encoding.
+        with open(download+'\\'+license["licenseExceptionId"], 'w', encoding='utf-8') as o1:
+            o1.write(data_json2["licenseExceptionText"])
+
if __name__ == "__main__":
- extract_exceptions()
\ No newline at end of file
+ extract_exceptions()
diff --git a/Download-licenses-Script/spdx.py b/Download-licenses-Script/spdx.py
index 0f94888ac..5e44baa10 100644
--- a/Download-licenses-Script/spdx.py
+++ b/Download-licenses-Script/spdx.py
@@ -10,7 +10,7 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-
+
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
@@ -20,24 +20,27 @@
import json
import os
+
def extract_spdx():
- """
- There are 460 files of SPDX licenses in licenseListVersion: 3.13
- url of latest SPDX Exception release = https://spdx.org/licenses/licenses.json
- """
- download = "..\\Original-SPDX-Dataset"
- os.makedirs(download, exist_ok=True)
- url = 'https://spdx.org/licenses/licenses.json'
- response = urlopen(url)
- data_json = json.loads(response.read())
-
- for license in data_json["licenses"]:
- url2 = 'https://spdx.org/licenses/licenses.json'
- response2 = urlopen(url2)
- data_json2 = json.loads(response2.read())
-
- with open(download+'\\'+license["licenseId"], 'w', encoding='utf-8') as o1:
- o1.write(data_json2["licenseText"])
+ """
+ There are 460 files of SPDX licenses in licenseListVersion: 3.13
+    url of latest SPDX License list release = https://spdx.org/licenses/licenses.json
+ """
+
+ download = "..\\Original-SPDX-Dataset"
+ os.makedirs(download, exist_ok=True)
+ url = 'https://spdx.org/licenses/licenses.json'
+ response = urlopen(url)
+ data_json = json.loads(response.read())
+
+    for license in data_json["licenses"]:
+        # licenses.json is only an index; each text lives in its own per-license JSON.
+        url2 = 'https://spdx.org/licenses/' + license["licenseId"] + '.json'
+        response2 = urlopen(url2)
+        data_json2 = json.loads(response2.read())
+        with open(download+'\\'+license["licenseId"], 'w', encoding='utf-8') as o1:
+            o1.write(data_json2["licenseText"])
+
if __name__ == "__main__":
- extract_spdx()
\ No newline at end of file
+ extract_spdx()
diff --git a/README.md b/README.md
index d455cf2c5..46f69f619 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-
Minerva Dataset Generation
+Minerva Dataset Generation
Project Overview
@@ -84,4 +84,4 @@ Using Nomos to validate generated files. This is a base line regex-based
And to use multiple cores to validate files (here I am using 3 cores) :
```
sudo nomos -J -d -n 3
-```
\ No newline at end of file
+```
diff --git a/Script-Initial-Split/count.py b/Script-Initial-Split/count.py
index 26dc1df3e..3065f4ded 100644
--- a/Script-Initial-Split/count.py
+++ b/Script-Initial-Split/count.py
@@ -10,7 +10,7 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-
+
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
@@ -20,22 +20,34 @@
import pandas as pd
import argparse
+
def main(path):
file = []
count = []
for filename in os.listdir(path):
file.append(filename)
- lst = os.listdir(os.path.join(path,filename)) # dir is your directory path
+ # dir is your directory path
+ lst = os.listdir(os.path.join(path, filename))
number_files = len(lst)
- count.append(number_files)
+ count.append(number_files)
- data = pd.DataFrame({"files":file,"count":count})
+ data = pd.DataFrame({"files": file, "count": count})
data.to_csv("Count.csv", index=False)
+
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('path', help='Pass a directory to find original licenses')
+ parser.add_argument(
+ 'path', help='Pass a directory to find original licenses')
+
args = parser.parse_args()
- path = args.path
+
+    path = args.path
+    if not os.path.isdir(path):
+        # Stop here: printing and then falling through would still call
+        # main() with the invalid path. parser.error() prints the usage
+        # text and this message, then exits with status 2.
+        parser.error("Valid directory not provided")
+
main(path)
diff --git a/Script-Initial-Split/initial_split.py b/Script-Initial-Split/initial_split.py
index f1df64a08..ad5d41cef 100644
--- a/Script-Initial-Split/initial_split.py
+++ b/Script-Initial-Split/initial_split.py
@@ -10,86 +10,101 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-
+
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
+
import re
import os
import argparse
-def splitter(file,dirname):
-
+
+def splitter(file, dirname):
+
history = []
- with open(file, 'r', encoding= 'unicode_escape') as f:
+ with open(file, 'r', encoding='unicode_escape') as f:
content = f.readlines()
- # you may also want to remove whitespace characters like `\n` at the end of each line
+
+# you may also want to remove whitespace characters
+# like `\n` at the end of each line
+
text = " ".join(content)
content = text.split(". ")
content = [x.strip() for x in content]
para = ""
- for comb in range(1,len(content)):
+ for comb in range(1, len(content)):
for i in range(0, len(content)-comb+1, comb):
- if len(history)>1000:
+ if len(history) > 1000:
history = list(set(history))
- if len(history)>1000:
+ if len(history) > 1000:
break
- para = para + " " + content[i]
- para = re.sub("\s\s+" , " ", para)
+ para = para + " " + content[i]
+ para = re.sub("\s\s+", " ", para)
para = para.strip()
if para not in history:
history.append(para)
history = list(set(history))
- generate_files(file,history,dirname)
+ generate_files(file, history, dirname)
-
-def generate_files(file,history,dirname):
+
+def generate_files(file, history, dirname):
counter = 0
os.makedirs(dirname, exist_ok=True)
for texts in history:
- counter+=1
- name = dirname + '-{}.txt'.format(counter)
- with open(os.path.join(dirname,name), 'w', encoding= 'unicode_escape') as o1:
- o1.write(texts)
- naive_approach(file,dirname,counter)
+ counter += 1
+ name = dirname + f"""-{counter}.txt"""
+ with open(os.path.join(dirname, name),
+ 'w', encoding='unicode_escape') as o1:
+ o1.write(texts)
+ naive_approach(file, dirname, counter)
-def naive_approach(file,dirname,counter):
+
+def naive_approach(file, dirname, counter):
os.makedirs(dirname, exist_ok=True)
- with open(file, 'r', encoding= 'unicode_escape') as f:
+ with open(file, 'r', encoding='unicode_escape') as f:
para = sum(line.isspace() for line in f) + 1
- with open(file, 'r+', encoding= 'unicode_escape') as f:
+ with open(file, 'r+', encoding='unicode_escape') as f:
contents = f.read()
content = contents.split('\n\n')
for i in range(para):
counter += 1
- name = dirname + '-{}.txt'.format(counter)
+ name = dirname + f"""-{counter}.txt"""
try:
- with open(os.path.join(dirname,name), 'w', encoding= 'unicode_escape') as o1:
+ with open(os.path.join(dirname, name),
+ 'w', encoding='unicode_escape') as o1:
o1.write(str(content[i]))
- except:
+
+ except EnvironmentError:
break
+
def main(path):
- for roots, dirs, files in os.walk(path,topdown=True):
+ for roots, dirs, files in os.walk(path, topdown=True):
for name in files:
dirname = os.path.splitext(name)[0]
- file = os.path.join(path,name)
- splitter(file,dirname)
+ file = os.path.join(path, name)
+ splitter(file, dirname)
+
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('path', help='Pass a directory to find original licenses')
+ parser.add_argument(
+ 'path', help='Pass a directory to find original licenses')
+
args = parser.parse_args()
- path = args.path
-
- if path.isdir():
- main(path)
- else:
- print("Invalid directory")
-
+
+    path = args.path
+    if not os.path.isdir(path):
+        # Stop here: printing and then falling through would still call
+        # main() with the invalid path. parser.error() prints the usage
+        # text and this message, then exits with status 2.
+        parser.error("Valid directory not provided")
+
+ main(path)
diff --git a/markov/__init__.py b/markov/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/markov/helper.py b/markov/helper.py
index cb11ac369..9ae0be8f5 100644
--- a/markov/helper.py
+++ b/markov/helper.py
@@ -10,37 +10,53 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-
+
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
+
import os
-from os import walk
-from os.path import splitext
-from os.path import join
import pandas as pd
+import re
+import string
+
+
+def preprocessing_text(text):
+ text = re.sub(r'\w*\d\w*', '', text)
+ text = re.sub("[\n]+", "\n", text)
+ text = text.strip()
+ punctuationNoPeriod = "[" + "(" + ")" + "]"
+ text = re.sub(punctuationNoPeriod, "", text)
+ text = text.translate(str.maketrans('', '', string.punctuation))
+ text = re.sub(r"\b[a-zA-Z]\b", "", text)
+ text = re.sub("[\s]+", " ", text)
+ text = text.replace('"', '')
+ return text
+
def read_directory(path):
barlist = list()
for root, dirs, files in os.walk(path):
- for f in files:
- if splitext(f)[1].lower() == ".txt":
- barlist.append(os.path.join(root, f))
- #print(barlist)
+ for f in files:
+ if os.path.splitext(f)[1].lower() == ".txt":
+ barlist.append(os.path.join(root, f))
return barlist
+
def file_vocab(filename):
- vfile = os.path.join("../Original-SPDX-Dataset",filename + '.txt')
- with open(vfile, 'r', encoding = 'unicode_escape') as f:
+ vfile = os.path.join("../Original-SPDX-Dataset", filename + '.txt')
+ with open(vfile, 'r', encoding='unicode_escape') as f:
vocab = f.read()
return vocab
+
def file_regex(filepath, regexcsv):
- licensename = os.path.sep.join(filepath.split(os.path.sep)[0:-1]).split(os.path.sep)[-1]
+ licensename = os.path.sep.join(filepath.split(
+ os.path.sep)[0:-1]).split(os.path.sep)[-1]
df = pd.read_csv(regexcsv)
- var = df.loc[df.Licenses==licensename,'Regex']
+ var = df.loc[df.Licenses == licensename, 'Regex']
if var.shape[0] == 0:
return ""
else:
- return var.values[0]
\ No newline at end of file
+ return var.values[0]
diff --git a/markov/markov.py b/markov/markov.py
index 28fcefdc5..c2ce750f0 100644
--- a/markov/markov.py
+++ b/markov/markov.py
@@ -10,74 +10,79 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-
+
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
+
import string
import re
import intxeger
import random
from collections import defaultdict
-#import sys
-# sys.path.append('ngram/ngram')
-def regex_expansion(prevsen,latersen,text):
- result = []
- while(len(result)<100):
- final_regex = ""
- num = random.randint(1,4)
- for i in range(num):
- final_regex = final_regex + generate_sen(markov_chain(text),random.randint(1,32),text)+" "
- fregex_ = prevsen + final_regex + latersen
- ans = licensestatement_(fregex_)
- for i in ans:
- i = re.sub("[\s]+", " ",i)
- if i not in result:
- result.append(i)
- return result
+
+def regex_expansion(prevsen, latersen, text):
+ result = []
+ while(len(result) < 100):
+ final_regex = ""
+ num = random.randint(1, 4)
+ for i in range(num):
+ final_regex = final_regex + \
+ generate_sen(markov_chain(text),
+ random.randint(1, 32), text)+" "
+ fregex_ = prevsen + final_regex + latersen
+ ans = licensestatement_(fregex_)
+ for i in ans:
+ i = re.sub("[\s]+", " ", i)
+ if i not in result:
+ result.append(i)
+ return result
+
def generate_sen(chain, count, text):
- text = re.sub(r'\w*\d\w*', '', text)
- words = text.split(' ')
- words = [word.translate({ord(c): "" for c in "!@#$%^&*()[]{};:,./<>?\|`~-=_+"}) for word in words]
- words = [word.lower() for word in words]
- word1 = random.choice(words)
- sentence = word1
- for i in range(count-2):
- try:
- word2 = random.choice(chain[word1])
- word1 = word2
- sentence += ' ' + word2
- except:
- continue
- return sentence
+ text = re.sub(r'\w*\d\w*', '', text)
+ words = text.split(' ')
+ words = [word.translate({ord(c): "" for c in "!@#$%^&*()[]{};:,./<>?\|`~-=_+"}) for word in words]
+ words = [word.lower() for word in words]
+ word1 = random.choice(words)
+ sentence = word1
+ for i in range(count-2):
+ try:
+ word2 = random.choice(chain[word1])
+ word1 = word2
+ sentence += ' ' + word2
+ except Exception:
+ continue
+ return sentence
+
def markov_chain(text):
- words = text.split(' ')
- words = [word.translate({ord(c): "" for c in "!@#$%^&*()[]{};:,./<>?\|`~-=_+"}) for word in words]
- words = [word.lower() for word in words]
- m_dict = defaultdict(list)
- for current_word, next_word in zip(words[0:-1], words[1:]):
- current_word = re.sub(r'\w*\d\w*', '', current_word)
- m_dict[current_word].append(next_word)
- m_dict = dict(m_dict)
- return m_dict
+ words = text.split(' ')
+ words = [word.translate({ord(c): "" for c in "!@#$%^&*()[]{};:,./<>?\|`~-=_+"}) for word in words]
+ words = [word.lower() for word in words]
+ m_dict = defaultdict(list)
+ for current_word, next_word in zip(words[0:-1], words[1:]):
+ current_word = re.sub(r'\w*\d\w*', '', current_word)
+ m_dict[current_word].append(next_word)
+ m_dict = dict(m_dict)
+ return m_dict
+
def licensestatement_(regex_):
- x = intxeger.build(regex_)
- res=x.sample(N=1)
- result = res
- i=2
- while True:
- try:
- result = res
- result = list(set(result))
- if len(result) >10:
- return result
- res = x.sample(N=i)
- i+=1
- except:
- break
- return result
+ x = intxeger.build(regex_)
+ res = x.sample(N=1)
+ result = res
+ i = 2
+ while True:
+ try:
+ result = res
+ result = list(set(result))
+ if len(result) > 10:
+ return result
+ res = x.sample(N=i)
+ i += 1
+ except Exception:
+ break
+ return result
diff --git a/markov/markov_licenses.py b/markov/markov_licenses.py
index fb36beb39..bb3c689b5 100644
--- a/markov/markov_licenses.py
+++ b/markov/markov_licenses.py
@@ -10,68 +10,62 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-
+
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
+
import os
-from os import walk
-from os.path import splitext
-from os.path import join
-#import sys
-#sys.path.append('regex/ngram')
-from preprocess import *
-from markov import *
-from helper import read_directory, file_vocab, file_regex
+from .markov import *
+# markov/preprocess.py is deleted by this patch; preprocessing_text now lives in helper.py.
+from .helper import preprocessing_text, read_directory, file_vocab, file_regex
import argparse
-import pandas as pd
-import random
import pathlib
import multiprocessing
+import numpy as np
-def chunkIt(seq, num):
- avg = len(seq) / float(num)
- out = []
- last = 0.0
- while last < len(seq):
- out.append(seq[int(last):int(last + avg)])
- last += avg
+def chunkIt(seq, num):
+ return np.array_split(seq, num)
- return out
def main(files, regexcsv):
pathlib.Path("markovfiles").mkdir(parents=True, exist_ok=True)
- # files = read_directory(path)
for file in files:
- filename = os.path.sep.join(file.split(os.path.sep)[0:-1]).split(os.path.sep)[-1]
- with open(file, 'r', encoding = 'unicode_escape') as f:
+ filename = os.path.sep.join(file.split(os.path.sep)[
+ 0:-1]).split(os.path.sep)[-1]
+ with open(file, 'r', encoding='unicode_escape') as f:
content = f.read()
-
+
vocabulary = file_vocab(filename)
regex = file_regex(file, regexcsv)
regex = regex.strip().replace('"', '')
-
- if len(regex)==0:
+
+ if len(regex) == 0:
continue
- os.makedirs(os.path.join("markovfiles",filename), exist_ok=True)
+ os.makedirs(os.path.join("markovfiles", filename), exist_ok=True)
preregex = regex.split("(.{1,32} (AND|OR)){1,4}")[0]
secregex = regex.split("(.{1,32} (AND|OR)){1,4}")[-1]
expansion = []
- expansion = regex_expansion(preregex,secregex,vocabulary)
- lst = os.listdir(os.path.join("markovfiles",filename))
+ expansion = regex_expansion(preregex, secregex, vocabulary)
+ lst = os.listdir(os.path.join("markovfiles", filename))
count = len(lst)
-
+
for ind in range(len(expansion)):
- count+=1
- with open(os.path.join(os.path.join(os.path.join("markovfiles",filename),'{}-{}.txt'.format(filename,count))), 'w', encoding = 'unicode_escape') as o1:
+ count += 1
+ with open(os.path.join(os.path.join(
+ os.path.join("markovfiles", filename),
+ f"""{filename}-{count}.txt""")), 'w',
+ encoding='unicode_escape') as o1:
+
o1.write(content + '.' + expansion[ind])
+
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
@@ -91,17 +85,33 @@ def main(files, regexcsv):
)
args = parser.parse_args()
- inputpath = args.inputpath
- regexcsv = args.regexcsv
- n = int(args.cores)
+
+    # Validate every argument up front. parser.error() prints the usage
+    # text plus the message and exits with status 2, so an invalid value
+    # can no longer fall through to the processing code below.
+    inputpath = args.inputpath
+    if not os.path.isdir(inputpath):
+        parser.error("Valid License directory not provided")
+
+    regexcsv = args.regexcsv
+    is_csv = os.path.splitext(regexcsv)[1].lower() == ".csv"
+    if not (os.path.isfile(regexcsv) and is_csv):
+        parser.error("Invalid File or path provided. Expected csv filepath")
+
+    try:
+        n = int(args.cores)
+    except (TypeError, ValueError):
+        # Non-numeric (or missing) value: treat it like any other
+        # invalid core count and report it below, instead of leaving
+        # `n` undefined for the chunking and pool set-up that follow.
+        n = 0
+    if n <= 0:
+        parser.error(
+            "Number of cores cannot be a string, zero or a negative number")
samples = read_directory(inputpath)
ls = chunkIt(samples, n)
- list_data = []
-
- for i in range(len(ls)):
- list_data.append((ls[i], regexcsv))
+ list_data = [(ls[i], regexcsv) for i in range(len(ls))]
with multiprocessing.Pool(processes=n) as pool:
pool.starmap(main, list_data)
-
diff --git a/markov/preprocess.py b/markov/preprocess.py
deleted file mode 100644
index 95fb87372..000000000
--- a/markov/preprocess.py
+++ /dev/null
@@ -1,31 +0,0 @@
-"""
- Copyright (C) 2021 Shreya Singh (shreya.out@gmail.com)
-
- SPDX-License-Identifier: GPL-2.0
-
- This program is free software; you can redistribute it and/or
- modify it under the terms of the GNU General Public License
- version 2 as published by the Free Software Foundation.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-"""
-import re
-import string
-
-def preprocessing_text(text):
- text = re.sub(r'\w*\d\w*', '', text)
- text = re.sub("[\n]+", "\n",text)
- text = text.strip()
- punctuationNoPeriod = "[" + "(" + ")" + "]"
- text = re.sub(punctuationNoPeriod, "", text)
- text = text.translate(str.maketrans('', '', string.punctuation))
- text = re.sub(r"\b[a-zA-Z]\b", "", text)
- text = re.sub("[\s]+", " ",text)
- text = text.replace('"', '')
- return text
diff --git a/ngram/licenses.py b/ngram/licenses.py
index a53f1b701..00b919b19 100644
--- a/ngram/licenses.py
+++ b/ngram/licenses.py
@@ -10,99 +10,99 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-
+
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
+
+
import os
-from os import walk
-from os.path import splitext
-from os.path import join
-from preprocessing import *
-from regex_handling import *
-from ngram import *
+from .preprocessing import *
+from .regex_handling import *
+from .ngram import *
import argparse
import pandas as pd
import random
import pathlib
import multiprocessing
+import numpy as np
+
def chunkIt(seq, num):
- avg = len(seq) / float(num)
- out = []
- last = 0.0
-
- while last < len(seq):
- out.append(seq[int(last):int(last + avg)])
- last += avg
+ return np.array_split(seq, num)
- return out
def read_directory(path):
barlist = list()
for root, dirs, files in os.walk(path):
- for f in files:
- if splitext(f)[1].lower() == ".txt":
- barlist.append(os.path.join(root, f))
- #print(barlist)
+ for f in files:
+ if os.path.splitext(f)[1].lower() == ".txt":
+ barlist.append(os.path.join(root, f))
return barlist
+
def file_vocab(filename):
- vfile = os.path.join("../Original-SPDX-Dataset",filename + '.txt')
- # licensename = filepath.split('\\')[-1]
- with open(vfile, 'r', encoding = 'unicode_escape') as f:
+ vfile = os.path.join("../Original-SPDX-Dataset", filename + '.txt')
+ with open(vfile, 'r', encoding='unicode_escape') as f:
vocab = f.read()
return vocab
+
def file_regex(filepath, regexcsv):
- licensename = os.path.sep.join(filepath.split(os.path.sep)[0:-1]).split(os.path.sep)[-1]
+ licensename = os.path.sep.join(filepath.split(
+ os.path.sep)[0:-1]).split(os.path.sep)[-1]
df = pd.read_csv(regexcsv)
- var = df.loc[df.Licenses==licensename,'Regex']
+ var = df.loc[df.Licenses == licensename, 'Regex']
if var.shape[0] == 0:
return ""
else:
return var.values[0]
+
def main(files, regexcsv):
pathlib.Path("ngramfiles").mkdir(parents=True, exist_ok=True)
- # files = read_directory(path)
-
+
for file in files:
- filename = os.path.sep.join(file.split(os.path.sep)[0:-1]).split(os.path.sep)[-1]
- with open(file, 'r', encoding = 'unicode_escape') as f:
+ filename = os.path.sep.join(file.split(os.path.sep)[
+ 0:-1]).split(os.path.sep)[-1]
+ with open(file, 'r', encoding='unicode_escape') as f:
content = f.read()
vocabulary = file_vocab(filename)
regex = file_regex(file, regexcsv)
regex = regex.strip().replace('"', '')
- if len(regex)==0:
- # print('Regex not found for -> ', filename)
+ if len(regex) == 0:
continue
- os.makedirs(os.path.join("ngramfiles",filename), exist_ok=True)
+ os.makedirs(os.path.join("ngramfiles", filename), exist_ok=True)
preregex = regex.split("(.{1,32} (AND|OR)){1,4}")[0]
secregex = regex.split("(.{1,32} (AND|OR)){1,4}")[-1]
expansion = []
- for ind in range(2,8):
- m = create_ngram_model(ind,file)
- for i in range(1,len(vocabulary)):
+ for ind in range(2, 8):
+ m = create_ngram_model(ind, file)
+ for i in range(1, len(vocabulary)):
random.seed(i)
- generated_text = m.generate_text(np.random.randint(6,31))
+ generated_text = m.generate_text(np.random.randint(6, 31))
generated_text = preprocessing_text(generated_text).lower()
expansion.append(generated_text)
expansion = list(set(expansion))
- expansion_regex = regex_expansion(preregex,expansion,secregex)
- lst = os.listdir(os.path.join("ngramfiles",filename))
+ expansion_regex = regex_expansion(preregex, expansion, secregex)
+ lst = os.listdir(os.path.join("ngramfiles", filename))
count = len(lst)
for ind in range(len(expansion_regex)):
- count+=1
- with open(os.path.join(os.path.join("ngramfiles",filename),'{}-{}.txt'.format(filename,count)), 'w', encoding = 'unicode_escape') as o1:
+ count += 1
+ with open(os.path.join(
+ os.path.join("ngramfiles", filename),
+ f"""{filename}-{count}.txt"""),
+ 'w', encoding='unicode_escape') as o1:
+
o1.write(content + '.' + expansion_regex[ind])
+
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
@@ -123,19 +123,33 @@ def main(files, regexcsv):
)
args = parser.parse_args()
- inputpath = args.inputpath
- regexcsv = args.regexcsv
- n = int(args.cores)
+
+    # Validate every argument up front. parser.error() prints the usage
+    # text plus the message and exits with status 2, so an invalid value
+    # can no longer fall through to the processing code below.
+    inputpath = args.inputpath
+    if not os.path.isdir(inputpath):
+        parser.error("Valid License directory not provided")
+
+    regexcsv = args.regexcsv
+    is_csv = os.path.splitext(regexcsv)[1].lower() == ".csv"
+    if not (os.path.isfile(regexcsv) and is_csv):
+        parser.error("Invalid File or path provided. Expected csv filepath")
+
+    try:
+        n = int(args.cores)
+    except (TypeError, ValueError):
+        # Non-numeric (or missing) value: treat it like any other
+        # invalid core count and report it below, instead of leaving
+        # `n` undefined for the chunking and pool set-up that follow.
+        n = 0
+    if n <= 0:
+        parser.error(
+            "Number of cores cannot be a string, zero or a negative number")
samples = read_directory(inputpath)
ls = chunkIt(samples, n)
- list_data = []
-
- for i in range(len(ls)):
- list_data.append((ls[i], regexcsv))
+ list_data = [(ls[i], regexcsv) for i in range(len(ls))]
with multiprocessing.Pool(processes=n) as pool:
pool.starmap(main, list_data)
-
- # main(inputpath, regexcsv)
-
diff --git a/ngram/ngram.py b/ngram/ngram.py
index 757c30f67..2ba574bbd 100644
--- a/ngram/ngram.py
+++ b/ngram/ngram.py
@@ -10,11 +10,13 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-
+
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
+
+
import string
import re
from collections import defaultdict
@@ -22,10 +24,13 @@
from preprocessing import tokenize
import random
+
def get_ngrams(n: int, tokens: list) -> list:
tokens = (n-1)*['']+tokens
- l = [(tuple([tokens[i-p-1] for p in reversed(range(n-1))]), tokens[i]) for i in range(n-1, len(tokens))]
- return l
+ lr = [(tuple([tokens[i-p-1] for p in reversed(range(n-1))]), tokens[i])
+ for i in range(n-1, len(tokens))]
+ return lr
+
class NgramModel(object):
@@ -97,4 +102,4 @@ def create_ngram_model(n, path):
# add back the fullstop
sentence += '.'
m.update(sentence)
- return m
\ No newline at end of file
+ return m
diff --git a/ngram/preprocessing.py b/ngram/preprocessing.py
index c4acee94d..d19c34a12 100644
--- a/ngram/preprocessing.py
+++ b/ngram/preprocessing.py
@@ -10,7 +10,7 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-
+
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
@@ -19,20 +19,22 @@
import string
from typing import List
+
def preprocessing_text(text):
text = re.sub(r'\w*\d\w*', '', text)
- text = re.sub("[\n]+", "\n",text)
+ text = re.sub("[\n]+", "\n", text)
text = text.strip()
punctuationNoPeriod = "[" + "(" + ")" + "]"
text = re.sub(punctuationNoPeriod, "", text)
text = text.translate(str.maketrans('', '', string.punctuation))
- text = re.sub(r"\b[a-zA-Z]\b", "", text)
- text = re.sub("[\s]+", " ",text)
- text = text.replace('"', '')
+ text = re.sub(r"\b[a-zA-Z]\b", "", text)
+ text = re.sub("[\s]+", " ", text)
+ text = text.replace('"', '')
return text
+
def tokenize(text: str) -> List[str]:
for punct in string.punctuation:
text = text.replace(punct, ' '+punct+' ')
t = text.split()
- return t
\ No newline at end of file
+ return t
diff --git a/ngram/regex_handling.py b/ngram/regex_handling.py
index 0df8637ef..b0c36e60d 100644
--- a/ngram/regex_handling.py
+++ b/ngram/regex_handling.py
@@ -10,11 +10,12 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-
+
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
+
import string
import re
import intxeger
@@ -22,45 +23,49 @@
import numpy as np
import random
+
def licensestatement_(regex_):
- x = intxeger.build(regex_)
- res=x.sample(N=1)
- result = res
- i=2
- while True:
- try:
- result = res
- result = list(set(result))
- if len(result) >10:
- return result
- res = x.sample(N=i)
- i+=1
- except:
- break
- return result
+ x = intxeger.build(regex_)
+ res = x.sample(N=1)
+ result = res
+ i = 2
+ while True:
+ try:
+ result = res
+ result = list(set(result))
+ if len(result) > 10:
+ return result
+ res = x.sample(N=i)
+ i += 1
+ except Exception:
+ break
+ return result
+
+
+def regex_expansion(prevsen, input_list, latersen):
+ res_ = []
+ while(len(res_) < 200):
+ final_regex = ""
+ num = random.randint(1, 4)
+ for i in range(num):
+ final_regex = final_regex + \
+ np.random.choice(input_list)+" (and|or) "
+ fregex_ = prevsen + " " + final_regex + " " + latersen
+ ans = licensestatement_(fregex_)
+ for i in ans:
+ i = re.sub("[\s]+", " ", i)
+ res_.append(i)
+ return res_
-def regex_expansion(prevsen,input_list,latersen):
- res_=[]
- while(len(res_)<200):
- final_regex = ""
- num = random.randint(1,4)
- for i in range(num):
- final_regex = final_regex + np.random.choice(input_list)+" (and|or) "
- fregex_ = prevsen + " " + final_regex + " " + latersen
- ans = licensestatement_(fregex_)
- for i in ans:
- i = re.sub("[\s]+", " ",i)
- res_.append(i)
- return res_
def generate_statements():
- for i in range(2,20):
- m = create_ngram_model(i,key)
-
- for i in range(1,len(text)):
- random.seed(i)
- generated_text = m.generate_text(np.random.randint(6,31))
- generated_text = clean_license(generated_text)
- generated_text = generated_text.lower()
- expansion.append(generated_text)
- expansion = list(set(expansion))
\ No newline at end of file
+ for i in range(2, 20):
+ m = create_ngram_model(i, key)
+
+ for i in range(1, len(text)):
+ random.seed(i)
+ generated_text = m.generate_text(np.random.randint(6, 31))
+ generated_text = clean_license(generated_text)
+ generated_text = generated_text.lower()
+ expansion.append(generated_text)
+ expansion = list(set(expansion))
diff --git a/requirements.txt b/requirements.txt
index 38a6919db..8c1654492 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
-pandas==1.2.4
-numpy==1.18.2
+pandas>=1.2.4
+numpy>=1.18.2
intxeger==0.1.1