fossology · TanweerulHaque · Sep 5, 2021 · Sep 7, 2021
diff --git a/Download-licenses-Script/database-foss.py b/Download-licenses-Script/database-foss.py
@@ -10,7 +10,7 @@
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
- 
+
  You should have received a copy of the GNU General Public License along
  with this program; if not, write to the Free Software Foundation, Inc.,
  51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
@@ -20,15 +20,17 @@
 import json
 import os
 
+
 def main():
     download = "..\\Original-DB-Foss-Dataset"
     os.makedirs(download, exist_ok=True)
     url = 'https://raw.githubusercontent.com/fossology/fossology/master/install/db/licenseRef.json'
     response = urlopen(url)
     data_json = json.loads(response.read())
     for licenses in data_json:
-        with open(download+'\\'+licenses["rf_shortname"], 'w', encoding ='utf-8') as o1:
-                o1.write(licenses["rf_text"])
+        with open(os.path.join(download, licenses["rf_shortname"]), 'w', encoding='utf-8') as o1:
+            o1.write(licenses["rf_text"])
+
 
 if __name__ == "__main__":
     main()
diff --git a/Download-licenses-Script/exceptions.py b/Download-licenses-Script/exceptions.py
@@ -10,7 +10,7 @@
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
- 
+
  You should have received a copy of the GNU General Public License along
  with this program; if not, write to the Free Software Foundation, Inc.,
  51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
@@ -20,25 +20,29 @@
 import json
 import os
 
+
 def extract_exceptions():
-  """
-  There are 41 files of SPDX exception in licenseListVersion: 3.13
-  url of latest SPDX Exception release = https://spdx.org/licenses/exceptions.json
-  """
-  download = "..\\Original-SPDX-Dataset"
-  os.makedirs(download, exist_ok=True)
-  url = 'https://spdx.org/licenses/exceptions.json'
-  response = urlopen(url)
-  data_json = json.loads(response.read())
-
-  for license in data_json["exceptions"]:
-    license["reference"] = license["reference"].replace("./","https://spdx.org/licenses/",1)
-    url2 = license["reference"]
-    response2 = urlopen(url2)
-    data_json2 = json.loads(response2.read())
-
-    with open(download+'\\'+license["licenseExceptionId"], 'w') as o1:
-          o1.write(data_json2["licenseExceptionText"])
+    """
+    There are 41 files of SPDX exception in licenseListVersion: 3.13
+    url of latest SPDX Exception release = https://spdx.org/licenses/exceptions.json
+    """
+
+    download = "..\\Original-SPDX-Dataset"
+    os.makedirs(download, exist_ok=True)
+    url = 'https://spdx.org/licenses/exceptions.json'
+    response = urlopen(url)
+    data_json = json.loads(response.read())
+
+    for license in data_json["exceptions"]:
+        url2 = license["reference"].replace(
+            "./", "https://spdx.org/licenses/", 1)
+        response2 = urlopen(url2)
+        data_json2 = json.loads(response2.read())
+
+        target = os.path.join(download, license["licenseExceptionId"])
+        with open(target, 'w', encoding='utf-8') as o1:
+            o1.write(data_json2["licenseExceptionText"])
+
 
 if __name__ == "__main__":
-    extract_exceptions()
+    extract_exceptions()
diff --git a/Download-licenses-Script/spdx.py b/Download-licenses-Script/spdx.py
@@ -10,7 +10,7 @@
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
- 
+
  You should have received a copy of the GNU General Public License along
  with this program; if not, write to the Free Software Foundation, Inc.,
  51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
@@ -20,24 +20,27 @@
 import json
 import os
 
+
 def extract_spdx():
-  """
-  There are 460 files of SPDX licenses in licenseListVersion: 3.13
-  url of latest SPDX Exception release = https://spdx.org/licenses/licenses.json
-  """  
-  download = "..\\Original-SPDX-Dataset"
-  os.makedirs(download, exist_ok=True)
-  url = 'https://spdx.org/licenses/licenses.json'
-  response = urlopen(url)
-  data_json = json.loads(response.read())
-
-  for license in data_json["licenses"]:
-    url2 = 'https://spdx.org/licenses/licenses.json'
-    response2 = urlopen(url2)
-    data_json2 = json.loads(response2.read())
-
-    with open(download+'\\'+license["licenseId"], 'w', encoding='utf-8') as o1:
-          o1.write(data_json2["licenseText"])
+    """
+    There are 460 files of SPDX licenses in licenseListVersion: 3.13
+    url of latest SPDX License List release = https://spdx.org/licenses/licenses.json
+    """
+
+    download = "..\\Original-SPDX-Dataset"
+    os.makedirs(download, exist_ok=True)
+    url = 'https://spdx.org/licenses/licenses.json'
+    response = urlopen(url)
+    data_json = json.loads(response.read())
+
+    for license in data_json["licenses"]:
+        # per-license JSON (detailsUrl) carries licenseText; the index does not
+        response2 = urlopen(license["detailsUrl"])
+        data_json2 = json.loads(response2.read())
+        target = os.path.join(download, license["licenseId"])
+        with open(target, 'w', encoding='utf-8') as o1:
+            o1.write(data_json2["licenseText"])
+
 
 if __name__ == "__main__":
-    extract_spdx()
+    extract_spdx()
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-<h1 align="center">Minerva Dataset Generation<img src="assets\wcoding.giff" width="80"></h1>
+<h1 align="center">Minerva Dataset Generation<img src="assets/wcoding.gif" width="80"></h1>
 <h2 align="center">Project Overview</h2>
 
 <p align="center">
@@ -84,4 +84,4 @@ Using Nomos to validate generated files. This is a base line <i>regex-based</i>
 And to use multiple cores to validate files (here I am using 3 cores) :
 ```
  sudo nomos -J -d <folder_with_files> -n 3
-```
+```
diff --git a/Script-Initial-Split/count.py b/Script-Initial-Split/count.py
@@ -10,7 +10,7 @@
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
- 
+
  You should have received a copy of the GNU General Public License along
  with this program; if not, write to the Free Software Foundation, Inc.,
  51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
@@ -20,22 +20,34 @@
 import pandas as pd
 import argparse
 
+
 def main(path):
     file = []
     count = []
 
     for filename in os.listdir(path):
         file.append(filename)
-        lst = os.listdir(os.path.join(path,filename)) # dir is your directory path
+        # each entry of `path` is listed as a directory (assumed sub-folder)
+        lst = os.listdir(os.path.join(path, filename))
         number_files = len(lst)
-        count.append(number_files)           
+        count.append(number_files)
 
-    data = pd.DataFrame({"files":file,"count":count})
+    data = pd.DataFrame({"files": file, "count": count})
     data.to_csv("Count.csv", index=False)
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('path', help='Pass a directory to find original licenses')
+    parser.add_argument(
+        'path', help='Pass a directory to find original licenses')
+
     args = parser.parse_args()
-    path = args.path
+
+    path = args.path
+    if not os.path.isdir(path):
+        # Stop here instead of falling through: main() would otherwise
+        # crash inside os.listdir() with the very path just rejected.
+        # parser.error() prints usage plus the message and exits.
+        parser.error("Valid directory not provided")
+
     main(path)
diff --git a/Script-Initial-Split/initial_split.py b/Script-Initial-Split/initial_split.py
@@ -10,86 +10,101 @@
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
- 
+
  You should have received a copy of the GNU General Public License along
  with this program; if not, write to the Free Software Foundation, Inc.,
  51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 """
 
+
 import re
 import os
 import argparse
 
-def splitter(file,dirname):
-
+
+def splitter(file, dirname):
+
     history = []
-    with open(file, 'r', encoding= 'unicode_escape') as f:
+    with open(file, 'r', encoding='unicode_escape') as f:
         content = f.readlines()
-    # you may also want to remove whitespace characters like `\n` at the end of each line
+
+    # Join the lines back into one string, split it into sentences on
+    # ". " and trim the surrounding whitespace of every sentence.
+
     text = " ".join(content)
     content = text.split(". ")
     content = [x.strip() for x in content]
     para = ""
-    for comb  in range(1,len(content)):
+    for comb in range(1, len(content)):
         for i in range(0, len(content)-comb+1, comb):
-            if len(history)>1000:
+            if len(history) > 1000:
                 history = list(set(history))
-                if len(history)>1000:
+                if len(history) > 1000:
                     break
-            para = para  + " " +  content[i]
-            para = re.sub("\s\s+" , " ", para)
+            para = para + " " + content[i]
+            para = re.sub(r"\s\s+", " ", para)
             para = para.strip()
             if para not in history:
                 history.append(para)
     history = list(set(history))
-    generate_files(file,history,dirname)
+    generate_files(file, history, dirname)
 
-                
-def generate_files(file,history,dirname):
+
+def generate_files(file, history, dirname):
     counter = 0
     os.makedirs(dirname, exist_ok=True)
     for texts in history:
-        counter+=1
-        name = dirname + '-{}.txt'.format(counter)
-        with open(os.path.join(dirname,name), 'w', encoding= 'unicode_escape') as o1:
-                    o1.write(texts)
-    naive_approach(file,dirname,counter)
+        counter += 1
+        name = f"{dirname}-{counter}.txt"
+        target = os.path.join(dirname, name)
+        with open(target, 'w', encoding='unicode_escape') as o1:
+            o1.write(texts)
+    naive_approach(file, dirname, counter)
 
-def naive_approach(file,dirname,counter):
+
+def naive_approach(file, dirname, counter):
     os.makedirs(dirname, exist_ok=True)
 
-    with open(file, 'r', encoding= 'unicode_escape') as f:
+    with open(file, 'r', encoding='unicode_escape') as f:
         para = sum(line.isspace() for line in f) + 1
 
-    with open(file, 'r+', encoding= 'unicode_escape') as f:
+    with open(file, 'r', encoding='unicode_escape') as f:
         contents = f.read()
 
     content = contents.split('\n\n')
 
     for i in range(para):
         counter += 1
-        name = dirname + '-{}.txt'.format(counter)
+        name = f"{dirname}-{counter}.txt"
         try:
-            with open(os.path.join(dirname,name), 'w', encoding= 'unicode_escape') as o1:
+            with open(os.path.join(dirname, name),
+                      'w', encoding='unicode_escape') as o1:
                 o1.write(str(content[i]))
-        except:
+        # fewer '\n\n' chunks than blank lines -> IndexError ends the loop
+        except (EnvironmentError, IndexError):
             break
 
+
 def main(path):
-    for roots, dirs, files in os.walk(path,topdown=True):
+    for roots, dirs, files in os.walk(path, topdown=True):
         for name in files:
             dirname = os.path.splitext(name)[0]
-            file = os.path.join(path,name)
-            splitter(file,dirname)    
+            file = os.path.join(roots, name)
+            splitter(file, dirname)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('path', help='Pass a directory to find original licenses')
+    parser.add_argument(
+        'path', help='Pass a directory to find original licenses')
+
     args = parser.parse_args()
-    path = args.path
-
-    if path.isdir():
-        main(path)
-    else:
-        print("Invalid directory")
-
+
+    path = args.path
+    if not os.path.isdir(path):
+        # Stop here instead of falling through: main() would otherwise
+        # still be called with the path that was just rejected.
+        # parser.error() prints usage plus the message and exits.
+        parser.error("Valid directory not provided")
+
+    main(path)
diff --git a/markov/__init__.py b/markov/__init__.py