Skip to content

Fixed functions, indentation, docstyle, docstrings, try-except. Included argparse checks, code optimizations #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions Download-licenses-Script/database-foss.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
Expand All @@ -20,15 +20,17 @@
import json
import os


def main():
    """Download the FOSSology license reference dataset.

    Fetches licenseRef.json from the FOSSology repository and writes one
    file per license — named after its ``rf_shortname`` and containing
    its ``rf_text`` — into ../Original-DB-Foss-Dataset.
    """
    # os.path.join instead of a hard-coded "\\" so the script also works
    # on POSIX systems, not only Windows.
    download = os.path.join("..", "Original-DB-Foss-Dataset")
    os.makedirs(download, exist_ok=True)
    url = ('https://raw.githubusercontent.com/fossology/fossology/'
           'master/install/db/licenseRef.json')
    response = urlopen(url)
    data_json = json.loads(response.read())
    for licenses in data_json:
        out_path = os.path.join(download, licenses["rf_shortname"])
        # utf-8 because license texts routinely contain non-ASCII characters.
        with open(out_path, 'w', encoding='utf-8') as o1:
            o1.write(licenses["rf_text"])


if __name__ == "__main__":
    main()
44 changes: 24 additions & 20 deletions Download-licenses-Script/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
Expand All @@ -20,25 +20,29 @@
import json
import os


def extract_exceptions():
    """Download every SPDX license-exception text.

    There are 41 SPDX exception files in licenseListVersion 3.13; the
    index lives at https://spdx.org/licenses/exceptions.json. Each
    entry's relative "reference" URL is resolved against the SPDX base
    URL, fetched, and its ``licenseExceptionText`` is written to a file
    named after the ``licenseExceptionId`` in ../Original-SPDX-Dataset.
    """
    # os.path.join instead of a hard-coded "\\" so the script also works
    # on POSIX systems, not only Windows.
    download = os.path.join("..", "Original-SPDX-Dataset")
    os.makedirs(download, exist_ok=True)
    index_url = 'https://spdx.org/licenses/exceptions.json'
    index = json.loads(urlopen(index_url).read())

    # "exc" rather than "license" to avoid shadowing the builtin.
    for exc in index["exceptions"]:
        # The index stores relative references ("./<id>.json"); replace
        # the leading "./" with the absolute SPDX base URL.
        detail_url = exc["reference"].replace(
            "./", "https://spdx.org/licenses/", 1)
        detail = json.loads(urlopen(detail_url).read())

        out_path = os.path.join(download, exc["licenseExceptionId"])
        # utf-8 (like the sibling scripts) so non-ASCII exception text
        # does not fail under a locale-dependent default encoding.
        with open(out_path, 'w', encoding='utf-8') as o1:
            o1.write(detail["licenseExceptionText"])


if __name__ == "__main__":
    extract_exceptions()
41 changes: 22 additions & 19 deletions Download-licenses-Script/spdx.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
Expand All @@ -20,24 +20,27 @@
import json
import os


def extract_spdx():
    """Download every SPDX license text.

    There are 460 SPDX license files in licenseListVersion 3.13; the
    index lives at https://spdx.org/licenses/licenses.json. Each entry's
    per-license JSON (its "detailsUrl") is fetched and its
    ``licenseText`` is written to a file named after the ``licenseId``
    in ../Original-SPDX-Dataset.
    """
    # os.path.join instead of a hard-coded "\\" so the script also works
    # on POSIX systems, not only Windows.
    download = os.path.join("..", "Original-SPDX-Dataset")
    os.makedirs(download, exist_ok=True)
    index_url = 'https://spdx.org/licenses/licenses.json'
    index = json.loads(urlopen(index_url).read())

    # "lic" rather than "license" to avoid shadowing the builtin.
    for lic in index["licenses"]:
        # BUG FIX: the original re-fetched the *index* (licenses.json)
        # for every license and then read "licenseText" from it, a key
        # the index does not contain (KeyError). The per-license JSON
        # is at the entry's "detailsUrl".
        detail = json.loads(urlopen(lic["detailsUrl"]).read())

        out_path = os.path.join(download, lic["licenseId"])
        with open(out_path, 'w', encoding='utf-8') as o1:
            o1.write(detail["licenseText"])


if __name__ == "__main__":
    extract_spdx()
24 changes: 18 additions & 6 deletions Script-Initial-Split/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
Expand All @@ -20,22 +20,34 @@
import pandas as pd
import argparse


def main(path):
    """Write Count.csv listing each subdirectory of *path* and its file count.

    Parameters:
        path: directory whose immediate subdirectories are counted.

    Side effects:
        Writes "Count.csv" (columns "files" and "count") into the
        current working directory.
    """
    names = []
    counts = []
    for entry in os.listdir(path):
        full = os.path.join(path, entry)
        # Skip plain files: only directories have a file count
        # (os.listdir on a file would raise NotADirectoryError).
        if not os.path.isdir(full):
            continue
        names.append(entry)
        counts.append(len(os.listdir(full)))

    data = pd.DataFrame({"files": names, "count": counts})
    data.to_csv("Count.csv", index=False)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'path', help='Pass a directory to find original licenses')
    args = parser.parse_args()

    # Fail fast with an argparse error: the previous version printed
    # "Valid directory not provided" but then ran main(path) anyway.
    if not os.path.isdir(args.path):
        parser.error("Valid directory not provided")

    main(args.path)
85 changes: 50 additions & 35 deletions Script-Initial-Split/initial_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,86 +10,101 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""


import re
import os
import argparse

def splitter(file, dirname):
    """Split *file* into cumulative sentence combinations and write them out.

    The file is read as one text, split on ". " into sentence-like
    chunks, and cumulative combinations of those chunks are collected in
    *history* (deduplicated and capped at roughly 1000 unique entries)
    before being handed to generate_files.

    Parameters:
        file: path of the license text file to split.
        dirname: output directory (and filename prefix) for the pieces.
    """
    history = []
    with open(file, 'r', encoding='unicode_escape') as f:
        content = f.readlines()

    # Join the lines into one text and re-split on sentence boundaries;
    # trailing "\n" and other whitespace are normalized per fragment below.
    text = " ".join(content)
    content = [x.strip() for x in text.split(". ")]

    para = ""
    for comb in range(1, len(content)):
        for i in range(0, len(content) - comb + 1, comb):
            # Deduplicate periodically; stop once enough unique
            # fragments have been collected.
            if len(history) > 1000:
                history = list(set(history))
                if len(history) > 1000:
                    break
            para = para + " " + content[i]
            # Raw string: "\s" in a plain literal is an invalid-escape warning.
            para = re.sub(r"\s\s+", " ", para).strip()
            if para not in history:
                history.append(para)
    history = list(set(history))
    generate_files(file, history, dirname)

def generate_files(file, history, dirname):
    """Write each accumulated text in *history* to its own numbered file.

    Files are named "<dirname>-<n>.txt" inside the *dirname* directory;
    numbering then continues into naive_approach, which adds the
    paragraph-level splits of the original *file*.
    """
    os.makedirs(dirname, exist_ok=True)
    counter = 0
    for index, texts in enumerate(history, start=1):
        counter = index
        name = f"{dirname}-{counter}.txt"
        out_path = os.path.join(dirname, name)
        with open(out_path, 'w', encoding='unicode_escape') as o1:
            o1.write(texts)
    naive_approach(file, dirname, counter)

def naive_approach(file, dirname, counter):
    """Split *file* on blank lines and write each paragraph to a numbered file.

    Parameters:
        file: path of the text file to split.
        dirname: output directory (and filename prefix).
        counter: last file number already used; numbering continues from it.
    """
    os.makedirs(dirname, exist_ok=True)

    with open(file, 'r', encoding='unicode_escape') as f:
        contents = f.read()

    # BUG FIX: the original estimated the paragraph count from the
    # number of whitespace-only lines and indexed content[i] with it;
    # consecutive blank lines make that estimate exceed the number of
    # '\n\n'-separated pieces, and the narrowed `except EnvironmentError`
    # no longer caught the resulting IndexError. Iterating the split
    # pieces directly cannot overrun.
    for paragraph in contents.split('\n\n'):
        counter += 1
        name = f"{dirname}-{counter}.txt"
        out_path = os.path.join(dirname, name)
        with open(out_path, 'w', encoding='unicode_escape') as o1:
            o1.write(paragraph)


def main(path):
    """Run the splitter over every file found under *path*, recursively.

    Each file's splits go into a directory named after the file's stem.
    """
    for root, _dirs, files in os.walk(path, topdown=True):
        for name in files:
            dirname = os.path.splitext(name)[0]
            # BUG FIX: join with the directory the file was found in
            # (root), not the top-level path — files located in
            # subdirectories were previously opened from the wrong place.
            file = os.path.join(root, name)
            splitter(file, dirname)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'path', help='Pass a directory to find original licenses')
    args = parser.parse_args()

    # Fail fast with an argparse error: the previous version printed
    # "Valid directory not provided" but then ran main(path) anyway.
    if not os.path.isdir(args.path):
        parser.error("Valid directory not provided")

    main(args.path)
Empty file added markov/__init__.py
Empty file.
42 changes: 29 additions & 13 deletions markov/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,37 +10,53 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""

import os
from os import walk
from os.path import splitext
from os.path import join
import pandas as pd
import re
import string


def preprocessing_text(text):
    """Normalize license text for comparison.

    Removes words containing digits, all punctuation, stand-alone single
    letters and stray double quotes, then collapses every whitespace run
    to a single space.

    Parameters:
        text: raw license text.

    Returns:
        The cleaned, single-spaced string.
    """
    # Drop any word containing a digit (version numbers, years, ...).
    # Raw strings throughout: "\w" etc. in plain literals are
    # invalid-escape warnings in modern Python.
    text = re.sub(r'\w*\d\w*', '', text)
    # Collapse blank lines before stripping the ends.
    text = re.sub(r"\n+", "\n", text)
    text = text.strip()
    # Parentheses first (redundant with the translate below, kept for
    # parity with the original), then all remaining punctuation.
    text = re.sub(r"[()]", "", text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Stand-alone single letters carry no signal.
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.replace('"', '')


def read_directory(path):
    """Recursively collect the paths of all .txt files under *path*.

    The extension check is case-insensitive (".txt", ".TXT", ...).

    Parameters:
        path: root directory to walk.

    Returns:
        list of full file paths.
    """
    barlist = list()
    for root, _dirs, files in os.walk(path):
        for f in files:
            if os.path.splitext(f)[1].lower() == ".txt":
                barlist.append(os.path.join(root, f))
    return barlist


def file_vocab(filename, dataset_dir=os.path.join("..", "Original-SPDX-Dataset")):
    """Read the reference license text for *filename*.

    Parameters:
        filename: license id, without the ".txt" extension.
        dataset_dir: directory holding the reference dataset. Defaults
            to the previously hard-coded location, so existing callers
            are unaffected; tests and other datasets can now override it.

    Returns:
        The full contents of "<dataset_dir>/<filename>.txt".
    """
    vfile = os.path.join(dataset_dir, filename + '.txt')
    with open(vfile, 'r', encoding='unicode_escape') as f:
        return f.read()


def file_regex(filepath, regexcsv):
    """Look up the regex associated with a file's license directory.

    The license name is taken from the name of the directory that
    contains *filepath* (its second-to-last path component).

    Parameters:
        filepath: path of a split file, e.g. ".../<license>/<piece>.txt".
        regexcsv: CSV file with "Licenses" and "Regex" columns.

    Returns:
        The matching regex string, or "" when the license has no entry.
    """
    # Idiomatic replacement for the original split/join dance: the
    # base name of the parent directory.
    licensename = os.path.basename(os.path.dirname(filepath))
    df = pd.read_csv(regexcsv)
    var = df.loc[df.Licenses == licensename, 'Regex']
    if var.empty:
        return ""
    return var.values[0]
Loading