Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 32 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ In order to install all packages follow the steps below:
1. Move to flask-api folder `cd flask-api`
2. Install virtualenv - `python3 -m pip install --user virtualenv`
3. Create a virtual env - `python3 -m venv env`
4. Activate virtual env - `source env/bin/activate`
4. Activate virtual env
- For Mac/Linux: `source env/bin/activate`
- For Windows: `.\env\Scripts\activate`
5. `pip3 install -r requirements.txt`
6. `flask run`

Expand Down Expand Up @@ -100,7 +102,17 @@ The following is a high-level overview of relevant files and folders.
```
DocVerifier/
├── flask-api/
│ ├── datasets
│ ├── static/uploads
│ ├── model
│ ├── scraper
│ ├── templates
│ ├── .gitignore
│ ├── Procfile
│ ├── nltk.txt
│ ├── requirements.txt
│ ├── runtime.txt
│ ├── output.txt
│ └── app.py

└── custom_greeting/
Expand All @@ -111,18 +123,31 @@ DocVerifier/
│ ├── custom_greeting_assets/
│ │ ├── assets
│ │ └── public
├── dfx.json
├── package.json
|__ webpack.config.js
|__tsconfig.json
|__ .gitignore
├── dfx.json
├── package.json
|__ webpack.config.js
|__ tsconfig.json
|__ canister_ids.json
|__ README.md
|__ package-lock.json
|__ .gitignore
|
|__get-current-tab-url
|__chrome-extension
|_ background.js
|_ icon.png
|_ manifest.json
|_ window.html
|_ icon.svg
|_ style.css
|__images
|_ demo.gif
|__jupyter-notebooks
|_ privacy_policy_predictor.ipynb
|_ web_Scraping.ipynb
|__ .gitignore
|__ CODE_OF_CONDUCT.md
|__ LICENSE
|__ README.md

```

Expand Down
2 changes: 1 addition & 1 deletion chrome-extension/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"manifest_version": 2,
"name": "DocVerifier!",
"description": "This extension verifies any agreement and policies!",
"version": "1.0.0",
"version": "1.0.1",
"browser_action": {
"default_icon": "icon.png",
"default_popup": "window.html"
Expand Down
8 changes: 6 additions & 2 deletions flask-api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,13 @@ def testfn():
# POST request
if request.method == "POST":
tabUrl = request.get_json()["tabUrl"]
url_list = collect_url_links(tabUrl) # parse as JSON
currentTabUrl = tabUrl.split('/')
url = "https://"+currentTabUrl[2]
logger.info("tab url", url)

url_list = collect_url_links(url) # parse as JSON
# get the current tab url
logger.info(url_list)
logger.info("privacy urls", url_list)
for link in url_list:
# get the list of privacy policies
logger.info(link)
Expand Down
1 change: 0 additions & 1 deletion flask-api/output.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions flask-api/scraper/getPolicyText.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@


def getPolicies(url_link):

source = requests.get(url_link).text
soup = BeautifulSoup(source, "lxml")
policies = soup.find("body")
Expand Down
8 changes: 8 additions & 0 deletions flask-api/scraper/getUrls.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,28 @@
from bs4 import BeautifulSoup
import requests
import re
import time
import random

# function to get all the policy urls from a website


def collect_url_links(url_link) -> list:


url_list = []
pattern = re.compile(r'^http')
source = requests.get(url_link).text
# List for Randomizing our request rate
rate = [i/10 for i in range(10)]
soup = BeautifulSoup(source, 'lxml')
a_tag = soup.find_all("a") # Gives you the list of all the a tags
for i in a_tag:
if i.text in ["Privacy", "Terms", "Privacy Policy", "Terms of Service"]:
url = i["href"]
url_list.append(url)
# Randomizing our request rate
time.sleep(random.choice(rate))

for i in range(len(url_list)):
matches = pattern.finditer(url_list[i])
Expand Down