Commit 0a4e02f

Merge branch 'main' into add-workflow

2 parents bedc5fa + 7696253

13 files changed: +456 −155 lines

.github/ISSUE_TEMPLATE/bug_report.md

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**
- OS: [e.g. iOS]
- Browser [e.g. chrome, safari]
- Version [e.g. 22]

**Smartphone (please complete the following information):**
- Device: [e.g. iPhone6]
- OS: [e.g. iOS8.1]
- Browser [e.g. stock browser, safari]
- Version [e.g. 22]

**Additional context**
Add any other context about the problem here.

.github/ISSUE_TEMPLATE/custom.md

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
---
name: Custom issue template
about: Describe this issue template's purpose here.
title: ''
labels: ''
assignees: ''

---

.github/ISSUE_TEMPLATE/feature_request.md

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
# Dependency Review Action
#
# This Action will scan dependency manifest files that change as part of a Pull Request,
# surfacing known-vulnerable versions of the packages declared or updated in the PR.
# Once installed, if the workflow run is marked as required, PRs introducing known-vulnerable
# packages will be blocked from merging.
#
# Source repository: https://github.com/actions/dependency-review-action
# Public documentation: https://docs.github.com/en/code-security/supply-chain-security/understanding-your-software-supply-chain/about-dependency-review#dependency-review-enforcement
name: 'Dependency review'
on:
  pull_request:
    branches: [ "main" ]

# If using a dependency submission action in this workflow this permission will need to be set to:
#
# permissions:
#   contents: write
#
# https://docs.github.com/en/enterprise-cloud@latest/code-security/supply-chain-security/understanding-your-software-supply-chain/using-the-dependency-submission-api
permissions:
  contents: read
  # Write permissions for pull-requests are required for using the `comment-summary-in-pr` option, comment out if you aren't using this option
  pull-requests: write

jobs:
  dependency-review:
    runs-on: ubuntu-latest
    steps:
      - name: 'Checkout repository'
        uses: actions/checkout@v4
      - name: 'Dependency Review'
        uses: actions/dependency-review-action@v4
        # Commonly enabled options, see https://github.com/actions/dependency-review-action#configuration-options for all available options.
        with:
          comment-summary-in-pr: always
          # fail-on-severity: moderate
          # deny-licenses: GPL-1.0-or-later, LGPL-2.0-or-later
          # retry-on-snapshot-warnings: true

.github/workflows/pylint.yml

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
on:
  push:
    paths:
      - 'scrapegraphai/**'
      - '.github/workflows/pylint.yml'

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Install the latest version of rye
        uses: eifinger/setup-rye@v3
      - name: Install dependencies
        run: rye sync --no-lock
      - name: Analysing the code with pylint
        run: rye run pylint-ci
      - name: Check Pylint score
        run: |
          pylint_score=$(rye run pylint-score-ci | grep 'Raw metrics' | awk '{print $4}')
          if (( $(echo "$pylint_score < 8" | bc -l) )); then
            echo "Pylint score is below 8. Blocking commit."
            exit 1
          else
            echo "Pylint score is acceptable."
          fi
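
As a side note, the same score gate can be reproduced locally with pylint's programmatic API instead of parsing console output with grep/awk. The sketch below is illustrative only and is not part of this commit; it assumes pylint >= 2.12 and the `scrapegraphai/` package path configured in the workflow above.

```python
# Illustrative sketch (not part of this commit): enforce the same "score >= 8"
# gate with pylint's Python API instead of parsing console output.
import sys

from pylint.lint import Run

THRESHOLD = 8.0

# exit=False keeps pylint from calling sys.exit() itself so the score can be read.
results = Run(["scrapegraphai"], exit=False)
score = results.linter.stats.global_note  # pylint >= 2.12 exposes stats as an object

if score < THRESHOLD:
    print(f"Pylint score {score:.2f} is below {THRESHOLD}. Blocking commit.")
    sys.exit(1)
print(f"Pylint score {score:.2f} is acceptable.")
```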

README.md

Lines changed: 22 additions & 5 deletions
@@ -7,7 +7,7 @@ The generate schemas can be used to infer from document to use for tables in a d
 
 - **Entity Extraction**: Automatically identifies and extracts entities from PDF files.
 - **Schema Generation**: Constructs a schema based and structure of the extracted entities.
-- **Visualization**: Leverages Graphviz to visualize the extracted schema.
+- **Visualization**: Dynamic schema visualization
 
 ## Quick Start
 
@@ -16,23 +16,40 @@ The generate schemas can be used to infer from document to use for tables in a d
 Before you begin, ensure you have the following installed on your system:
 
 - **Python**: Make sure Python 3.9+ is installed.
-- **Graphviz**: This tool is necessary for visualizing the extracted schema.
+- **Poppler**: This tool is necessary for converting PDF to images.
 
 #### MacOS Installation
 
-To install Graphviz on MacOS, use the following command:
+To install Poppler on MacOS, use the following command:
 
 ```bash
-brew install graphviz
+brew install poppler
+
 ```
 
 #### Linux Installation
 
 To install Graphviz on Linux, use the following command:
 
 ```bash
-sudo apt install graphviz
+sudo apt-get install poppler-utils
 ```
+
+#### Windows
+
+1. Download the latest Poppler release for Windows from [poppler releases](https://github.com/oschwartz10612/poppler-windows/releases/).
+2. Extract the downloaded zip file to a location on your computer (e.g., `C:\Program Files\poppler`).
+3. Add the `bin` directory of the extracted folder to your system's PATH environment variable.
+
+To add to PATH:
+1. Search for "Environment Variables" in the Start menu and open it.
+2. Under "System variables", find and select "Path", then click "Edit".
+3. Click "New" and add the path to the Poppler `bin` directory (e.g., `C:\Program Files\poppler\bin`).
+4. Click "OK" to save the changes.
+
+After installation, restart your terminal or command prompt for the changes to take effect.
+If it doesn't work, try the magic restart button.
+
 #### Installation
 After installing the prerequisites and dependencies, you can start using ScrapeSchema to extract entities and their schema from PDFs.

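One way to confirm the Poppler prerequisite from the README before running the extractor is a quick PDF-to-image conversion from Python. This is an illustrative sketch, not part of this commit; it assumes the `pdf2image` package is available, and `sample.pdf` plus the Windows path are placeholders.

```python
# Illustrative sketch (not part of this commit): check that Poppler is reachable
# by converting the first page of a PDF with pdf2image.
from pdf2image import convert_from_path

pages = convert_from_path(
    "sample.pdf",        # placeholder input file
    dpi=200,
    first_page=1,
    last_page=1,
    # poppler_path=r"C:\Program Files\poppler\bin",  # only needed on Windows if Poppler is not on PATH
)
pages[0].save("sample_page1.png", "PNG")
print(f"Converted {len(pages)} page(s); Poppler is working.")
```
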
pyproject.toml

Lines changed: 6 additions & 0 deletions
@@ -1,4 +1,5 @@
 [project]
+
 name = "scrapeschema"
 version = "0.0.1"
 description = "library for creating ontologies from documents"
@@ -68,3 +69,8 @@ dev-dependencies = [
     "-e file:.[docs]",
     "pylint>=3.2.5",
 ]
+[tool.rye.scripts]
+pylint-local = "pylint scrapegraphai/**/*.py"
+pylint-ci = "pylint --disable=C0114,C0115,C0116 --exit-zero scrapegraphai/**/*.py"
+update-requirements = "python 'manual deployment/autorequirements.py'"
+

requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -1,7 +1,6 @@
 certifi==2024.7.4
 charset-normalizer==3.3.2
 idna==3.8
-pdf2image==1.17.0
 pillow==10.4.0
 python-dotenv==1.0.1
 requests==2.32.3

scrapeschema/extractor.py

Lines changed: 102 additions & 3 deletions
@@ -1,6 +1,10 @@
 from abc import ABC, abstractmethod
 from typing import List, Tuple, Dict, Any
 from .primitives import Entity, Relation
+from .parsers.base_parser import BaseParser
+from .parsers.prompts import DELETE_PROMPT, UPDATE_ENTITIES_PROMPT
+import requests
+import json
 
 class Extractor(ABC):
     @abstractmethod
@@ -15,16 +19,111 @@ def extract_relations(self) -> List[Relation]:
     def entities_json_schema(self) -> Dict[str, Any]:
         pass
 
+    @abstractmethod
+    def update_entities(self, new_entities: List[Entity]) -> List[Entity]:
+        pass
+
 class FileExtractor(Extractor):
-    def __init__(self, file_path: str, parser):
+    def __init__(self, file_path: str, parser: BaseParser):
         self.file_path = file_path
         self.parser = parser
 
     def extract_entities(self) -> List[Entity]:
-        return self.parser.extract_entities(self.file_path)
+        new_entities = self.parser.extract_entities(self.file_path)
+        return self.update_entities(new_entities)
 
     def extract_relations(self) -> List[Relation]:
         return self.parser.extract_relations(self.file_path)
 
     def entities_json_schema(self) -> Dict[str, Any]:
-        return self.parser.entities_json_schema(self.file_path)
+        return self.parser.entities_json_schema(self.file_path)
+
+    def delete_entity_or_relation(self, item_description: str) -> None:
+        """
+        Delete an entity or relation based on user description.
+
+        :param item_description: User's description of the entity or relation to delete
+        """
+        entities_ids = [e.id for e in self.parser.get_entities()]
+        relations_ids = [(r.source, r.target, r.name) for r in self.parser.get_relations()]
+        prompt = DELETE_PROMPT.format(
+            entities=entities_ids,
+            relations=relations_ids,
+            item_description=item_description
+        )
+
+        response = self._get_llm_response(prompt)[8:-3]
+        response_dict = json.loads(response)
+
+        for key, value in response_dict.items():
+            if key == 'Type':
+                if value == 'Entity':
+                    self._delete_entity(response_dict['ID'])
+                elif value == 'Relation':
+                    self._delete_relation(response_dict['ID'])
+
+
+    def _delete_entity(self, entity_id: str) -> None:
+        """Delete an entity and its related relations."""
+        entities = self.parser.get_entities()
+        relations = self.parser.get_relations()
+
+        entities = [e for e in entities if e.id != entity_id]
+        relations = [r for r in relations if r.source != entity_id and r.target != entity_id]
+
+        self.parser.set_entities(entities)
+        self.parser.set_relations(relations)
+        print(f"Entity '{entity_id}' and its related relations have been deleted.")
+
+    def _delete_relation(self, relation_id: str) -> None:
+        """Delete a relation."""
+        relations = self.parser.get_relations()
+
+        source, target, name = eval(relation_id)
+        relations = [r for r in relations if not (r.source == source and r.target == target and r.name == name)]
+
+        self.parser.set_relations(relations)
+        print(f"Relation '{name}' between '{source}' and '{target}' has been deleted.")
+
+    def _get_llm_response(self, prompt: str) -> str:
+        """Get a response from the language model."""
+        payload = {
+            "model": self.parser.get_model(),
+            "temperature": self.parser.get_temperature(),
+            "messages": [
+                {"role": "user", "content": prompt}
+            ],
+        }
+        response = requests.post(self.parser.get_inference_base_url(), headers=self.parser.get_headers(), json=payload)
+        return response.json()['choices'][0]['message']['content']
+
+    def update_entities(self, new_entities: List[Entity]) -> List[Entity]:
+        """
+        Update the existing entities with new entities, integrating and deduplicating as necessary.
+
+        :param new_entities: List of new entities to be integrated
+        :return: Updated list of entities
+        """
+        existing_entities = self.parser.get_entities()
+
+        # Prepare the prompt for the LLM
+        prompt = UPDATE_ENTITIES_PROMPT.format(
+            existing_entities=json.dumps([e.__dict__ for e in existing_entities], indent=2),
+            new_entities=json.dumps([e.__dict__ for e in new_entities], indent=2)
+        )

+        # Get the LLM response
+        response = self._get_llm_response(prompt)
+
+        try:
+            updated_entities_data = json.loads(response)
+            updated_entities = [Entity(**entity_data) for entity_data in updated_entities_data]
+
+            # Update the parser's entities
+            self.parser.set_entities(updated_entities)
+
+            print(f"Entities updated. New count: {len(updated_entities)}")
+            return updated_entities
+        except json.JSONDecodeError:
+            print("Error: Unable to parse the LLM response.")
+            return existing_entities

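For orientation, here is a minimal usage sketch of the new update/delete flow. It is not part of the diff: the `PDFParser` class, the import paths, and the `OPENAI_APIKEY` variable name are assumptions (the base class docstring only hints at a PDF parser).

```python
# Illustrative usage sketch (not part of this commit); imports, PDFParser and the
# environment-variable name are assumptions rather than confirmed project APIs.
import os

from scrapeschema import FileExtractor       # assumed public import path
from scrapeschema.parsers import PDFParser   # assumed concrete BaseParser subclass

api_key = os.environ["OPENAI_APIKEY"]        # assumed variable name
parser = PDFParser(api_key)                  # picks up the new gpt-4o-2024-08-06 default

extractor = FileExtractor("./documents/sample.pdf", parser)

# extract_entities() now merges freshly parsed entities into the parser's
# existing ones via update_entities() before returning them.
entities = extractor.extract_entities()
relations = extractor.extract_relations()

# Delete by natural-language description; the LLM resolves it to an entity ID
# or a (source, target, name) relation tuple.
extractor.delete_entity_or_relation("the duplicated invoice entity")
```

Note that the `[8:-3]` slice in `delete_entity_or_relation` strips a leading Markdown code fence and the closing backticks from the model output, so this flow assumes the LLM returns its JSON inside a fenced block.
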
scrapeschema/parsers/base_parser.py

Lines changed: 11 additions & 7 deletions
@@ -3,7 +3,7 @@
 from ..primitives import Entity, Relation
 
 class BaseParser(ABC):
-    def __init__(self, api_key: str, inference_base_url: str = "https://api.openai.com/v1/chat/completions", model: str = "gpt-4o", temperature: float = 0.0):
+    def __init__(self, api_key: str, inference_base_url: str = "https://api.openai.com/v1/chat/completions", model: str = "gpt-4o-2024-08-06", temperature: float = 0.0):
         """
         Initializes the PDFParser with an API key.
 
@@ -16,9 +16,9 @@ def __init__(self, api_key: str, inference_base_url: str = "https://api.openai.c
             "Authorization": f"Bearer {self._api_key}"
         }
 
-        self.inference_base_url = inference_base_url
-        self.model = model
-        self.temperature = temperature
+        self._inference_base_url = inference_base_url
+        self._model = model
+        self._temperature = temperature
         self._entities = []
         self._relations = []
 
@@ -34,15 +34,20 @@ def extract_relations(self, file_path: str) -> List[Relation]:
     def entities_json_schema(self, file_path: str) -> Dict[str, Any]:
         pass
 
-
     def get_api_key(self):
         return self._api_key
 
     def get_headers(self):
         return self._headers
 
+    def get_model(self):
+        return self._model
+
+    def get_temperature(self):
+        return self._temperature
+
     def get_inference_base_url(self):
-        return self.inference_base_url
+        return self._inference_base_url
 
     def set_api_key(self, api_key: str):
         self._api_key = api_key
@@ -68,4 +73,3 @@ def set_relations(self, relations: List[Relation]):
         if not isinstance(relations, list) or not all(isinstance(relation, Relation) for relation in relations):
             raise TypeError("relations must be a List of Relation objects")
         self._relations = relations
-

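Because the constructor now stores the model, temperature, and base URL in private attributes, subclasses are expected to go through the new getters. A minimal sketch of a concrete parser follows; it is not part of the diff, the import paths are assumptions, the extraction bodies are placeholders, and any abstract methods not visible in these hunks would also need implementing.

```python
# Minimal sketch of a concrete BaseParser subclass (not part of this commit).
# Import paths are assumed; only the getter usage reflects the diff above.
from typing import Any, Dict, List

from scrapeschema.parsers.base_parser import BaseParser  # assumed import path
from scrapeschema.primitives import Entity, Relation     # assumed import path


class EchoParser(BaseParser):
    """Toy parser that returns nothing; it only demonstrates the accessor API."""

    def extract_entities(self, file_path: str) -> List[Entity]:
        # A real parser would POST the file's content to get_inference_base_url()
        # using get_headers(), get_model() and get_temperature().
        print(f"Would parse {file_path} with model={self.get_model()} "
              f"at temperature={self.get_temperature()}")
        return []

    def extract_relations(self, file_path: str) -> List[Relation]:
        return []

    def entities_json_schema(self, file_path: str) -> Dict[str, Any]:
        return {}


parser = EchoParser(api_key="sk-placeholder")  # default model: gpt-4o-2024-08-06
print(parser.get_inference_base_url())         # https://api.openai.com/v1/chat/completions
```
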