Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# ScrapeSchema

ScrapeSchema is a Python-based library designed to extract entities and relationships from files.
![graph](docs/assets/graph_pyecharts.png)

The generated schemas can be inferred from documents and used to design database tables or to build knowledge graphs.

## Features
Expand Down
14 changes: 0 additions & 14 deletions canvas_to_use_the lib.py

This file was deleted.

Binary file added docs/assets/graph_pyecharts.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
43 changes: 43 additions & 0 deletions examples/example_renderer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from scrapeschema import Entity, Relation
from scrapeschema.renderers import PyechartsRenderer

# Sample data for the renderer demo: two people and one company, each
# carrying a nested mapping ("address" / "headquarters") among its attributes.
_entity_rows = [
    ("1", "Person", {
        "name": "Alice",
        "age": 30,
        "address": {"city": "New York", "zip": "10001"},
    }),
    ("2", "Person", {
        "name": "Bob",
        "age": 40,
        "address": {"city": "Los Angeles", "zip": "90001"},
    }),
    ("3", "Company", {
        "name": "Acme Corp",
        "industry": "Tech",
        "headquarters": {"city": "San Francisco", "zip": "94105"},
    }),
]

# Build the Entity objects from the rows above.
entities = [
    Entity(id=entity_id, type=entity_type, attributes=attrs)
    for entity_id, entity_type, attrs in _entity_rows
]

# Edges of the graph: how the three entities relate to one another.
_relation_rows = [
    ("r1", "1", "2", "Friend"),
    ("r2", "1", "3", "Employee"),
    ("r3", "2", "3", "Employer"),
]
relations = [
    Relation(id=rel_id, source=src, target=dst, name=label)
    for rel_id, src, dst, label in _relation_rows
]

# Configure the renderer: node repulsion controls layout spacing.
renderer = PyechartsRenderer(repulsion=2000, title="Graph Example with Nested Entities")

# Render the entity-relationship graph to an HTML file.
graph = renderer.render(entities, relations, output_path="graph_nested.html")
11 changes: 8 additions & 3 deletions examples/extract_entities_json_schema_from_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,16 @@ def main():
load_dotenv() # Load environment variables from .env file
api_key = os.getenv("OPENAI_API_KEY")

# Path to your PDF file
pdf_path = "./test.pdf"
# get current directory
curr_dirr = os.path.dirname(os.path.abspath(__file__))
pdf_name = "test.pdf"
pdf_path = os.path.join(curr_dirr, pdf_name)

# Create a PDFParser instance with the API key
pdf_parser = PDFParser(api_key)
pdf_parser = PDFParser(
api_key=api_key,
model="gpt-4o-mini"
)

# Create a FileExtractor instance with the PDF parser
pdf_extractor = FileExtractor(pdf_path, pdf_parser)
Expand Down
32 changes: 22 additions & 10 deletions examples/extract_entities_relations_from_pdf.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
from scrapeschema import FileExtractor, PDFParser
from scrapeschema.renderers import PyechartsRenderer
import os
from dotenv import load_dotenv
load_dotenv() # Load environment variables from .env file

def main():
load_dotenv() # Load environment variables from .env file
api_key = os.getenv("OPENAI_API_KEY")
# Get the OpenAI API key from the environment variables
api_key = os.getenv("OPENAI_API_KEY")

# get current directory
curr_dirr = os.path.dirname(os.path.abspath(__file__))

# Path to your PDF file
pdf_path = "./test.pdf"
def main():
# Path to the PDF file
pdf_name = "test.pdf"
pdf_path = os.path.join(curr_dirr, pdf_name)

# Create a PDFParser instance with the API key
pdf_parser = PDFParser(api_key)
Expand All @@ -17,12 +23,18 @@ def main():

# Extract entities from the PDF
entities = pdf_extractor.extract_entities()
relations = pdf_extractor.extract_relations()

print(entities)
# Initialize the PyechartsRenderer
renderer = PyechartsRenderer(repulsion=2000, title="Entity-Relationship Graph")

relations = pdf_extractor.extract_relations()
print(relations)

# Render the graph using the provided nodes and links
graph = renderer.render(entities, relations, output_path="graph.html")

print(graph)

if __name__ == "__main__":
main()
main()



18 changes: 10 additions & 8 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
[project]

name = "scrapeschema"
version = "0.0.1"
description = "library for creating ontologies from documents"
description = "Library for creating ontologies from documents using LLM"
authors = [
{ name = "Marco Vinciguerra", email = "mvincig11@gmail.com" },
{ name = "Marco Perini", email = "perinim.98@gmail.com" },
Expand All @@ -13,7 +12,6 @@ dependencies = [
"certifi==2024.7.4",
"charset-normalizer==3.3.2",
"idna==3.8",
"pdf2image==1.17.0",
"pillow==10.4.0",
"python-dotenv==1.0.1",
"requests==2.32.3",
Expand All @@ -26,6 +24,10 @@ homepage = "https://scrapegraphai.com/"
repository = "https://github.com/ScrapeGraphAI/ScrapeSchema"
documentation = ""
keywords = [
"scrapeschema",
"ontologies",
"documents",
"knowledge graph",
"scrapegraph",
"scrapegraphai",
"langchain",
Expand Down Expand Up @@ -53,7 +55,7 @@ classifiers = [
requires-python = ">=3.9,<4.0"

[project.optional-dependencies]
burr = ["burr[start]==0.22.1"]
renderers = ["pyecharts==2.0.6"]
docs = ["sphinx==6.0", "furo==2024.5.6"]

[build-system]
Expand All @@ -65,12 +67,12 @@ managed = true
dev-dependencies = [
"pytest==8.0.0",
"pytest-mock==3.14.0",
"-e file:.[burr]",
"-e file:.[renderers]",
"-e file:.[docs]",
"pylint>=3.2.5",
]

[tool.rye.scripts]
pylint-local = "pylint scrapegraphai/**/*.py"
pylint-ci = "pylint --disable=C0114,C0115,C0116 --exit-zero scrapegraphai/**/*.py"
pylint-local = "pylint scrapeschema/**/*.py"
pylint-ci = "pylint --disable=C0114,C0115,C0116 --exit-zero scrapeschema/**/*.py"
update-requirements = "python 'manual deployment/autorequirements.py'"

Loading