From cb05f824ce2f7a0c0898c67e6034af57c3ec3d2a Mon Sep 17 00:00:00 2001 From: ZuanZuan Date: Sat, 7 Sep 2024 17:35:03 +0800 Subject: [PATCH 1/8] Update abstract_graph.py --- scrapegraphai/graphs/abstract_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 0d02b6d4..7b161963 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -128,7 +128,7 @@ def _create_llm(self, llm_config: dict) -> object: return llm_params["model_instance"] known_providers = {"openai", "azure_openai", "google_genai", "google_vertexai", - "ollama", "oneapi", "nvidia", "groq", "anthropic" "bedrock", "mistralai", + "ollama", "oneapi", "nvidia", "groq", "anthropic", "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"} split_model_provider = llm_params["model"].split("/", 1) From a5401394cc939d9a5fc58b8a9145141c2f047bab Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Sun, 8 Sep 2024 01:12:29 +0200 Subject: [PATCH 2/8] =?UTF-8?q?docs(sponsor):=20=F0=9F=85=B1=EF=B8=8F=20Br?= =?UTF-8?q?owserbase=20sponsor=20=F0=9F=85=B1=EF=B8=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 39 +++++++++++++++----------- docs/assets/browserbase_logo.png | Bin 0 -> 3091 bytes docs/source/introduction/overview.rst | 5 ++++ 3 files changed, 28 insertions(+), 16 deletions(-) create mode 100644 docs/assets/browserbase_logo.png diff --git a/README.md b/README.md index 57eb79d0..75cda0eb 100644 --- a/README.md +++ b/README.md @@ -32,27 +32,31 @@ playwright install **Note**: it is recommended to install the library in a virtual environment to avoid conflicts with other libraries 🐱 -By the way if you to use not mandatory modules it is necessary to install by yourself with the following command: +
+Optional Dependencies +Additional dependencies can be added while installing the library: -### Installing "Other Language Models" +- More Language Models: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints. -This group allows you to use additional language models like Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints. -```bash -pip install scrapegraphai[other-language-models] + ```bash + pip install scrapegraphai[other-language-models] + ``` -``` -### Installing "More Semantic Options" +- Semantic Options: this group includes tools for advanced semantic processing, such as Graphviz. + + ```bash + pip install scrapegraphai[more-semantic-options] + ``` + +- Browser Options: this group includes additional browser management tools/services, such as Browserbase. + + ```bash + pip install scrapegraphai[more-browser-options] + ``` + +
-This group includes tools for advanced semantic processing, such as Graphviz. -```bash -pip install scrapegraphai[more-semantic-options] -``` -### Installing "More Browser Options" -This group includes additional browser management options, such as BrowserBase. -```bash -pip install scrapegraphai[more-browser-options] -``` ## 💻 Usage There are multiple standard scraping pipelines that can be used to extract information from a website (or local file). @@ -128,6 +132,9 @@ Check out also the Docusaurus [here](https://scrapegraph-doc.onrender.com/). ## 🏆 Sponsors
+ + Browserbase + SerpAPI diff --git a/docs/assets/browserbase_logo.png b/docs/assets/browserbase_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..bd16f2e13bc65d4cd289791370a0ceaa390fd7b4 GIT binary patch literal 3091 zcmcgu`8O1N7oV{eVvtal#-6e~k+Cmnh8bCkP)vCJlWkPKS}}Yz42Hq?2Av-72=`eilW!u)cjzf56B+@D`kwd3V4P z8@zc$@Ak=c+UsqeTcGtdri;TUfzHd=FMmY{0h~*EURoD(Q!(Y5C8J#n<(mr)wI2n7 z6qd*eK47i$Bz)gJPD29Q=kiaO>fKVht^PEeW{nAoDFi8tdYD_-X-ConH^weLVF$kF zR6?5RJIq`7+!wDFQaxq)V1VXTx!g`$zuPr-X#cddWb1|Qy~e6hkYvgawCC;G zedp5Ezy$SjMaey(Tb$f%&l8CuA98GLyD&RrpF3;LRrMri+u)H>o+*R(WzSb`xCc!Z z25_EN$~#KCE$OG94-1A*-N?wbSbOcx_)?*&PxrUD3h;eY``NfGcJvuT zeC^%mnz7tmc-iI$uf`1jw?hM*)^#7SZ8F}!;kV_xW%xAo8Y0GE`+_L#7a zY#1>?47S?%(7&%H+BPBfFd-t`_VEslM}TaH502ckyWG_=%^JI3u6lnbC}Q#cJ)rm! z0c0X2-BlgJ3w|r*1O86k7IK=n~Lvc#c5Ryv1S58az$x8JcZQR;@aSY1ZQrPlMor>dZiK zdmB24M2SaxaFidm5yipuHp<+TI`R@fFNsow4t>SL^HMP+fiXX7Xtc#E$4a8qpmp0~ zc?ljHr~KuD1zTgEN1^?cMU&st=~4^oUvnbk*hpE15PfGx!V5DCeHHmoVFG|=2F_XD zAz0Nu7F!vltT_t^AnSF^B}J2`%ByQ59ppr_xF#twhV>$@){)0-jj{AeU1<}7NB8Z@ zz0LG)+4)b}T;3t5K1bbMdeFla@jYtAVuTsMHU27ujx9D&tk+;yk&ZviaSfLDD3zG| zIT(-p%8>2Tn?)QhXXYvFpTvwKs`U590*IT+azGMAYP?(3ueapBATC)hAS{KHZ`CIl z@#Nu(OtmSL#E#aMt&)$G6M|Oz!X8^9MC)k=!qS_F@6&plO3yXlxuj zNOKE4!0zf|2*IvrE4AXvAuJR7&0^GDUY+r5G;8?=({=)XI_F1 zcW$_@VinnNi$mYE%Asb^kr(vM$sb+kIgqo zsFTgHXw3-&%zR^*qLy{Q$2BJ@u)`_Rr?_?DW7)Oc1HY*lR+_Tat{_+Q+q8*xW74@w z!tS2=N!NHaT3c-E7f-dy)~|2W5nI3I*hmzIo(`|N9s*6^4&3N!0=sqrbZ4}Qc% z!59`Vjg(8RuWi2g zf!tkqpS$Aj#U~GaJDwbBP=&DQmXt>Pku~`>>55eVnbZC&4_0BfBPSH%!hbY}8rv^B zYbOYHP$bp4#`DKA-Xfbf>-0R0u_%$nT01g)3*)WfgeumJfpewH(a-Vr47r%HO5IuzvT!nlr0&K&y_7n^59 z{%Cfzzd+TQK3dGw#l8dSLQUtT=TME=MimfiR-hi?*A!8B!y@~GOs_zzofs?48Mk6( ze1AdN9|T#z7~^_eNxyvXo`p`EFW=zae0)NBV7?eJL|*ISrYrPDquud|jLqNW$I#xN z5+#S)=UvsX=mI`=`UG_Kv6TNk%p|uv*de#8q)AV3biJv5V9;p;G+|W#D9!xl1+tHU zLbBdJcYVs9ceKXH{`=RNLDW0q+G3M$_-=;H{Y^@rpae&>&h(#KJ%FQWaVCVZ^~xhc zx=5L6*$`}bz(w$AD}bwc2e^lNbjRW84B>rMQ{F8zUjt$^euSO 
zEIUxU{p52cQJ+Gn{HV$m1#d2OY472>b>pYZ|1NR?>*oYdY)iYafU|aI+U0u01yH z8e$$n+Hch&G#`nC2u$q*`c)NGwz@@YTIl&R8KLtuFTxR@v3|2Vi+=#BE`0rT@T_0~sO*k8b7-{d}JVA@%5A2B) zLpJT#KVV%>Q62dzSNgZD3Lg&%ShN`WLbT6b(eBIL2P-H-3Z!ED;*SXrD^;-Qef}o< z8OPRx)4eD~EsULvTWXc=bxkTqLcZ5<_EVjqKk|5ua6LUQxr z0jg%YB)C6nT7G9Bm~Szdkv=TV9&sq1jn_Lmq&DSwO3d93TG+F0AN) literal 0 HcmV?d00001 diff --git a/docs/source/introduction/overview.rst b/docs/source/introduction/overview.rst index 00a76d5d..506770a5 100644 --- a/docs/source/introduction/overview.rst +++ b/docs/source/introduction/overview.rst @@ -82,6 +82,11 @@ FAQ Sponsors ======== +.. image:: ../../assets/browserbase_logo.png + :width: 10% + :alt: Browserbase + :target: https://www.browserbase.com/ + .. image:: ../../assets/serp_api_logo.png :width: 10% :alt: Serp API From 57fd01f9a76ea8ea69ec04b7238ab58ca72ac8f4 Mon Sep 17 00:00:00 2001 From: Tuhin Mallick Date: Sun, 8 Sep 2024 01:41:39 +0200 Subject: [PATCH 3/8] feat(docloaders): Enhance browser_base_fetch function flexibility - Update browser_base_fetch to accept single URL or list of URLs - Add text_content parameter for choosing between text-only and HTML output - Improve type hinting and function documentation - Ensure compatibility with latest Browserbase SDK interface --- scrapegraphai/docloaders/browser_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapegraphai/docloaders/browser_base.py b/scrapegraphai/docloaders/browser_base.py index 318c9f38..5d7beb7d 100644 --- a/scrapegraphai/docloaders/browser_base.py +++ b/scrapegraphai/docloaders/browser_base.py @@ -3,7 +3,7 @@ """ from typing import List -def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[str]: +def browser_base_fetch(api_key: str, project_id: str, link: List[str], text_content: bool = True) -> List[str]: """ BrowserBase Fetch @@ -50,6 +50,6 @@ def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[s result = [] for l in link: - 
result.append(browserbase.load(l, text_content=True)) + result.append(browserbase.load(l, text_content=text_content)) return result From cd4ffd761a939bdedcc635b3915bc0208e2acdf5 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Sun, 8 Sep 2024 07:07:13 +0000 Subject: [PATCH 4/8] ci(release): 1.17.0 [skip ci] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## [1.17.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.16.0...v1.17.0) (2024-09-08) ### Features * **docloaders:** Enhance browser_base_fetch function flexibility ([57fd01f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/57fd01f9a76ea8ea69ec04b7238ab58ca72ac8f4)) ### Docs * **sponsor:** 🅱️ Browserbase sponsor 🅱️ ([a540139](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a5401394cc939d9a5fc58b8a9145141c2f047bab)) --- CHANGELOG.md | 12 ++++++++++++ pyproject.toml | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 509557cf..8caac374 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +## [1.17.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.16.0...v1.17.0) (2024-09-08) + + +### Features + +* **docloaders:** Enhance browser_base_fetch function flexibility ([57fd01f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/57fd01f9a76ea8ea69ec04b7238ab58ca72ac8f4)) + + +### Docs + +* **sponsor:** 🅱️ Browserbase sponsor 🅱️ ([a540139](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/a5401394cc939d9a5fc58b8a9145141c2f047bab)) + ## [1.16.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.15.2...v1.16.0) (2024-09-01) diff --git a/pyproject.toml b/pyproject.toml index 30eb40e1..54a68590 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraphai" -version = "1.16.0" +version = "1.17.0" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, From d56253d183969584cacc0cb164daa0152462f21c Mon Sep 17 00:00:00 2001 From: Tuhin Mallick Date: Sun, 8 Sep 2024 08:49:08 +0000 Subject: [PATCH 5/8] feat(browser_base_fetch): add async_mode to support both synchronous and asynchronous execution - Introduced an async_mode flag to allow users to choose between synchronous and asynchronous fetching using Browserbase. - Refactored common logic (browserbase initialization and result list) to avoid redundancy. - Added internal async handling with asyncio.to_thread() for non-blocking execution in async_mode. - Maintained backward compatibility for existing synchronous functionality. --- scrapegraphai/docloaders/browser_base.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/scrapegraphai/docloaders/browser_base.py b/scrapegraphai/docloaders/browser_base.py index 318c9f38..0d6a78c1 100644 --- a/scrapegraphai/docloaders/browser_base.py +++ b/scrapegraphai/docloaders/browser_base.py @@ -13,6 +13,8 @@ def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[s - `api_key`: The API key provided by BrowserBase. - `project_id`: The ID of the project on BrowserBase where you want to fetch data from. - `link`: The URL or link that you want to fetch data from. + - `text_content`: A boolean flag to specify whether to return only the text content (True) or the full HTML (False). + - `async_mode`: A boolean flag that determines whether the function runs asynchronously (True) or synchronously (False, default). It initializes a Browserbase object with the given API key and project ID, then uses this object to load the specified link. @@ -35,6 +37,8 @@ def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[s api_key (str): The API key provided by BrowserBase. project_id (str): The ID of the project on BrowserBase where you want to fetch data from. 
link (str): The URL or link that you want to fetch data from. + text_content (bool): Whether to return only the text content (True) or the full HTML (False). Defaults to True. + async_mode (bool): Whether to run the function asynchronously (True) or synchronously (False). Defaults to False. Returns: object: The result of the loading operation. @@ -49,7 +53,22 @@ def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[s browserbase = Browserbase(api_key=api_key, project_id=project_id) result = [] - for l in link: - result.append(browserbase.load(l, text_content=True)) + # Define the async fetch logic for individual links + async def _async_fetch_link(l): + return await asyncio.to_thread(browserbase.load, l, text_content=text_content) + + if async_mode: + # Asynchronously process each link + async def _async_browser_base_fetch(): + for l in link: + result.append(await _async_fetch_link(l)) + return result + + # Run the async fetch function + result = asyncio.run(_async_browser_base_fetch()) + else: + # Synchronous logic + for l in link: + result.append(browserbase.load(l, text_content=text_content)) return result From 29ef63d85a8559f46c30f2cb46f8df15f73427b6 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Sun, 8 Sep 2024 08:54:33 +0000 Subject: [PATCH 6/8] ci(release): 1.18.0 [skip ci] ## [1.18.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.17.0...v1.18.0) (2024-09-08) ### Features * **browser_base_fetch:** add async_mode to support both synchronous and asynchronous execution ([d56253d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d56253d183969584cacc0cb164daa0152462f21c)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8caac374..cb1da1c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.18.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.17.0...v1.18.0) (2024-09-08) + + +### 
Features + +* **browser_base_fetch:** add async_mode to support both synchronous and asynchronous execution ([d56253d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d56253d183969584cacc0cb164daa0152462f21c)) + ## [1.17.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.16.0...v1.17.0) (2024-09-08) diff --git a/pyproject.toml b/pyproject.toml index 54a68590..9efd454f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraphai" -version = "1.17.0" +version = "1.18.0" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" }, From 007ff084c68d419fac040d9b5cca3980458cfabc Mon Sep 17 00:00:00 2001 From: Tuhin Mallick Date: Sun, 8 Sep 2024 10:59:04 +0200 Subject: [PATCH 7/8] fix(browser_base_fetch): correct function signature and async_mode handling - Added missing `async_mode` parameter to the function signature. 
--- scrapegraphai/docloaders/browser_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapegraphai/docloaders/browser_base.py b/scrapegraphai/docloaders/browser_base.py index 58bf9e9d..c9413d68 100644 --- a/scrapegraphai/docloaders/browser_base.py +++ b/scrapegraphai/docloaders/browser_base.py @@ -3,7 +3,7 @@ """ from typing import List -def browser_base_fetch(api_key: str, project_id: str, link: List[str], text_content: bool = True) -> List[str]: +def browser_base_fetch(api_key: str, project_id: str, link: List[str], text_content: bool = True, async_mode: bool = False) -> List[str]: """ BrowserBase Fetch From c5ffdef4ff024803accd6c8321577434a5e2e3f8 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Sun, 8 Sep 2024 09:45:47 +0000 Subject: [PATCH 8/8] ci(release): 1.18.1 [skip ci] ## [1.18.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.18.0...v1.18.1) (2024-09-08) ### Bug Fixes * **browser_base_fetch:** correct function signature and async_mode handling ([007ff08](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/007ff084c68d419fac040d9b5cca3980458cfabc)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb1da1c9..0ee2e901 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.18.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.18.0...v1.18.1) (2024-09-08) + + +### Bug Fixes + +* **browser_base_fetch:** correct function signature and async_mode handling ([007ff08](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/007ff084c68d419fac040d9b5cca3980458cfabc)) + ## [1.18.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.17.0...v1.18.0) (2024-09-08) diff --git a/pyproject.toml b/pyproject.toml index 9efd454f..b59e7f77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scrapegraphai" -version = "1.18.0" +version = "1.18.1" description = "A web 
scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" },