From 2ea54eab1d070e177c7d5ecfcc032b325fbd7c12 Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Thu, 2 May 2024 18:56:37 +0200 Subject: [PATCH 1/7] fix: gemini errror --- examples/gemini/csv_scraper_gemini.py | 13 +++---------- examples/gemini/smart_scraper_gemini.py | 2 +- scrapegraphai/helpers/models_tokens.py | 5 +++-- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/examples/gemini/csv_scraper_gemini.py b/examples/gemini/csv_scraper_gemini.py index c19419b0..7923cf37 100644 --- a/examples/gemini/csv_scraper_gemini.py +++ b/examples/gemini/csv_scraper_gemini.py @@ -19,20 +19,13 @@ text = pd.read_csv("inputs/username.csv") # ************************************************ # Define the configuration for the graph # ************************************************ +gemini_key = os.getenv("GOOGLE_APIKEY") graph_config = { "llm": { - "model": "ollama/mistral", - "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly - # "model_tokens": 2000, # set context length arbitrarily - "base_url": "http://localhost:11434", + "api_key": gemini_key, + "model": "gemini-pro", }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", - } } # ************************************************ diff --git a/examples/gemini/smart_scraper_gemini.py b/examples/gemini/smart_scraper_gemini.py index b3b25024..1319ab95 100644 --- a/examples/gemini/smart_scraper_gemini.py +++ b/examples/gemini/smart_scraper_gemini.py @@ -18,7 +18,7 @@ gemini_key = os.getenv("GOOGLE_APIKEY") graph_config = { "llm": { "api_key": gemini_key, - "model": "gpt-3.5-turbo", + "model": "gemini-pro", }, } diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 73a3999f..55d3e689 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -20,7 +20,9 @@ models_tokens = { "gpt-4-32k-0613": 32768, }, "azure": { - "gpt-3.5-turbo": 4096 
+ "gpt-3.5-turbo": 4096, + "gpt-4": 8192, + "gpt-4-32k": 32768 }, "gemini": { "gemini-pro": 128000, @@ -48,4 +50,3 @@ models_tokens = { "claude3": 200000 } } - From 2b23a3c771ff550b54414329f4c13b5e2acbab16 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 2 May 2024 16:58:06 +0000 Subject: [PATCH 2/7] ci(release): 0.6.1 [skip ci] ## [0.6.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.0...v0.6.1) (2024-05-02) ### Bug Fixes * gemini errror ([2ea54ea](https://github.com/VinciGit00/Scrapegraph-ai/commit/2ea54eab1d070e177c7d5ecfcc032b325fbd7c12)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 87860fbb..4a29f231 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [0.6.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.0...v0.6.1) (2024-05-02) + + +### Bug Fixes + +* gemini errror ([2ea54ea](https://github.com/VinciGit00/Scrapegraph-ai/commit/2ea54eab1d070e177c7d5ecfcc032b325fbd7c12)) + ## [0.6.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.2...v0.6.0) (2024-05-02) diff --git a/pyproject.toml b/pyproject.toml index bb36dd36..800e5c9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "scrapegraphai" -version = "0.6.0" +version = "0.6.1" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
From 967108dfd3bd6a1ca2b378fa8ca8d191027c3f10 Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Thu, 2 May 2024 19:14:09 +0200 Subject: [PATCH 3/7] add example on readme --- README.md | 33 +++++++++++++++++++- examples/azure/smart_scraper_azure_openai.py | 9 ++++-- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d409ee38..d1c8e00d 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,38 @@ result = smart_scraper_graph.run() print(result) ``` -### Case 5: Extracting information using Gemini + +### Case 5: Extracting information using Azure +```python +from langchain_openai import AzureChatOpenAI +from langchain_openai import AzureOpenAIEmbeddings + +llm_model_instance = AzureChatOpenAI( + openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], + azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] +) + +embedder_model_instance = AzureOpenAIEmbeddings( + azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], + openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], +) +graph_config = { + "llm": {"model_instance": llm_model_instance}, + "embeddings": {"model_instance": embedder_model_instance} +} + +smart_scraper_graph = SmartScraperGraph( + prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, + event_end_date, event_end_time, location, event_mode, event_category, + third_party_redirect, no_of_days, + time_in_hours, hosted_or_attending, refreshments_type, + registration_available, registration_link""", + source="https://www.hmhco.com/event", + config=graph_config +) +``` + +### Case 6: Extracting information using Gemini ```python from scrapegraphai.graphs import SmartScraperGraph GOOGLE_APIKEY = "YOUR_API_KEY" diff --git a/examples/azure/smart_scraper_azure_openai.py b/examples/azure/smart_scraper_azure_openai.py index bfcd6b92..53d4a197 100644 --- a/examples/azure/smart_scraper_azure_openai.py +++ 
b/examples/azure/smart_scraper_azure_openai.py @@ -10,7 +10,7 @@ from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info -## required environment variable in .env +# required environment variable in .env # AZURE_OPENAI_ENDPOINT # AZURE_OPENAI_CHAT_DEPLOYMENT_NAME # MODEL_NAME @@ -45,8 +45,11 @@ graph_config = { } smart_scraper_graph = SmartScraperGraph( - prompt="List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, event_end_date, event_end_time, location, event_mode, event_category, third_party_redirect, no_of_days, -time_in_hours, hosted_or_attending, refreshments_type, registration_available, registration_link", + prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, + event_end_date, event_end_time, location, event_mode, event_category, + third_party_redirect, no_of_days, + time_in_hours, hosted_or_attending, refreshments_type, + registration_available, registration_link""", # also accepts a string with the already downloaded HTML code source="https://www.hmhco.com/event", config=graph_config From c8eeff873db6c8d23c9e4109ddee46edaa68b92b Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Thu, 2 May 2024 19:20:28 +0200 Subject: [PATCH 4/7] docs(tree): added roadmap --- README.md | 47 +++++++++++++++++++++++++++++++ docs/roadmap.md | 75 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 docs/roadmap.md diff --git a/README.md b/README.md index d409ee38..06a0fb90 100644 --- a/README.md +++ b/README.md @@ -250,3 +250,50 @@ ScrapeGraphAI is licensed under the MIT License. See the [LICENSE](https://githu - We would like to thank all the contributors to the project and the open-source community for their support. - ScrapeGraphAI is meant to be used for data exploration and research purposes only. 
We are not responsible for any misuse of the library. + +## 📈 Roadmap + + + + + + + +Markmap + + + + + + + + diff --git a/docs/roadmap.md b/docs/roadmap.md new file mode 100644 index 00000000..4f5f8830 --- /dev/null +++ b/docs/roadmap.md @@ -0,0 +1,75 @@ +--- +title: ScrapeGraphAI Roadmap +markmap: + colorFreezeLevel: 2 + maxWidth: 500 +--- + +# **ScrapeGraphAI Roadmap** + +## **Short-Term Goals** + +- Integration with more LLM APIs + +- Test proxy rotation implementation + +- Add more search engines inside the SearchInternetNode + +- Improve the documentation (ReadTheDocs) + - [Issue #102](https://github.com/VinciGit00/Scrapegraph-ai/issues/102) + +- Create tutorials for the library + +## **Medium-Term Goals** + +- Node for handling API requests + +- Improve SearchGraph to look into the first 5 results of the search engine + +- Make scraping more deterministic + - Create DOM tree of the website + - HTML tag text embeddings with tags metadata + - Study tree forks from root node + - How do we use the tags parameters? 
+ +- Create scraping folder with report + - Folder contains .scrape files, DOM tree files, report + - Report could be an HTML page with scraping speed, costs, LLM info, scraped content and DOM tree visualization + - We can use pyecharts with R-markdown + +- Scrape multiple pages of the same website + - Create new node that instantiates multiple graphs at the same time + - Make graphs run in parallel + - Scrape only relevant URLs from user prompt + - Use the multi-dimensional DOM tree of the website for retrieval + - [Issue #112](https://github.com/VinciGit00/Scrapegraph-ai/issues/112) + +- Crawler graph + - Scrape all the URLs with the same domain in all the pages + - Build many DOM trees and link them together + - Save the multi-dimensional tree in a file + +- Compare two DOM trees to assess the similarity + - Save the DOM tree of the scraped website in a file as a sort of cache to be used to compare with future website structure + - Create similarity metrics with multiple DOM trees (overall tree? only relevant tags structure?) + +- Nodes for handling authentication + - Use Selenium or Playwright to handle authentication + - Passes the cookies to the other nodes + +- Nodes that attach to an open browser + - Use Selenium or Playwright to attach to an open browser + - Navigate inside the browser and scrape the content + +- Nodes for taking screenshots and understanding the page layout + - Use Selenium or Playwright to take screenshots + - Use LLM to assess if it is a block-like page, paragraph-like page, etc. 
+ - [Issue #88](https://github.com/VinciGit00/Scrapegraph-ai/issues/88) + +## **Long-Term Goals** + +- Automatic generation of scraping pipelines from a given prompt + +- Create API for the library + +- Fine-tune an LLM for HTML content \ No newline at end of file From 4441505b239fa819032469f148115bb3392b15ea Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Thu, 2 May 2024 19:37:41 +0200 Subject: [PATCH 5/7] docs(roadmap): open contributions --- README.md | 54 ++++------------------------------ docs/{roadmap.md => README.md} | 0 2 files changed, 6 insertions(+), 48 deletions(-) rename docs/{roadmap.md => README.md} (100%) diff --git a/README.md b/README.md index d57f083a..efcf195e 100644 --- a/README.md +++ b/README.md @@ -246,6 +246,11 @@ Please see the [contributing guidelines](https://github.com/VinciGit00/Scrapegra [![My Skills](https://skillicons.dev/icons?i=linkedin)](https://www.linkedin.com/company/scrapegraphai/) [![My Skills](https://skillicons.dev/icons?i=twitter)](https://twitter.com/scrapegraphai) +## 📈 Roadmap +Check out the project roadmap [here](docs/README.md)! 🚀 + +Wanna visualize the roadmap in a more interactive way? Check out the [markmap](https://markmap.js.org/repl/) visualization by copy pasting the markdown content in the editor! + ## ❤️ Contributors [![Contributors](https://contrib.rocks/image?repo=VinciGit00/Scrapegraph-ai)](https://github.com/VinciGit00/Scrapegraph-ai/graphs/contributors) @@ -280,51 +285,4 @@ ScrapeGraphAI is licensed under the MIT License. See the [LICENSE](https://githu ## Acknowledgements - We would like to thank all the contributors to the project and the open-source community for their support. -- ScrapeGraphAI is meant to be used for data exploration and research purposes only. We are not responsible for any misuse of the library. - -## 📈 Roadmap - - - - - - - -Markmap - - - - - - - - +- ScrapeGraphAI is meant to be used for data exploration and research purposes only. 
We are not responsible for any misuse of the library. \ No newline at end of file diff --git a/docs/roadmap.md b/docs/README.md similarity index 100% rename from docs/roadmap.md rename to docs/README.md From faa3498fa7694ee3309eeed479d8f1bc4b1c7b97 Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Thu, 2 May 2024 19:41:05 +0200 Subject: [PATCH 6/7] docs: typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index efcf195e..88171c91 100644 --- a/README.md +++ b/README.md @@ -249,7 +249,7 @@ Please see the [contributing guidelines](https://github.com/VinciGit00/Scrapegra ## 📈 Roadmap Check out the project roadmap [here](docs/README.md)! 🚀 -Wanna visualize the roadmap in a more interactive way? Check out the [markmap](https://markmap.js.org/repl/) visualization by copy pasting the markdown content in the editor! +Wanna visualize the roadmap in a more interactive way? Check out the [markmap](https://markmap.js.org/repl) visualization by copy pasting the markdown content in the editor! 
## ❤️ Contributors [![Contributors](https://contrib.rocks/image?repo=VinciGit00/Scrapegraph-ai)](https://github.com/VinciGit00/Scrapegraph-ai/graphs/contributors) From 2f478f861e889b781e82c39f50445f460466f97f Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 2 May 2024 19:07:31 +0000 Subject: [PATCH 7/7] ci(release): 0.6.2 [skip ci] ## [0.6.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.1...v0.6.2) (2024-05-02) ### Bug Fixes * add to requirements.txt langchain-aws = "^0.1.2" ([1afa319](https://github.com/VinciGit00/Scrapegraph-ai/commit/1afa31910d25b2735abe0ad09dad433d6c2159fb)) ### Docs * **tree:** added roadmap ([c8eeff8](https://github.com/VinciGit00/Scrapegraph-ai/commit/c8eeff873db6c8d23c9e4109ddee46edaa68b92b)) * **roadmap:** open contributions ([4441505](https://github.com/VinciGit00/Scrapegraph-ai/commit/4441505b239fa819032469f148115bb3392b15ea)) * typo ([faa3498](https://github.com/VinciGit00/Scrapegraph-ai/commit/faa3498fa7694ee3309eeed479d8f1bc4b1c7b97)) ### CI * **release:** 0.6.1-beta.1 [skip ci] ([75a4042](https://github.com/VinciGit00/Scrapegraph-ai/commit/75a4042a232a5b69fd38d1666fea9633b4fd015e)) --- CHANGELOG.md | 19 +++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 17505fcd..61557e62 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,22 @@ +## [0.6.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.1...v0.6.2) (2024-05-02) + + +### Bug Fixes + +* add to requirements.txt langchain-aws = "^0.1.2" ([1afa319](https://github.com/VinciGit00/Scrapegraph-ai/commit/1afa31910d25b2735abe0ad09dad433d6c2159fb)) + + +### Docs + +* **tree:** added roadmap ([c8eeff8](https://github.com/VinciGit00/Scrapegraph-ai/commit/c8eeff873db6c8d23c9e4109ddee46edaa68b92b)) +* **roadmap:** open contributions ([4441505](https://github.com/VinciGit00/Scrapegraph-ai/commit/4441505b239fa819032469f148115bb3392b15ea)) +* typo 
([faa3498](https://github.com/VinciGit00/Scrapegraph-ai/commit/faa3498fa7694ee3309eeed479d8f1bc4b1c7b97)) + + +### CI + +* **release:** 0.6.1-beta.1 [skip ci] ([75a4042](https://github.com/VinciGit00/Scrapegraph-ai/commit/75a4042a232a5b69fd38d1666fea9633b4fd015e)) + ## [0.6.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.0...v0.6.1) (2024-05-02) diff --git a/pyproject.toml b/pyproject.toml index 19c41f4d..5ab6d492 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "scrapegraphai" -version = "0.6.1" +version = "0.6.2" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [