diff --git a/CHANGELOG.md b/CHANGELOG.md index dd613838..61557e62 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,31 @@ -## [0.6.1-beta.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.0...v0.6.1-beta.1) (2024-05-02) +## [0.6.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.1...v0.6.2) (2024-05-02) ### Bug Fixes * add to requirements.txt langchain-aws = "^0.1.2" ([1afa319](https://github.com/VinciGit00/Scrapegraph-ai/commit/1afa31910d25b2735abe0ad09dad433d6c2159fb)) + +### Docs + +* **tree:** added roadmap ([c8eeff8](https://github.com/VinciGit00/Scrapegraph-ai/commit/c8eeff873db6c8d23c9e4109ddee46edaa68b92b)) +* **roadmap:** open contributions ([4441505](https://github.com/VinciGit00/Scrapegraph-ai/commit/4441505b239fa819032469f148115bb3392b15ea)) +* typo ([faa3498](https://github.com/VinciGit00/Scrapegraph-ai/commit/faa3498fa7694ee3309eeed479d8f1bc4b1c7b97)) + + +### CI + +* **release:** 0.6.1-beta.1 [skip ci] ([75a4042](https://github.com/VinciGit00/Scrapegraph-ai/commit/75a4042a232a5b69fd38d1666fea9633b4fd015e)) + +## [0.6.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.0...v0.6.1) (2024-05-02) + + + +### Bug Fixes + +* gemini errror ([2ea54ea](https://github.com/VinciGit00/Scrapegraph-ai/commit/2ea54eab1d070e177c7d5ecfcc032b325fbd7c12)) + + ## [0.6.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.2...v0.6.0) (2024-05-02) diff --git a/README.md b/README.md index d409ee38..88171c91 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,38 @@ result = smart_scraper_graph.run() print(result) ``` -### Case 5: Extracting information using Gemini + +### Case 5: Extracting information using Azure +```python +from langchain_openai import AzureChatOpenAI +from langchain_openai import AzureOpenAIEmbeddings + +llm_model_instance = AzureChatOpenAI( + openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], + azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] +) + +embedder_model_instance = 
AzureOpenAIEmbeddings( + azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], + openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], +) +graph_config = { + "llm": {"model_instance": llm_model_instance}, + "embeddings": {"model_instance": embedder_model_instance} +} + +smart_scraper_graph = SmartScraperGraph( + prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, + event_end_date, event_end_time, location, event_mode, event_category, + third_party_redirect, no_of_days, + time_in_hours, hosted_or_attending, refreshments_type, + registration_available, registration_link""", + source="https://www.hmhco.com/event", + config=graph_config +) +``` + +### Case 6: Extracting information using Gemini ```python from scrapegraphai.graphs import SmartScraperGraph GOOGLE_APIKEY = "YOUR_API_KEY" @@ -215,6 +246,11 @@ Please see the [contributing guidelines](https://github.com/VinciGit00/Scrapegra [![My Skills](https://skillicons.dev/icons?i=linkedin)](https://www.linkedin.com/company/scrapegraphai/) [![My Skills](https://skillicons.dev/icons?i=twitter)](https://twitter.com/scrapegraphai) +## 📈 Roadmap +Check out the project roadmap [here](docs/README.md)! 🚀 + +Wanna visualize the roadmap in a more interactive way? Check out the [markmap](https://markmap.js.org/repl) visualization by copy pasting the markdown content in the editor! + ## ❤️ Contributors [![Contributors](https://contrib.rocks/image?repo=VinciGit00/Scrapegraph-ai)](https://github.com/VinciGit00/Scrapegraph-ai/graphs/contributors) @@ -249,4 +285,4 @@ ScrapeGraphAI is licensed under the MIT License. See the [LICENSE](https://githu ## Acknowledgements - We would like to thank all the contributors to the project and the open-source community for their support. -- ScrapeGraphAI is meant to be used for data exploration and research purposes only. We are not responsible for any misuse of the library. 
+- ScrapeGraphAI is meant to be used for data exploration and research purposes only. We are not responsible for any misuse of the library. \ No newline at end of file diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..4f5f8830 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,75 @@ +--- +title: ScrapeGraphAI Roadmap +markmap: + colorFreezeLevel: 2 + maxWidth: 500 +--- + +# **ScrapeGraphAI Roadmap** + +## **Short-Term Goals** + +- Integration with more LLM APIs + +- Test proxy rotation implementation + +- Add more search engines inside the SearchInternetNode + +- Improve the documentation (ReadTheDocs) + - [Issue #102](https://github.com/VinciGit00/Scrapegraph-ai/issues/102) + +- Create tutorials for the library + +## **Medium-Term Goals** + +- Node for handling API requests + +- Improve SearchGraph to look into the first 5 results of the search engine + +- Make scraping more deterministic + - Create DOM tree of the website + - HTML tag text embeddings with tags metadata + - Study tree forks from root node + - How do we use the tags parameters? 
+ +- Create scraping folder with report + - Folder contains .scrape files, DOM tree files, report + - Report could be an HTML page with scraping speed, costs, LLM info, scraped content and DOM tree visualization + - We can use pyecharts with R-markdown + +- Scrape multiple pages of the same website + - Create new node that instantiates multiple graphs at the same time + - Make graphs run in parallel + - Scrape only relevant URLs from user prompt + - Use the multi dimensional DOM tree of the website for retrieval + - [Issue #112](https://github.com/VinciGit00/Scrapegraph-ai/issues/112) + +- Crawler graph + - Scrape all the URLs with the same domain in all the pages + - Build many DOM trees and link them together + - Save the multi dimensional tree in a file + +- Compare two DOM trees to assess the similarity + - Save the DOM tree of the scraped website in a file as a sort of cache to be used to compare with future website structure + - Create similarity metrics with multiple DOM trees (overall tree? only relevant tags structure?) + +- Nodes for handling authentication + - Use Selenium or Playwright to handle authentication + - Pass the cookies to the other nodes + +- Nodes that attach to an open browser + - Use Selenium or Playwright to attach to an open browser + - Navigate inside the browser and scrape the content + +- Nodes for taking screenshots and understanding the page layout + - Use Selenium or Playwright to take screenshots + - Use LLM to assess if it is a block-like page, paragraph-like page, etc. 
+ - [Issue #88](https://github.com/VinciGit00/Scrapegraph-ai/issues/88) + +## **Long-Term Goals** + +- Automatic generation of scraping pipelines from a given prompt + +- Create API for the library + +- Finetune a LLM for html content \ No newline at end of file diff --git a/examples/azure/smart_scraper_azure_openai.py b/examples/azure/smart_scraper_azure_openai.py index bfcd6b92..53d4a197 100644 --- a/examples/azure/smart_scraper_azure_openai.py +++ b/examples/azure/smart_scraper_azure_openai.py @@ -10,7 +10,7 @@ from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info -## required environment variable in .env +# required environment variable in .env # AZURE_OPENAI_ENDPOINT # AZURE_OPENAI_CHAT_DEPLOYMENT_NAME # MODEL_NAME @@ -45,8 +45,11 @@ graph_config = { } smart_scraper_graph = SmartScraperGraph( - prompt="List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, event_end_date, event_end_time, location, event_mode, event_category, third_party_redirect, no_of_days, -time_in_hours, hosted_or_attending, refreshments_type, registration_available, registration_link", + prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, + event_end_date, event_end_time, location, event_mode, event_category, + third_party_redirect, no_of_days, + time_in_hours, hosted_or_attending, refreshments_type, + registration_available, registration_link""", # also accepts a string with the already downloaded HTML code source="https://www.hmhco.com/event", config=graph_config diff --git a/examples/gemini/csv_scraper_gemini.py b/examples/gemini/csv_scraper_gemini.py index c19419b0..7923cf37 100644 --- a/examples/gemini/csv_scraper_gemini.py +++ b/examples/gemini/csv_scraper_gemini.py @@ -19,20 +19,13 @@ text = pd.read_csv("inputs/username.csv") # ************************************************ # Define the configuration for 
the graph # ************************************************ +gemini_key = os.getenv("GOOGLE_APIKEY") graph_config = { "llm": { - "model": "ollama/mistral", - "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly - # "model_tokens": 2000, # set context length arbitrarily - "base_url": "http://localhost:11434", + "api_key": gemini_key, + "model": "gemini-pro", }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - "base_url": "http://localhost:11434", - } } # ************************************************ diff --git a/examples/gemini/smart_scraper_gemini.py b/examples/gemini/smart_scraper_gemini.py index b3b25024..1319ab95 100644 --- a/examples/gemini/smart_scraper_gemini.py +++ b/examples/gemini/smart_scraper_gemini.py @@ -18,7 +18,7 @@ gemini_key = os.getenv("GOOGLE_APIKEY") graph_config = { "llm": { "api_key": gemini_key, - "model": "gpt-3.5-turbo", + "model": "gemini-pro", }, } diff --git a/pyproject.toml b/pyproject.toml index b6f39b23..5ab6d492 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,7 @@ [tool.poetry] name = "scrapegraphai" -version = "0.6.1b1" - +version = "0.6.2" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 3105929c..a9bab3fc 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -20,7 +20,9 @@ models_tokens = { "gpt-4-32k-0613": 32768, }, "azure": { - "gpt-3.5-turbo": 4096 + "gpt-3.5-turbo": 4096, + "gpt-4": 8192, + "gpt-4-32k": 32768 }, "gemini": { "gemini-pro": 128000, @@ -65,4 +67,3 @@ models_tokens = { "cohere.embed-multilingual-v3": 512 } } -