mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
commit
1219caa4ff
23
CHANGELOG.md
23
CHANGELOG.md
@ -1,10 +1,31 @@
|
||||
## [0.6.1-beta.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.0...v0.6.1-beta.1) (2024-05-02)
|
||||
## [0.6.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.1...v0.6.2) (2024-05-02)
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* add to requirements.txt langchain-aws = "^0.1.2" ([1afa319](https://github.com/VinciGit00/Scrapegraph-ai/commit/1afa31910d25b2735abe0ad09dad433d6c2159fb))
|
||||
|
||||
|
||||
### Docs
|
||||
|
||||
* **tree:** added roadmap ([c8eeff8](https://github.com/VinciGit00/Scrapegraph-ai/commit/c8eeff873db6c8d23c9e4109ddee46edaa68b92b))
|
||||
* **roadmap:** open contributions ([4441505](https://github.com/VinciGit00/Scrapegraph-ai/commit/4441505b239fa819032469f148115bb3392b15ea))
|
||||
* typo ([faa3498](https://github.com/VinciGit00/Scrapegraph-ai/commit/faa3498fa7694ee3309eeed479d8f1bc4b1c7b97))
|
||||
|
||||
|
||||
### CI
|
||||
|
||||
* **release:** 0.6.1-beta.1 [skip ci] ([75a4042](https://github.com/VinciGit00/Scrapegraph-ai/commit/75a4042a232a5b69fd38d1666fea9633b4fd015e))
|
||||
|
||||
## [0.6.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.0...v0.6.1) (2024-05-02)
|
||||
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* gemini error ([2ea54ea](https://github.com/VinciGit00/Scrapegraph-ai/commit/2ea54eab1d070e177c7d5ecfcc032b325fbd7c12))
|
||||
|
||||
|
||||
## [0.6.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.5.2...v0.6.0) (2024-05-02)
|
||||
|
||||
|
||||
|
||||
40
README.md
40
README.md
@ -168,7 +168,38 @@ result = smart_scraper_graph.run()
|
||||
print(result)
|
||||
```
|
||||
|
||||
### Case 5: Extracting information using Gemini
|
||||
|
||||
### Case 5: Extracting information using Azure
|
||||
```python
|
||||
from langchain_openai import AzureChatOpenAI
|
||||
from langchain_openai import AzureOpenAIEmbeddings
|
||||
|
||||
llm_model_instance = AzureChatOpenAI(
|
||||
openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
|
||||
azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]
|
||||
)
|
||||
|
||||
embedder_model_instance = AzureOpenAIEmbeddings(
|
||||
azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
|
||||
openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
|
||||
)
|
||||
graph_config = {
|
||||
"llm": {"model_instance": llm_model_instance},
|
||||
"embeddings": {"model_instance": embedder_model_instance}
|
||||
}
|
||||
|
||||
smart_scraper_graph = SmartScraperGraph(
|
||||
prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time,
|
||||
event_end_date, event_end_time, location, event_mode, event_category,
|
||||
third_party_redirect, no_of_days,
|
||||
time_in_hours, hosted_or_attending, refreshments_type,
|
||||
registration_available, registration_link""",
|
||||
source="https://www.hmhco.com/event",
|
||||
config=graph_config
|
||||
)
|
||||
```
|
||||
|
||||
### Case 6: Extracting information using Gemini
|
||||
```python
|
||||
from scrapegraphai.graphs import SmartScraperGraph
|
||||
GOOGLE_APIKEY = "YOUR_API_KEY"
|
||||
@ -215,6 +246,11 @@ Please see the [contributing guidelines](https://github.com/VinciGit00/Scrapegra
|
||||
[](https://www.linkedin.com/company/scrapegraphai/)
|
||||
[](https://twitter.com/scrapegraphai)
|
||||
|
||||
## 📈 Roadmap
|
||||
Check out the project roadmap [here](docs/README.md)! 🚀
|
||||
|
||||
Want to visualize the roadmap in a more interactive way? Check out the [markmap](https://markmap.js.org/repl) visualization by copying and pasting the markdown content into the editor!
|
||||
|
||||
## ❤️ Contributors
|
||||
[](https://github.com/VinciGit00/Scrapegraph-ai/graphs/contributors)
|
||||
|
||||
@ -249,4 +285,4 @@ ScrapeGraphAI is licensed under the MIT License. See the [LICENSE](https://githu
|
||||
## Acknowledgements
|
||||
|
||||
- We would like to thank all the contributors to the project and the open-source community for their support.
|
||||
- ScrapeGraphAI is meant to be used for data exploration and research purposes only. We are not responsible for any misuse of the library.
|
||||
- ScrapeGraphAI is meant to be used for data exploration and research purposes only. We are not responsible for any misuse of the library.
|
||||
75
docs/README.md
Normal file
75
docs/README.md
Normal file
@ -0,0 +1,75 @@
|
||||
---
|
||||
title: ScrapeGraphAI Roadmap
|
||||
markmap:
|
||||
colorFreezeLevel: 2
|
||||
maxWidth: 500
|
||||
---
|
||||
|
||||
# **ScrapeGraphAI Roadmap**
|
||||
|
||||
## **Short-Term Goals**
|
||||
|
||||
- Integration with more LLM APIs
|
||||
|
||||
- Test proxy rotation implementation
|
||||
|
||||
- Add more search engines inside the SearchInternetNode
|
||||
|
||||
- Improve the documentation (ReadTheDocs)
|
||||
- [Issue #102](https://github.com/VinciGit00/Scrapegraph-ai/issues/102)
|
||||
|
||||
- Create tutorials for the library
|
||||
|
||||
## **Medium-Term Goals**
|
||||
|
||||
- Node for handling API requests
|
||||
|
||||
- Improve SearchGraph to look into the first 5 results of the search engine
|
||||
|
||||
- Make scraping more deterministic
|
||||
- Create DOM tree of the website
|
||||
- HTML tag text embeddings with tags metadata
|
||||
- Study tree forks from root node
|
||||
- How do we use the tags parameters?
|
||||
|
||||
- Create scraping folder with report
|
||||
- Folder contains .scrape files, DOM tree files, report
|
||||
- Report could be an HTML page with scraping speed, costs, LLM info, scraped content and DOM tree visualization
|
||||
- We can use pyecharts with R-markdown
|
||||
|
||||
- Scrape multiple pages of the same website
|
||||
- Create a new node that instantiates multiple graphs at the same time
|
||||
- Make graphs run in parallel
|
||||
- Scrape only relevant URLs from user prompt
|
||||
- Use the multi dimensional DOM tree of the website for retrieval
|
||||
- [Issue #112](https://github.com/VinciGit00/Scrapegraph-ai/issues/112)
|
||||
|
||||
- Crawler graph
|
||||
- Scrape all the URLs with the same domain in all the pages
|
||||
- Build many DOM trees and link them together
|
||||
- Save the multi dimensional tree in a file
|
||||
|
||||
- Compare two DOM trees to assess the similarity
|
||||
- Save the DOM tree of the scraped website in a file as a sort of cache to be used to compare with future website structure
|
||||
- Create similarity metrics with multiple DOM trees (overall tree? only relevant tags structure?)
|
||||
|
||||
- Nodes for handling authentication
|
||||
- Use Selenium or Playwright to handle authentication
|
||||
- Pass the cookies to the other nodes
|
||||
|
||||
- Nodes that attach to an open browser
|
||||
- Use Selenium or Playwright to attach to an open browser
|
||||
- Navigate inside the browser and scrape the content
|
||||
|
||||
- Nodes for taking screenshots and understanding the page layout
|
||||
- Use Selenium or Playwright to take screenshots
|
||||
- Use an LLM to assess if it is a block-like page, paragraph-like page, etc.
|
||||
- [Issue #88](https://github.com/VinciGit00/Scrapegraph-ai/issues/88)
|
||||
|
||||
## **Long-Term Goals**
|
||||
|
||||
- Automatic generation of scraping pipelines from a given prompt
|
||||
|
||||
- Create API for the library
|
||||
|
||||
- Fine-tune an LLM for HTML content
|
||||
@ -10,7 +10,7 @@ from scrapegraphai.graphs import SmartScraperGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
|
||||
## required environment variable in .env
|
||||
# required environment variable in .env
|
||||
# AZURE_OPENAI_ENDPOINT
|
||||
# AZURE_OPENAI_CHAT_DEPLOYMENT_NAME
|
||||
# MODEL_NAME
|
||||
@ -45,8 +45,11 @@ graph_config = {
|
||||
}
|
||||
|
||||
smart_scraper_graph = SmartScraperGraph(
|
||||
prompt="List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, event_end_date, event_end_time, location, event_mode, event_category, third_party_redirect, no_of_days,
|
||||
time_in_hours, hosted_or_attending, refreshments_type, registration_available, registration_link",
|
||||
prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time,
|
||||
event_end_date, event_end_time, location, event_mode, event_category,
|
||||
third_party_redirect, no_of_days,
|
||||
time_in_hours, hosted_or_attending, refreshments_type,
|
||||
registration_available, registration_link""",
|
||||
# also accepts a string with the already downloaded HTML code
|
||||
source="https://www.hmhco.com/event",
|
||||
config=graph_config
|
||||
|
||||
@ -19,20 +19,13 @@ text = pd.read_csv("inputs/username.csv")
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
gemini_key = os.getenv("GOOGLE_APIKEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/mistral",
|
||||
"temperature": 0,
|
||||
"format": "json", # Ollama needs the format to be specified explicitly
|
||||
# "model_tokens": 2000, # set context length arbitrarily
|
||||
"base_url": "http://localhost:11434",
|
||||
"api_key": gemini_key,
|
||||
"model": "gemini-pro",
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434",
|
||||
}
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
|
||||
@ -18,7 +18,7 @@ gemini_key = os.getenv("GOOGLE_APIKEY")
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"api_key": gemini_key,
|
||||
"model": "gpt-3.5-turbo",
|
||||
"model": "gemini-pro",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@ -1,8 +1,7 @@
|
||||
[tool.poetry]
|
||||
name = "scrapegraphai"
|
||||
|
||||
version = "0.6.1b1"
|
||||
|
||||
version = "0.6.2"
|
||||
|
||||
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
|
||||
authors = [
|
||||
|
||||
@ -20,7 +20,9 @@ models_tokens = {
|
||||
"gpt-4-32k-0613": 32768,
|
||||
},
|
||||
"azure": {
|
||||
"gpt-3.5-turbo": 4096
|
||||
"gpt-3.5-turbo": 4096,
|
||||
"gpt-4": 8192,
|
||||
"gpt-4-32k": 32768
|
||||
},
|
||||
"gemini": {
|
||||
"gemini-pro": 128000,
|
||||
@ -65,4 +67,3 @@ models_tokens = {
|
||||
"cohere.embed-multilingual-v3": 512
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user