mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-01 21:00:48 +08:00
feat: refactoring_to_md function
This commit is contained in:
parent
bb624399cf
commit
602dd00209
@ -34,7 +34,6 @@ dependencies = [
|
|||||||
"undetected-playwright==0.3.0",
|
"undetected-playwright==0.3.0",
|
||||||
"semchunk==1.0.1",
|
"semchunk==1.0.1",
|
||||||
"html2text==2024.2.26",
|
"html2text==2024.2.26",
|
||||||
"trafilatura==1.10.0",
|
|
||||||
"langchain-fireworks==0.1.3"
|
"langchain-fireworks==0.1.3"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@ -41,7 +41,6 @@ attrs==23.2.0
|
|||||||
# via jsonschema
|
# via jsonschema
|
||||||
# via referencing
|
# via referencing
|
||||||
babel==2.15.0
|
babel==2.15.0
|
||||||
# via courlan
|
|
||||||
# via sphinx
|
# via sphinx
|
||||||
beautifulsoup4==4.12.3
|
beautifulsoup4==4.12.3
|
||||||
# via furo
|
# via furo
|
||||||
@ -63,11 +62,8 @@ certifi==2024.2.2
|
|||||||
# via httpcore
|
# via httpcore
|
||||||
# via httpx
|
# via httpx
|
||||||
# via requests
|
# via requests
|
||||||
# via trafilatura
|
|
||||||
charset-normalizer==3.3.2
|
charset-normalizer==3.3.2
|
||||||
# via htmldate
|
|
||||||
# via requests
|
# via requests
|
||||||
# via trafilatura
|
|
||||||
click==8.1.7
|
click==8.1.7
|
||||||
# via burr
|
# via burr
|
||||||
# via streamlit
|
# via streamlit
|
||||||
@ -75,15 +71,11 @@ click==8.1.7
|
|||||||
# via uvicorn
|
# via uvicorn
|
||||||
contourpy==1.2.1
|
contourpy==1.2.1
|
||||||
# via matplotlib
|
# via matplotlib
|
||||||
courlan==1.2.0
|
|
||||||
# via trafilatura
|
|
||||||
cycler==0.12.1
|
cycler==0.12.1
|
||||||
# via matplotlib
|
# via matplotlib
|
||||||
dataclasses-json==0.6.6
|
dataclasses-json==0.6.6
|
||||||
# via langchain
|
# via langchain
|
||||||
# via langchain-community
|
# via langchain-community
|
||||||
dateparser==1.2.0
|
|
||||||
# via htmldate
|
|
||||||
defusedxml==0.7.1
|
defusedxml==0.7.1
|
||||||
# via langchain-anthropic
|
# via langchain-anthropic
|
||||||
dill==0.3.8
|
dill==0.3.8
|
||||||
@ -204,8 +196,6 @@ h11==0.14.0
|
|||||||
# via uvicorn
|
# via uvicorn
|
||||||
html2text==2024.2.26
|
html2text==2024.2.26
|
||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
htmldate==1.8.1
|
|
||||||
# via trafilatura
|
|
||||||
httpcore==1.0.5
|
httpcore==1.0.5
|
||||||
# via httpx
|
# via httpx
|
||||||
httplib2==0.22.0
|
httplib2==0.22.0
|
||||||
@ -259,8 +249,6 @@ jsonschema==4.22.0
|
|||||||
# via altair
|
# via altair
|
||||||
jsonschema-specifications==2023.12.1
|
jsonschema-specifications==2023.12.1
|
||||||
# via jsonschema
|
# via jsonschema
|
||||||
justext==3.0.1
|
|
||||||
# via trafilatura
|
|
||||||
kiwisolver==1.4.5
|
kiwisolver==1.4.5
|
||||||
# via matplotlib
|
# via matplotlib
|
||||||
langchain==0.1.15
|
langchain==0.1.15
|
||||||
@ -302,12 +290,6 @@ loguru==0.7.2
|
|||||||
# via burr
|
# via burr
|
||||||
lxml==5.2.2
|
lxml==5.2.2
|
||||||
# via free-proxy
|
# via free-proxy
|
||||||
# via htmldate
|
|
||||||
# via justext
|
|
||||||
# via lxml-html-clean
|
|
||||||
# via trafilatura
|
|
||||||
lxml-html-clean==0.1.1
|
|
||||||
# via lxml
|
|
||||||
markdown-it-py==3.0.0
|
markdown-it-py==3.0.0
|
||||||
# via rich
|
# via rich
|
||||||
markupsafe==2.1.5
|
markupsafe==2.1.5
|
||||||
@ -430,9 +412,7 @@ pytest==8.0.0
|
|||||||
pytest-mock==3.14.0
|
pytest-mock==3.14.0
|
||||||
python-dateutil==2.9.0.post0
|
python-dateutil==2.9.0.post0
|
||||||
# via botocore
|
# via botocore
|
||||||
# via dateparser
|
|
||||||
# via google-cloud-bigquery
|
# via google-cloud-bigquery
|
||||||
# via htmldate
|
|
||||||
# via matplotlib
|
# via matplotlib
|
||||||
# via pandas
|
# via pandas
|
||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
@ -441,7 +421,6 @@ python-dotenv==1.0.1
|
|||||||
python-multipart==0.0.9
|
python-multipart==0.0.9
|
||||||
# via fastapi
|
# via fastapi
|
||||||
pytz==2024.1
|
pytz==2024.1
|
||||||
# via dateparser
|
|
||||||
# via pandas
|
# via pandas
|
||||||
pyyaml==6.0.1
|
pyyaml==6.0.1
|
||||||
# via huggingface-hub
|
# via huggingface-hub
|
||||||
@ -453,7 +432,6 @@ referencing==0.35.1
|
|||||||
# via jsonschema
|
# via jsonschema
|
||||||
# via jsonschema-specifications
|
# via jsonschema-specifications
|
||||||
regex==2024.5.15
|
regex==2024.5.15
|
||||||
# via dateparser
|
|
||||||
# via tiktoken
|
# via tiktoken
|
||||||
requests==2.32.2
|
requests==2.32.2
|
||||||
# via burr
|
# via burr
|
||||||
@ -534,8 +512,6 @@ tenacity==8.3.0
|
|||||||
tiktoken==0.6.0
|
tiktoken==0.6.0
|
||||||
# via langchain-openai
|
# via langchain-openai
|
||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
tld==0.13
|
|
||||||
# via courlan
|
|
||||||
tokenizers==0.19.1
|
tokenizers==0.19.1
|
||||||
# via anthropic
|
# via anthropic
|
||||||
toml==0.10.2
|
toml==0.10.2
|
||||||
@ -555,8 +531,6 @@ tqdm==4.66.4
|
|||||||
# via openai
|
# via openai
|
||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
# via semchunk
|
# via semchunk
|
||||||
trafilatura==1.10.0
|
|
||||||
# via scrapegraphai
|
|
||||||
typer==0.12.3
|
typer==0.12.3
|
||||||
# via fastapi-cli
|
# via fastapi-cli
|
||||||
typing-extensions==4.12.0
|
typing-extensions==4.12.0
|
||||||
@ -586,8 +560,6 @@ typing-inspect==0.9.0
|
|||||||
# via sf-hamilton
|
# via sf-hamilton
|
||||||
tzdata==2024.1
|
tzdata==2024.1
|
||||||
# via pandas
|
# via pandas
|
||||||
tzlocal==5.2
|
|
||||||
# via dateparser
|
|
||||||
ujson==5.10.0
|
ujson==5.10.0
|
||||||
# via fastapi
|
# via fastapi
|
||||||
undetected-playwright==0.3.0
|
undetected-playwright==0.3.0
|
||||||
@ -596,10 +568,7 @@ uritemplate==4.1.1
|
|||||||
# via google-api-python-client
|
# via google-api-python-client
|
||||||
urllib3==1.26.18
|
urllib3==1.26.18
|
||||||
# via botocore
|
# via botocore
|
||||||
# via courlan
|
|
||||||
# via htmldate
|
|
||||||
# via requests
|
# via requests
|
||||||
# via trafilatura
|
|
||||||
uvicorn==0.29.0
|
uvicorn==0.29.0
|
||||||
# via burr
|
# via burr
|
||||||
# via fastapi
|
# via fastapi
|
||||||
|
|||||||
@ -28,8 +28,6 @@ async-timeout==4.0.3
|
|||||||
# via langchain
|
# via langchain
|
||||||
attrs==23.2.0
|
attrs==23.2.0
|
||||||
# via aiohttp
|
# via aiohttp
|
||||||
babel==2.15.0
|
|
||||||
# via courlan
|
|
||||||
beautifulsoup4==4.12.3
|
beautifulsoup4==4.12.3
|
||||||
# via google
|
# via google
|
||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
@ -44,18 +42,11 @@ certifi==2024.2.2
|
|||||||
# via httpcore
|
# via httpcore
|
||||||
# via httpx
|
# via httpx
|
||||||
# via requests
|
# via requests
|
||||||
# via trafilatura
|
|
||||||
charset-normalizer==3.3.2
|
charset-normalizer==3.3.2
|
||||||
# via htmldate
|
|
||||||
# via requests
|
# via requests
|
||||||
# via trafilatura
|
|
||||||
courlan==1.2.0
|
|
||||||
# via trafilatura
|
|
||||||
dataclasses-json==0.6.6
|
dataclasses-json==0.6.6
|
||||||
# via langchain
|
# via langchain
|
||||||
# via langchain-community
|
# via langchain-community
|
||||||
dateparser==1.2.0
|
|
||||||
# via htmldate
|
|
||||||
defusedxml==0.7.1
|
defusedxml==0.7.1
|
||||||
# via langchain-anthropic
|
# via langchain-anthropic
|
||||||
distro==1.9.0
|
distro==1.9.0
|
||||||
@ -150,8 +141,6 @@ h11==0.14.0
|
|||||||
# via httpcore
|
# via httpcore
|
||||||
html2text==2024.2.26
|
html2text==2024.2.26
|
||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
htmldate==1.8.1
|
|
||||||
# via trafilatura
|
|
||||||
httpcore==1.0.5
|
httpcore==1.0.5
|
||||||
# via httpx
|
# via httpx
|
||||||
httplib2==0.22.0
|
httplib2==0.22.0
|
||||||
@ -181,8 +170,6 @@ jsonpatch==1.33
|
|||||||
# via langchain-core
|
# via langchain-core
|
||||||
jsonpointer==2.4
|
jsonpointer==2.4
|
||||||
# via jsonpatch
|
# via jsonpatch
|
||||||
justext==3.0.1
|
|
||||||
# via trafilatura
|
|
||||||
langchain==0.1.15
|
langchain==0.1.15
|
||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
langchain-anthropic==0.1.11
|
langchain-anthropic==0.1.11
|
||||||
@ -220,12 +207,6 @@ langsmith==0.1.63
|
|||||||
# via langchain-core
|
# via langchain-core
|
||||||
lxml==5.2.2
|
lxml==5.2.2
|
||||||
# via free-proxy
|
# via free-proxy
|
||||||
# via htmldate
|
|
||||||
# via justext
|
|
||||||
# via lxml-html-clean
|
|
||||||
# via trafilatura
|
|
||||||
lxml-html-clean==0.1.1
|
|
||||||
# via lxml
|
|
||||||
marshmallow==3.21.2
|
marshmallow==3.21.2
|
||||||
# via dataclasses-json
|
# via dataclasses-json
|
||||||
minify-html==0.15.0
|
minify-html==0.15.0
|
||||||
@ -298,14 +279,11 @@ pyparsing==3.1.2
|
|||||||
# via httplib2
|
# via httplib2
|
||||||
python-dateutil==2.9.0.post0
|
python-dateutil==2.9.0.post0
|
||||||
# via botocore
|
# via botocore
|
||||||
# via dateparser
|
|
||||||
# via google-cloud-bigquery
|
# via google-cloud-bigquery
|
||||||
# via htmldate
|
|
||||||
# via pandas
|
# via pandas
|
||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
pytz==2024.1
|
pytz==2024.1
|
||||||
# via dateparser
|
|
||||||
# via pandas
|
# via pandas
|
||||||
pyyaml==6.0.1
|
pyyaml==6.0.1
|
||||||
# via huggingface-hub
|
# via huggingface-hub
|
||||||
@ -313,7 +291,6 @@ pyyaml==6.0.1
|
|||||||
# via langchain-community
|
# via langchain-community
|
||||||
# via langchain-core
|
# via langchain-core
|
||||||
regex==2024.5.15
|
regex==2024.5.15
|
||||||
# via dateparser
|
|
||||||
# via tiktoken
|
# via tiktoken
|
||||||
requests==2.32.2
|
requests==2.32.2
|
||||||
# via free-proxy
|
# via free-proxy
|
||||||
@ -354,8 +331,6 @@ tenacity==8.3.0
|
|||||||
tiktoken==0.6.0
|
tiktoken==0.6.0
|
||||||
# via langchain-openai
|
# via langchain-openai
|
||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
tld==0.13
|
|
||||||
# via courlan
|
|
||||||
tokenizers==0.19.1
|
tokenizers==0.19.1
|
||||||
# via anthropic
|
# via anthropic
|
||||||
tqdm==4.66.4
|
tqdm==4.66.4
|
||||||
@ -364,8 +339,6 @@ tqdm==4.66.4
|
|||||||
# via openai
|
# via openai
|
||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
# via semchunk
|
# via semchunk
|
||||||
trafilatura==1.10.0
|
|
||||||
# via scrapegraphai
|
|
||||||
typing-extensions==4.12.0
|
typing-extensions==4.12.0
|
||||||
# via anthropic
|
# via anthropic
|
||||||
# via anyio
|
# via anyio
|
||||||
@ -382,17 +355,12 @@ typing-inspect==0.9.0
|
|||||||
# via dataclasses-json
|
# via dataclasses-json
|
||||||
tzdata==2024.1
|
tzdata==2024.1
|
||||||
# via pandas
|
# via pandas
|
||||||
tzlocal==5.2
|
|
||||||
# via dateparser
|
|
||||||
undetected-playwright==0.3.0
|
undetected-playwright==0.3.0
|
||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
uritemplate==4.1.1
|
uritemplate==4.1.1
|
||||||
# via google-api-python-client
|
# via google-api-python-client
|
||||||
urllib3==1.26.18
|
urllib3==1.26.18
|
||||||
# via botocore
|
# via botocore
|
||||||
# via courlan
|
|
||||||
# via htmldate
|
|
||||||
# via requests
|
# via requests
|
||||||
# via trafilatura
|
|
||||||
yarl==1.9.4
|
yarl==1.9.4
|
||||||
# via aiohttp
|
# via aiohttp
|
||||||
|
|||||||
@ -2,8 +2,6 @@
|
|||||||
convert_to_md modul
|
convert_to_md modul
|
||||||
"""
|
"""
|
||||||
import html2text
|
import html2text
|
||||||
from trafilatura import extract
|
|
||||||
|
|
||||||
|
|
||||||
def convert_to_md(html):
|
def convert_to_md(html):
|
||||||
""" Convert HTML to Markdown.
|
""" Convert HTML to Markdown.
|
||||||
@ -20,6 +18,6 @@ def convert_to_md(html):
|
|||||||
'This is a paragraph.\n\n# This is a heading.'
|
'This is a paragraph.\n\n# This is a heading.'
|
||||||
|
|
||||||
Note: All the styles and links are ignored during the conversion. """
|
Note: All the styles and links are ignored during the conversion. """
|
||||||
|
h = html2text.HTML2Text()
|
||||||
return extract(filecontent=html,include_images=True,
|
h.ignore_links = False
|
||||||
include_links=True, include_tables=True, output_format="markdown")
|
return h.handle(html)
|
||||||
|
|||||||
@ -7,7 +7,7 @@ def test_basic_html_to_md():
|
|||||||
|
|
||||||
def test_html_with_links_and_images():
|
def test_html_with_links_and_images():
|
||||||
html = '<p>This is a <a href="https://example.com">link</a> and this is an <img src="https://example.com/image.jpg" alt="image"></p>'
|
html = '<p>This is a <a href="https://example.com">link</a> and this is an <img src="https://example.com/image.jpg" alt="image"></p>'
|
||||||
assert convert_to_md(html) is None
|
assert convert_to_md(html) is not None
|
||||||
|
|
||||||
def test_html_with_tables():
|
def test_html_with_tables():
|
||||||
html = '''
|
html = '''
|
||||||
@ -17,11 +17,11 @@ def test_html_with_tables():
|
|||||||
<tr><td>Row 2, Cell 1</td><td>Row 2, Cell 2</td></tr>
|
<tr><td>Row 2, Cell 1</td><td>Row 2, Cell 2</td></tr>
|
||||||
</table>
|
</table>
|
||||||
'''
|
'''
|
||||||
assert convert_to_md(html) is None
|
assert convert_to_md(html) is not None
|
||||||
|
|
||||||
def test_empty_html():
|
def test_empty_html():
|
||||||
html = ""
|
html = ""
|
||||||
assert convert_to_md(html) is None
|
assert convert_to_md(html) is not None
|
||||||
|
|
||||||
def test_complex_html_structure():
|
def test_complex_html_structure():
|
||||||
html = '''
|
html = '''
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user