feat: refactoring_to_md function

This commit is contained in:
Marco Vinciguerra 2024-07-16 12:39:48 +02:00
parent bb624399cf
commit 602dd00209
5 changed files with 6 additions and 72 deletions

View File

@ -34,7 +34,6 @@ dependencies = [
"undetected-playwright==0.3.0", "undetected-playwright==0.3.0",
"semchunk==1.0.1", "semchunk==1.0.1",
"html2text==2024.2.26", "html2text==2024.2.26",
"trafilatura==1.10.0",
"langchain-fireworks==0.1.3" "langchain-fireworks==0.1.3"
] ]

View File

@ -41,7 +41,6 @@ attrs==23.2.0
# via jsonschema # via jsonschema
# via referencing # via referencing
babel==2.15.0 babel==2.15.0
# via courlan
# via sphinx # via sphinx
beautifulsoup4==4.12.3 beautifulsoup4==4.12.3
# via furo # via furo
@ -63,11 +62,8 @@ certifi==2024.2.2
# via httpcore # via httpcore
# via httpx # via httpx
# via requests # via requests
# via trafilatura
charset-normalizer==3.3.2 charset-normalizer==3.3.2
# via htmldate
# via requests # via requests
# via trafilatura
click==8.1.7 click==8.1.7
# via burr # via burr
# via streamlit # via streamlit
@ -75,15 +71,11 @@ click==8.1.7
# via uvicorn # via uvicorn
contourpy==1.2.1 contourpy==1.2.1
# via matplotlib # via matplotlib
courlan==1.2.0
# via trafilatura
cycler==0.12.1 cycler==0.12.1
# via matplotlib # via matplotlib
dataclasses-json==0.6.6 dataclasses-json==0.6.6
# via langchain # via langchain
# via langchain-community # via langchain-community
dateparser==1.2.0
# via htmldate
defusedxml==0.7.1 defusedxml==0.7.1
# via langchain-anthropic # via langchain-anthropic
dill==0.3.8 dill==0.3.8
@ -204,8 +196,6 @@ h11==0.14.0
# via uvicorn # via uvicorn
html2text==2024.2.26 html2text==2024.2.26
# via scrapegraphai # via scrapegraphai
htmldate==1.8.1
# via trafilatura
httpcore==1.0.5 httpcore==1.0.5
# via httpx # via httpx
httplib2==0.22.0 httplib2==0.22.0
@ -259,8 +249,6 @@ jsonschema==4.22.0
# via altair # via altair
jsonschema-specifications==2023.12.1 jsonschema-specifications==2023.12.1
# via jsonschema # via jsonschema
justext==3.0.1
# via trafilatura
kiwisolver==1.4.5 kiwisolver==1.4.5
# via matplotlib # via matplotlib
langchain==0.1.15 langchain==0.1.15
@ -302,12 +290,6 @@ loguru==0.7.2
# via burr # via burr
lxml==5.2.2 lxml==5.2.2
# via free-proxy # via free-proxy
# via htmldate
# via justext
# via lxml-html-clean
# via trafilatura
lxml-html-clean==0.1.1
# via lxml
markdown-it-py==3.0.0 markdown-it-py==3.0.0
# via rich # via rich
markupsafe==2.1.5 markupsafe==2.1.5
@ -430,9 +412,7 @@ pytest==8.0.0
pytest-mock==3.14.0 pytest-mock==3.14.0
python-dateutil==2.9.0.post0 python-dateutil==2.9.0.post0
# via botocore # via botocore
# via dateparser
# via google-cloud-bigquery # via google-cloud-bigquery
# via htmldate
# via matplotlib # via matplotlib
# via pandas # via pandas
python-dotenv==1.0.1 python-dotenv==1.0.1
@ -441,7 +421,6 @@ python-dotenv==1.0.1
python-multipart==0.0.9 python-multipart==0.0.9
# via fastapi # via fastapi
pytz==2024.1 pytz==2024.1
# via dateparser
# via pandas # via pandas
pyyaml==6.0.1 pyyaml==6.0.1
# via huggingface-hub # via huggingface-hub
@ -453,7 +432,6 @@ referencing==0.35.1
# via jsonschema # via jsonschema
# via jsonschema-specifications # via jsonschema-specifications
regex==2024.5.15 regex==2024.5.15
# via dateparser
# via tiktoken # via tiktoken
requests==2.32.2 requests==2.32.2
# via burr # via burr
@ -534,8 +512,6 @@ tenacity==8.3.0
tiktoken==0.6.0 tiktoken==0.6.0
# via langchain-openai # via langchain-openai
# via scrapegraphai # via scrapegraphai
tld==0.13
# via courlan
tokenizers==0.19.1 tokenizers==0.19.1
# via anthropic # via anthropic
toml==0.10.2 toml==0.10.2
@ -555,8 +531,6 @@ tqdm==4.66.4
# via openai # via openai
# via scrapegraphai # via scrapegraphai
# via semchunk # via semchunk
trafilatura==1.10.0
# via scrapegraphai
typer==0.12.3 typer==0.12.3
# via fastapi-cli # via fastapi-cli
typing-extensions==4.12.0 typing-extensions==4.12.0
@ -586,8 +560,6 @@ typing-inspect==0.9.0
# via sf-hamilton # via sf-hamilton
tzdata==2024.1 tzdata==2024.1
# via pandas # via pandas
tzlocal==5.2
# via dateparser
ujson==5.10.0 ujson==5.10.0
# via fastapi # via fastapi
undetected-playwright==0.3.0 undetected-playwright==0.3.0
@ -596,10 +568,7 @@ uritemplate==4.1.1
# via google-api-python-client # via google-api-python-client
urllib3==1.26.18 urllib3==1.26.18
# via botocore # via botocore
# via courlan
# via htmldate
# via requests # via requests
# via trafilatura
uvicorn==0.29.0 uvicorn==0.29.0
# via burr # via burr
# via fastapi # via fastapi

View File

@ -28,8 +28,6 @@ async-timeout==4.0.3
# via langchain # via langchain
attrs==23.2.0 attrs==23.2.0
# via aiohttp # via aiohttp
babel==2.15.0
# via courlan
beautifulsoup4==4.12.3 beautifulsoup4==4.12.3
# via google # via google
# via scrapegraphai # via scrapegraphai
@ -44,18 +42,11 @@ certifi==2024.2.2
# via httpcore # via httpcore
# via httpx # via httpx
# via requests # via requests
# via trafilatura
charset-normalizer==3.3.2 charset-normalizer==3.3.2
# via htmldate
# via requests # via requests
# via trafilatura
courlan==1.2.0
# via trafilatura
dataclasses-json==0.6.6 dataclasses-json==0.6.6
# via langchain # via langchain
# via langchain-community # via langchain-community
dateparser==1.2.0
# via htmldate
defusedxml==0.7.1 defusedxml==0.7.1
# via langchain-anthropic # via langchain-anthropic
distro==1.9.0 distro==1.9.0
@ -150,8 +141,6 @@ h11==0.14.0
# via httpcore # via httpcore
html2text==2024.2.26 html2text==2024.2.26
# via scrapegraphai # via scrapegraphai
htmldate==1.8.1
# via trafilatura
httpcore==1.0.5 httpcore==1.0.5
# via httpx # via httpx
httplib2==0.22.0 httplib2==0.22.0
@ -181,8 +170,6 @@ jsonpatch==1.33
# via langchain-core # via langchain-core
jsonpointer==2.4 jsonpointer==2.4
# via jsonpatch # via jsonpatch
justext==3.0.1
# via trafilatura
langchain==0.1.15 langchain==0.1.15
# via scrapegraphai # via scrapegraphai
langchain-anthropic==0.1.11 langchain-anthropic==0.1.11
@ -220,12 +207,6 @@ langsmith==0.1.63
# via langchain-core # via langchain-core
lxml==5.2.2 lxml==5.2.2
# via free-proxy # via free-proxy
# via htmldate
# via justext
# via lxml-html-clean
# via trafilatura
lxml-html-clean==0.1.1
# via lxml
marshmallow==3.21.2 marshmallow==3.21.2
# via dataclasses-json # via dataclasses-json
minify-html==0.15.0 minify-html==0.15.0
@ -298,14 +279,11 @@ pyparsing==3.1.2
# via httplib2 # via httplib2
python-dateutil==2.9.0.post0 python-dateutil==2.9.0.post0
# via botocore # via botocore
# via dateparser
# via google-cloud-bigquery # via google-cloud-bigquery
# via htmldate
# via pandas # via pandas
python-dotenv==1.0.1 python-dotenv==1.0.1
# via scrapegraphai # via scrapegraphai
pytz==2024.1 pytz==2024.1
# via dateparser
# via pandas # via pandas
pyyaml==6.0.1 pyyaml==6.0.1
# via huggingface-hub # via huggingface-hub
@ -313,7 +291,6 @@ pyyaml==6.0.1
# via langchain-community # via langchain-community
# via langchain-core # via langchain-core
regex==2024.5.15 regex==2024.5.15
# via dateparser
# via tiktoken # via tiktoken
requests==2.32.2 requests==2.32.2
# via free-proxy # via free-proxy
@ -354,8 +331,6 @@ tenacity==8.3.0
tiktoken==0.6.0 tiktoken==0.6.0
# via langchain-openai # via langchain-openai
# via scrapegraphai # via scrapegraphai
tld==0.13
# via courlan
tokenizers==0.19.1 tokenizers==0.19.1
# via anthropic # via anthropic
tqdm==4.66.4 tqdm==4.66.4
@ -364,8 +339,6 @@ tqdm==4.66.4
# via openai # via openai
# via scrapegraphai # via scrapegraphai
# via semchunk # via semchunk
trafilatura==1.10.0
# via scrapegraphai
typing-extensions==4.12.0 typing-extensions==4.12.0
# via anthropic # via anthropic
# via anyio # via anyio
@ -382,17 +355,12 @@ typing-inspect==0.9.0
# via dataclasses-json # via dataclasses-json
tzdata==2024.1 tzdata==2024.1
# via pandas # via pandas
tzlocal==5.2
# via dateparser
undetected-playwright==0.3.0 undetected-playwright==0.3.0
# via scrapegraphai # via scrapegraphai
uritemplate==4.1.1 uritemplate==4.1.1
# via google-api-python-client # via google-api-python-client
urllib3==1.26.18 urllib3==1.26.18
# via botocore # via botocore
# via courlan
# via htmldate
# via requests # via requests
# via trafilatura
yarl==1.9.4 yarl==1.9.4
# via aiohttp # via aiohttp

View File

@ -2,8 +2,6 @@
convert_to_md modul convert_to_md modul
""" """
import html2text import html2text
from trafilatura import extract
def convert_to_md(html): def convert_to_md(html):
""" Convert HTML to Markdown. """ Convert HTML to Markdown.
@ -20,6 +18,6 @@ def convert_to_md(html):
'This is a paragraph.\n\n# This is a heading.' 'This is a paragraph.\n\n# This is a heading.'
Note: All the styles and links are ignored during the conversion. """ Note: All the styles and links are ignored during the conversion. """
h = html2text.HTML2Text()
return extract(filecontent=html,include_images=True, h.ignore_links = False
include_links=True, include_tables=True, output_format="markdown") return h.handle(html)

View File

@ -7,7 +7,7 @@ def test_basic_html_to_md():
def test_html_with_links_and_images(): def test_html_with_links_and_images():
html = '<p>This is a <a href="https://example.com">link</a> and this is an <img src="https://example.com/image.jpg" alt="image"></p>' html = '<p>This is a <a href="https://example.com">link</a> and this is an <img src="https://example.com/image.jpg" alt="image"></p>'
assert convert_to_md(html) is None assert convert_to_md(html) is not None
def test_html_with_tables(): def test_html_with_tables():
html = ''' html = '''
@ -17,11 +17,11 @@ def test_html_with_tables():
<tr><td>Row 2, Cell 1</td><td>Row 2, Cell 2</td></tr> <tr><td>Row 2, Cell 1</td><td>Row 2, Cell 2</td></tr>
</table> </table>
''' '''
assert convert_to_md(html) is None assert convert_to_md(html) is not None
def test_empty_html(): def test_empty_html():
html = "" html = ""
assert convert_to_md(html) is None assert convert_to_md(html) is not None
def test_complex_html_structure(): def test_complex_html_structure():
html = ''' html = '''