From d1afd2bd695e7c75896686e822a49df49f14ad85 Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Fri, 5 Apr 2024 20:42:37 +0200 Subject: [PATCH] add xml parsing example --- README.md | 2 +- examples/inputs/books.xml | 120 ++++++++++++++++++++++++++++++++ examples/scrape_plain_text.py | 2 +- examples/scrape_xml_document.py | 37 ++++++++++ 4 files changed, 159 insertions(+), 2 deletions(-) create mode 100644 examples/inputs/books.xml create mode 100644 examples/scrape_xml_document.py diff --git a/README.md b/README.md index 3c73bd22..ad04e4cc 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ [![](https://dcbadge.vercel.app/api/server/gkxQDAjfeX)](https://discord.gg/gkxQDAjfeX) -ScrapeGraphAI is a *web scraping* python library based on LangChain which uses LLM and direct graph logic to create scraping pipelines for websites and documents. +ScrapeGraphAI is a *web scraping* Python library which uses LLMs and direct graph logic to create scraping pipelines for websites, documents and XML files. Just say which information you want to extract and the library will do it for you!

diff --git a/examples/inputs/books.xml b/examples/inputs/books.xml new file mode 100644 index 00000000..e3d1fe87 --- /dev/null +++ b/examples/inputs/books.xml @@ -0,0 +1,120 @@ + + + + Gambardella, Matthew + XML Developer's Guide + Computer + 44.95 + 2000-10-01 + An in-depth look at creating applications + with XML. + + + Ralls, Kim + Midnight Rain + Fantasy + 5.95 + 2000-12-16 + A former architect battles corporate zombies, + an evil sorceress, and her own childhood to become queen + of the world. + + + Corets, Eva + Maeve Ascendant + Fantasy + 5.95 + 2000-11-17 + After the collapse of a nanotechnology + society in England, the young survivors lay the + foundation for a new society. + + + Corets, Eva + Oberon's Legacy + Fantasy + 5.95 + 2001-03-10 + In post-apocalypse England, the mysterious + agent known only as Oberon helps to create a new life + for the inhabitants of London. Sequel to Maeve + Ascendant. + + + Corets, Eva + The Sundered Grail + Fantasy + 5.95 + 2001-09-10 + The two daughters of Maeve, half-sisters, + battle one another for control of England. Sequel to + Oberon's Legacy. + + + Randall, Cynthia + Lover Birds + Romance + 4.95 + 2000-09-02 + When Carla meets Paul at an ornithology + conference, tempers fly as feathers get ruffled. + + + Thurman, Paula + Splish Splash + Romance + 4.95 + 2000-11-02 + A deep sea diver finds true love twenty + thousand leagues beneath the sea. + + + Knorr, Stefan + Creepy Crawlies + Horror + 4.95 + 2000-12-06 + An anthology of horror stories about roaches, + centipedes, scorpions and other insects. + + + Kress, Peter + Paradox Lost + Science Fiction + 6.95 + 2000-11-02 + After an inadvertant trip through a Heisenberg + Uncertainty Device, James Salway discovers the problems + of being quantum. + + + O'Brien, Tim + Microsoft .NET: The Programming Bible + Computer + 36.95 + 2000-12-09 + Microsoft's .NET initiative is explored in + detail in this deep programmer's reference. 
+
+
+ O'Brien, Tim
+ MSXML3: A Comprehensive Guide
+ Computer
+ 36.95
+ 2000-12-01
+ The Microsoft MSXML3 parser is covered in
+ detail, with attention to XML DOM interfaces, XSLT processing,
+ SAX and more.
+
+
+ Galos, Mike
+ Visual Studio 7: A Comprehensive Guide
+ Computer
+ 49.95
+ 2001-04-16
+ Microsoft Visual Studio 7 is explored in depth,
+ looking at how Visual Basic, Visual C++, C#, and ASP+ are
+ integrated into a comprehensive development
+ environment.
+
+
\ No newline at end of file
diff --git a/examples/scrape_plain_text.py b/examples/scrape_plain_text.py
index 81dee0f9..03dccdf0 100644
--- a/examples/scrape_plain_text.py
+++ b/examples/scrape_plain_text.py
@@ -1,5 +1,5 @@
 """
-Basic example of scraping pipeline using SmartScraper
+Basic example of scraping pipeline using SmartScraper from text
 """
 
 import os
diff --git a/examples/scrape_xml_document.py b/examples/scrape_xml_document.py
new file mode 100644
index 00000000..c4ed4d4e
--- /dev/null
+++ b/examples/scrape_xml_document.py
@@ -0,0 +1,41 @@
+"""
+Basic example of scraping pipeline using SmartScraper from XML documents
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json
+
+load_dotenv()
+openai_key = os.getenv("OPENAI_APIKEY")
+
+# Define the configuration for the graph
+graph_config = {
+    "llm": {
+        "api_key": openai_key,
+        "model": "gpt-3.5-turbo",
+    },
+}
+
+# Read the XML file. The path is resolved relative to this script so the
+# example works regardless of the directory it is launched from.
+script_dir = os.path.dirname(os.path.abspath(__file__))
+xml_path = os.path.join(script_dir, "inputs", "books.xml")
+with open(xml_path, 'r', encoding="utf-8") as file:
+    text = file.read()
+
+# Create the SmartScraperGraph instance
+smart_scraper_graph = SmartScraperGraph(
+    prompt="List me all the authors, title and genres of the books",
+    file_source=text,  # Pass the content of the file, not the file object
+    config=graph_config,
+)
+
+# Run the graph and print whatever it returns
+result = smart_scraper_graph.run()
+print(result)
+
+# Save to json or csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")