mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-01 21:00:48 +08:00
perf: Proxy integration in googlesearch
This commit is contained in:
parent
6fd9f14b57
commit
e828c7010a
8
=1.2.5
8
=1.2.5
@ -1,8 +0,0 @@
|
||||
Requirement already satisfied: googlesearch-python in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (1.2.5)
|
||||
Requirement already satisfied: beautifulsoup4>=4.9 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from googlesearch-python) (4.12.3)
|
||||
Requirement already satisfied: requests>=2.20 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from googlesearch-python) (2.32.3)
|
||||
Requirement already satisfied: soupsieve>1.2 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from beautifulsoup4>=4.9->googlesearch-python) (2.6)
|
||||
Requirement already satisfied: charset-normalizer<4,>=2 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from requests>=2.20->googlesearch-python) (3.3.2)
|
||||
Requirement already satisfied: idna<4,>=2.5 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from requests>=2.20->googlesearch-python) (3.10)
|
||||
Requirement already satisfied: urllib3<3,>=1.21.1 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from requests>=2.20->googlesearch-python) (2.2.3)
|
||||
Requirement already satisfied: certifi>=2017.4.17 in /home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages (from requests>=2.20->googlesearch-python) (2024.8.30)
|
||||
274
notebook.ipynb
274
notebook.ipynb
@ -1,274 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pip install -e ."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from scrapegraphai.graphs import SearchGraph"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"PROXY: https://vzktqema:btngo4nn7n6l@63.141.62.30:6323\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "TypeError",
|
||||
"evalue": "search() got an unexpected keyword argument 'num_results'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[3], line 36\u001b[0m\n\u001b[1;32m 31\u001b[0m prompt \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mList the top 5 companies in the world by market capitalization.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 32\u001b[0m search_graph \u001b[38;5;241m=\u001b[39m SearchGraph(\n\u001b[1;32m 33\u001b[0m prompt\u001b[38;5;241m=\u001b[39mprompt,\n\u001b[1;32m 34\u001b[0m config\u001b[38;5;241m=\u001b[39mgraph_config\n\u001b[1;32m 35\u001b[0m )\n\u001b[0;32m---> 36\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43msearch_graph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/graphs/search_graph.py:121\u001b[0m, in \u001b[0;36mSearchGraph.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 114\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;124;03mExecutes the web scraping and searching process.\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \n\u001b[1;32m 117\u001b[0m \u001b[38;5;124;03mReturns:\u001b[39;00m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;124;03m str: The answer to the prompt.\u001b[39;00m\n\u001b[1;32m 119\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 120\u001b[0m inputs \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muser_prompt\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprompt}\n\u001b[0;32m--> 121\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfinal_state, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexecution_info \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgraph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;66;03m# Store the URLs after execution\u001b[39;00m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124murls\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfinal_state:\n",
|
||||
"File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/graphs/base_graph.py:281\u001b[0m, in \u001b[0;36mBaseGraph.execute\u001b[0;34m(self, initial_state)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m (result[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_state\u001b[39m\u001b[38;5;124m\"\u001b[39m], [])\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 281\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43minitial_state\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/graphs/base_graph.py:197\u001b[0m, in \u001b[0;36mBaseGraph._execute_standard\u001b[0;34m(self, initial_state)\u001b[0m\n\u001b[1;32m 184\u001b[0m graph_execution_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m start_time\n\u001b[1;32m 185\u001b[0m log_graph_execution(\n\u001b[1;32m 186\u001b[0m graph_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgraph_name,\n\u001b[1;32m 187\u001b[0m source\u001b[38;5;241m=\u001b[39msource,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 195\u001b[0m exception\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mstr\u001b[39m(e)\n\u001b[1;32m 196\u001b[0m )\n\u001b[0;32m--> 197\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 198\u001b[0m node_exec_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m curr_time\n\u001b[1;32m 199\u001b[0m total_exec_time \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m node_exec_time\n",
|
||||
"File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/graphs/base_graph.py:181\u001b[0m, in \u001b[0;36mBaseGraph._execute_standard\u001b[0;34m(self, initial_state)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_manager\u001b[38;5;241m.\u001b[39mexclusive_get_callback(llm_model, llm_model_name) \u001b[38;5;28;01mas\u001b[39;00m cb:\n\u001b[1;32m 180\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 181\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mcurrent_node\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstate\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 183\u001b[0m error_node \u001b[38;5;241m=\u001b[39m current_node\u001b[38;5;241m.\u001b[39mnode_name\n",
|
||||
"File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/nodes/search_internet_node.py:97\u001b[0m, in \u001b[0;36mSearchInternetNode.execute\u001b[0;34m(self, state)\u001b[0m\n\u001b[1;32m 93\u001b[0m search_query \u001b[38;5;241m=\u001b[39m search_answer\u001b[38;5;241m.\u001b[39minvoke({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muser_prompt\u001b[39m\u001b[38;5;124m\"\u001b[39m: user_prompt})[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 95\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlogger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSearch Query: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msearch_query\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 97\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[43msearch_on_web\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msearch_query\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 98\u001b[0m \u001b[43m \u001b[49m\u001b[43msearch_engine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msearch_engine\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mproxy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mproxy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(answer) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mZero results found for the search query.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
||||
"File \u001b[0;32m~/AUK/Colloborations_Scrapegraphai/Scrapegraph-ai/scrapegraphai/utils/research_web.py:74\u001b[0m, in \u001b[0;36msearch_on_web\u001b[0;34m(query, search_engine, max_results, port, timeout, proxy)\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPROXY: \u001b[39m\u001b[38;5;124m\"\u001b[39m, proxy)\n\u001b[1;32m 73\u001b[0m res \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m---> 74\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m url \u001b[38;5;129;01min\u001b[39;00m \u001b[43mgoogle_search\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mproxy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproxy\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 75\u001b[0m res\u001b[38;5;241m.\u001b[39mappend(url)\n\u001b[1;32m 76\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m filter_pdf_links(res)\n",
|
||||
"\u001b[0;31mTypeError\u001b[0m: search() got an unexpected keyword argument 'num_results'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"os.environ['AZURE_OPENAI_GPT4O_SERVICE']=\"dwtc-openai-gpt4o\"\n",
|
||||
"os.environ['AZURE_OPENAI_GPT4O_DEPLOYMENT']=\"gpt4o\"\n",
|
||||
"os.environ['AZURE_OPENAI_GPT4O_KEY']=\"3cb3875145ec425880c6974d74e10cd7\"\n",
|
||||
"os.environ['AZURE_OPENAI_GPT4O_API_VERSION']=\"2024-02-15-preview\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"graph_config = {\n",
|
||||
" \"llm\": {\n",
|
||||
" \"model\": \"azure_openai/gpt-4o\",\n",
|
||||
" \"api_key\": os.environ['AZURE_OPENAI_GPT4O_KEY'],\n",
|
||||
" \"azure_endpoint\": f\"https://{os.environ['AZURE_OPENAI_GPT4O_SERVICE']}.openai.azure.com\",\n",
|
||||
" \"azure_deployment\": os.environ['AZURE_OPENAI_GPT4O_DEPLOYMENT'],\n",
|
||||
" \"api_version\": os.environ['AZURE_OPENAI_GPT4O_API_VERSION'],\n",
|
||||
" \"temperature\": 0.0,\n",
|
||||
" },\n",
|
||||
"\n",
|
||||
" \"loader_kwargs\": {\n",
|
||||
" \"proxy\" : {\n",
|
||||
" \"server\": '63.141.62.30:6323', \n",
|
||||
" \"username\": \"vzktqema\", \n",
|
||||
" \"password\": \"btngo4nn7n6l\",\n",
|
||||
" },\n",
|
||||
" },\n",
|
||||
"\n",
|
||||
" \"verbose\": False,\n",
|
||||
" \"headless\": True,\n",
|
||||
" \"max_sites\": 1\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
"prompt = \"List the top 5 companies in the world by market capitalization.\"\n",
|
||||
"search_graph = SearchGraph(\n",
|
||||
" prompt=prompt,\n",
|
||||
" config=graph_config\n",
|
||||
" )\n",
|
||||
"result = search_graph.run()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ImportError",
|
||||
"evalue": "cannot import name 'search' from 'googlesearch' (unknown location)",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgooglesearch\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m search\n",
|
||||
"\u001b[0;31mImportError\u001b[0m: cannot import name 'search' from 'googlesearch' (unknown location)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from googlesearch import search"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"102.77s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found existing installation: google 3.0.0\n",
|
||||
"Uninstalling google-3.0.0:\n",
|
||||
" Successfully uninstalled google-3.0.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!pip uninstall google -y"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"search()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'/home/funavry/anaconda3/envs/colscrap-env/lib/python3.11/site-packages/googlesearch/__init__.py'"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import inspect\n",
|
||||
"inspect.getfile(search)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ImportError",
|
||||
"evalue": "cannot import name 'search' from 'googlesearch' (unknown location)",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgooglesearch\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m search\n\u001b[1;32m 2\u001b[0m search(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGoogle\u001b[39m\u001b[38;5;124m\"\u001b[39m, num_results\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m)\n",
|
||||
"\u001b[0;31mImportError\u001b[0m: cannot import name 'search' from 'googlesearch' (unknown location)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from googlesearch import search\n",
|
||||
"search(\"Google\", num_results=100)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from scrapegraphai.utils import research_web"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import concurrent.futures\n",
|
||||
"import time\n",
|
||||
"from googlesearch import search # Ensure you have the googlesearch package installed\n",
|
||||
"\n",
|
||||
"def fetch_url(query):\n",
|
||||
" # Fetch the URLs from the search query\n",
|
||||
" return list(search(query, stop=10)) # Fetch 10 URLs for each query\n",
|
||||
"\n",
|
||||
"def main():\n",
|
||||
" query = \"Weather in Pakistan\"\n",
|
||||
" batch_size = 50 # Number of requests to send concurrently\n",
|
||||
"\n",
|
||||
" res = []\n",
|
||||
" # Create a ThreadPoolExecutor to manage threads\n",
|
||||
" with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:\n",
|
||||
" # Submit multiple fetch requests to the executor\n",
|
||||
" future_to_url = {executor.submit(fetch_url, query): i for i in range(batch_size)}\n",
|
||||
" \n",
|
||||
" for future in concurrent.futures.as_completed(future_to_url):\n",
|
||||
" try:\n",
|
||||
" urls = future.result()\n",
|
||||
" res.append(urls) # Extend the results with the fetched URLs\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error fetching data: {e}\")\n",
|
||||
"\n",
|
||||
" return res\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" result = main()\n",
|
||||
" print(len(result))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "colscrap-env",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@ -94,7 +94,7 @@ class SearchInternetNode(BaseNode):
|
||||
|
||||
self.logger.info(f"Search Query: {search_query}")
|
||||
|
||||
answer = search_on_web(query=search_query, max_results=self.max_results,
|
||||
answer = search_on_web(query=search_query, num_results=self.max_results,
|
||||
search_engine=self.search_engine, proxy=self.proxy)
|
||||
|
||||
if len(answer) == 0:
|
||||
|
||||
@ -44,7 +44,7 @@ def search_on_web(query: str, search_engine: str = "Google",
|
||||
password = proxy.get('password')
|
||||
|
||||
if all([username, password, server]):
|
||||
proxy_url = f"https://{username}:{password}@{server}"
|
||||
proxy_url = f"http://{username}:{password}@{server}"
|
||||
return proxy_url
|
||||
else:
|
||||
raise ValueError("Proxy dictionary is missing required fields.")
|
||||
@ -69,7 +69,6 @@ def search_on_web(query: str, search_engine: str = "Google",
|
||||
proxy = format_proxy(proxy)
|
||||
|
||||
if search_engine.lower() == "google":
|
||||
print("PROXY: ", proxy)
|
||||
res = []
|
||||
for url in google_search(query, num_results=max_results, proxy=proxy):
|
||||
res.append(url)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user