Spider Search / Scrape as a Tool (FlowiseAI#2972)

toi500 · toi500 · Alves, Patrick · commit c29e4c1bde57 · 2024-09-03T15:21:51.000Z
adding tools

Co-authored-by: toi500 &lt;toi500@gmail.com&gt;
diff --git a/packages/server/marketplaces/tools/Spider Web Scraper.json b/packages/server/marketplaces/tools/Spider Web Scraper.json
@@ -0,0 +1,8 @@
+{
+    "name": "webpage_scraper",
+    "description": "This tool is useful for extracting up-to-date information (text) from web pages, making it ideal for gathering data for analysis. If the user provides multiple URLs, process each one separately and then synthesize the extracted information into a single, comprehensive response. Make sure to add the HTTP protocol (https://) to website URLs if the user forgets to do so.\n\nImportant: The webpage_scraper function retrieves the raw text content of any webpage. It does not provide any structural information like headings, paragraphs, or specific elements.",
+    "color": "linear-gradient(rgb(75,205,223), rgb(4,90,12))",
+    "iconSrc": "https://raw.githubusercontent.com/FlowiseAI/Flowise/main/packages/components/nodes/documentloaders/Spider/spider.svg",
+    "schema": "[{\"id\":0,\"property\":\"url\",\"description\":\"This is the URL provided by the user\",\"type\":\"string\",\"required\":true}]",
+    "func": "const fetch = require('node-fetch');\nconst targetUrl = $url;\nconst data = {\n  \"depth\": 1,\n  \"limit\": 1,\n  \"proxy_enabled\": true,\n  \"anti_bot\": true,\n  \"request\": \"smart\",\n  \"return_format\": \"text\",\n  \"cache\": true,\n  \"store_data\":true,\n  \"url\": `${targetUrl}`\n};\n\nconst url = 'https://api.spider.cloud/crawl';\n\ntry {\n    const response = await fetch(url, {\n        method: 'POST',\n        headers: {\n            'Authorization': `Bearer SPIDER_API_KEY`,\n            'Content-Type': 'application/json'\n        },\n        body: JSON.stringify(data)\n    });\n    if (!response.ok) {\n        console.error('Network response was not ok:', response.statusText);\n        return `Error: ${response.statusText}`; \n    }\n    const text = await response.text(); \n    return text; \n} catch (error) {\n    console.error(error);\n    return ''; \n}\n\n/*\n * Works well with OpenAI models (gpt-4o and gpt-4o-mini). \n * Inconsistencies may occur with Google models (Gemini 1.5, 1.5 Flash).\n * Other models are untested.\n *\n * For Scraping:\n * depth (number): The maximum scrape depth (0 for no limit).\n * limit (number): The maximum number of pages to scrape per website.\n * proxy_enabled (boolean): Enables the use of premium proxies for scraping.\n * anti_bot (boolean): Enable anti-bot mode using techniques to increase the chance of success\n * request (string): The request type: 'http', 'chrome', or 'smart'.\n * return_format (string): The format for the returned data.\n * cache (boolean): Use HTTP caching for the crawl to speed up repeated runs.\n * store_data (boolean): To collect resources to download and re-use later on.\n * url (string): The URI of the resource to scrape.\n * \n * For more options:\n * https://spider.cloud/docs/api\n */\n"
+}
diff --git a/packages/server/marketplaces/tools/Spider Web Search & Scrape.json b/packages/server/marketplaces/tools/Spider Web Search & Scrape.json
@@ -0,0 +1,8 @@
+{
+    "name": "metasearch_engine",
+    "description": "This tool provides real-time information from the internet using a Metasearch Engine, ensuring up-to-date and relevant responses. Use it to research complex topics by strategically breaking them down into multiple, targeted search queries, exploring different facets and subtopics to gather a comprehensive understanding. If needed, you can use this tool multiple times, but refine your queries based on previous results rather than repeating the same search. Before using the tool, make sure to improve the user's search query to make it clear, thorough, and optimized for the most relevant results.\n",
+    "color": "linear-gradient(rgb(181,220,163), rgb(216,1,106))",
+    "iconSrc": "https://raw.githubusercontent.com/FlowiseAI/Flowise/main/packages/components/nodes/documentloaders/Spider/spider.svg",
+    "schema": "[{\"id\":0,\"property\":\"query\",\"description\":\"This is the search query\",\"type\":\"string\",\"required\":true}]",
+    "func": "const fetch = require('node-fetch');\nconst searchQuery = $query;\nconst data = {\n  \"search\": `${searchQuery}`,\n  \"country\": \"us\",\n  \"language\":\"en\",\n  \"cache\": true,\n  \"store_data\": true,\n  \"search_limit\": 3,\n  \"depth\": 1,\n  \"limit\": 1,\n  \"proxy_enabled\": true,\n  \"anti_bot\": true,\n  \"request\": \"smart\",\n  \"return_format\": \"text\"\n};\n\nconst url = 'https://api.spider.cloud/search';\n\ntry {\n    const response = await fetch(url, {\n        method: 'POST',\n        headers: {\n            'Authorization': `Bearer SPIDER_API_KEY`, \n            'Content-Type': 'application/json'\n        },\n        body: JSON.stringify(data)\n    });\n    if (!response.ok) {\n        console.error('Network response was not ok:', response.statusText);\n        return `Error: ${response.statusText}`; \n    }\n    const text = await response.text(); \n    return text; \n} catch (error) {\n    console.error(error);\n    return ''; \n}\n\n/*\n * This tool performs a meta search on any given topic and immediately scrapes \n * the discovered URLs in a single API call.\n * \n * Works well with OpenAI models (gpt-4o and gpt-4o-mini). \n * Inconsistencies may occur with Google models (Gemini).\n * Other models are untested.\n *\n * For Searching:\n * search (string): The search query you want to search for.\n * country (string): The two-letter country code to use for the search.\n * language (string): The language to use for the search.\n * cache (boolean): Use HTTP caching for the crawl to speed up repeated runs.\n * store_data (boolean): To collect resources to download and re-use later on.\n * search_limit (number): The maximum number of URLs to fetch from the search results.\n *\n * For Scraping:\n * depth (number): The maximum scrape depth (0 for no limit).\n * limit (number): The maximum number of pages to scrape per website.\n * proxy_enabled (boolean): Enables the use of premium proxies for scraping.\n * anti_bot (boolean): Enable anti-bot mode using techniques to increase the chance of success\n * request (string): The request type: 'http', 'chrome', or 'smart'.\n * return_format (string): The format for the returned data.\n *\n * For more options:\n * https://spider.cloud/docs/api\n */"
+}