
Commit 5894076

Merge pull request #1120 from aparupganguly/feature-o3-crawler
Feature o3-mini web crawler
2 parents e0c292f + 01d656f commit 5894076

File tree

1 file changed: +175 -0 lines changed

@@ -0,0 +1,175 @@
import os
from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
from openai import OpenAI

# ANSI color codes
class Colors:
    CYAN = '\033[96m'
    YELLOW = '\033[93m'
    GREEN = '\033[92m'
    RED = '\033[91m'
    MAGENTA = '\033[95m'
    BLUE = '\033[94m'
    RESET = '\033[0m'

# Load environment variables
load_dotenv()

# Retrieve API keys from environment variables
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize the FirecrawlApp and OpenAI client
app = FirecrawlApp(api_key=firecrawl_api_key)
client = OpenAI(api_key=openai_api_key)

# Find the page that most likely contains the objective
def find_relevant_page_via_map(objective, url, app, client):
    try:
        print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}")
        print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}")

        map_prompt = f"""
        The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words, nothing else.
        """

        print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}")
        completion = client.chat.completions.create(
            model="o3-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": map_prompt
                        }
                    ]
                }
            ]
        )

        map_search_parameter = completion.choices[0].message.content
        print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}")

        print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}")
        map_website = app.map_url(url, params={"search": map_search_parameter})

        # Debug print to see the response structure
        print(f"{Colors.MAGENTA}Debug - Map response structure: {json.dumps(map_website, indent=2)}{Colors.RESET}")

        print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")

        # Handle the response based on its structure
        if isinstance(map_website, dict):
            # Assuming the links are in a 'urls' or similar key
            links = map_website.get('urls', []) or map_website.get('links', [])
        elif isinstance(map_website, str):
            try:
                parsed = json.loads(map_website)
                links = parsed.get('urls', []) or parsed.get('links', [])
            except json.JSONDecodeError:
                links = []
        else:
            links = map_website if isinstance(map_website, list) else []

        print(f"{Colors.GREEN}Located {len(links)} relevant links.{Colors.RESET}")
        return links

    except Exception as e:
        print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}")
        return None

# Scrape the top 3 pages and check whether the objective is met; if so, return the extracted data as JSON, else return None
def find_objective_in_top_pages(map_website, objective, app, client):
    try:
        # Get top 3 links from the map result
        if not map_website:
            print(f"{Colors.RED}No links found to analyze.{Colors.RESET}")
            return None

        top_links = map_website[:3]
        print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}")

        for link in top_links:
            print(f"{Colors.YELLOW}Initiating scrape of page: {link}{Colors.RESET}")
            # Scrape the page
            scrape_result = app.scrape_url(link, params={'formats': ['markdown']})
            print(f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}")

            # Check if objective is met
            check_prompt = f"""
            Given the following scraped content and objective, determine if the objective is met.
            If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible.
            If the objective is not met with confidence, respond with 'Objective not met'.

            Objective: {objective}
            Scraped content: {scrape_result['markdown']}

            Remember:
            1. Only return JSON if you are confident the objective is fully met.
            2. Keep the JSON structure as simple and flat as possible.
            3. Do not include any explanations or markdown formatting in your response.
            """

            completion = client.chat.completions.create(
                model="o3-mini",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": check_prompt
                            }
                        ]
                    }
                ]
            )

            result = completion.choices[0].message.content

            if result != "Objective not met":
                print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}")
                try:
                    return json.loads(result)
                except json.JSONDecodeError:
                    print(f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}")
            else:
                print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}")

        print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}")
        return None

    except Exception as e:
        print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}")
        return None

# Main function to execute the process
def main():
    # Get user input
    url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}")
    objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}")

    print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}")
    # Find the relevant page
    map_website = find_relevant_page_via_map(objective, url, app, client)

    if map_website:
        print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis using o3-mini...{Colors.RESET}")
        # Find objective in top pages
        result = find_objective_in_top_pages(map_website, objective, app, client)

        if result:
            print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}")
            print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}")
        else:
            print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}")
    else:
        print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")

if __name__ == "__main__":
    main()
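
For reference, the two helpers can also be driven from another script instead of the interactive main() prompt. A minimal sketch, assuming the file above is saved as o3_mini_web_crawler.py and that FIRECRAWL_API_KEY and OPENAI_API_KEY are set in the environment or a .env file (the module name, objective, and URL below are placeholders, not part of this commit):

# Minimal sketch: reuse the crawler's helpers programmatically.
# Assumption: the committed script is saved as o3_mini_web_crawler.py; importing it
# constructs the FirecrawlApp and OpenAI clients, so the API keys must be set first.
import json

from o3_mini_web_crawler import (
    app,
    client,
    find_relevant_page_via_map,
    find_objective_in_top_pages,
)

objective = "Find the pricing page and list the plan names"  # placeholder objective
url = "https://example.com"  # placeholder website

links = find_relevant_page_via_map(objective, url, app, client)
if links:
    result = find_objective_in_top_pages(links, objective, app, client)
    if result:
        print(json.dumps(result, indent=2))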
