1
+ import os
2
+ from firecrawl import FirecrawlApp
3
+ import json
4
+ from dotenv import load_dotenv
5
+ from openai import OpenAI
6
+
7
# ANSI color codes
class Colors:
    """ANSI SGR escape sequences used to colorize console output.

    Note: the escape character must be immediately followed by '[' —
    '\\033[96m', not '\\033 [96m' — or terminals print the bracket literally.
    """
    CYAN = '\033[96m'
    YELLOW = '\033[93m'
    GREEN = '\033[92m'
    RED = '\033[91m'
    MAGENTA = '\033[95m'
    BLUE = '\033[94m'
    RESET = '\033[0m'  # restores the terminal's default style
16
+
17
# Load environment variables from a local .env file, if one exists.
load_dotenv()

# API credentials, read from the environment.
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

# Shared client instances used by every function below.
app = FirecrawlApp(api_key=firecrawl_api_key)
client = OpenAI(api_key=openai_api_key)
27
+
28
# Find the page that most likely contains the objective
def find_relevant_page_via_map(objective, url, app, client):
    """Map *url* with Firecrawl and return links relevant to *objective*.

    Asks the LLM for a 1-2 word search term derived from the objective, runs
    Firecrawl's map over the site with that term, then normalizes the
    response (dict, JSON string, or list) into a plain list of links.
    Returns the list of links, or None if anything fails.
    """
    try:
        print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}")
        print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}")

        map_prompt = f"""
        The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else.
        """

        print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}")
        completion = client.chat.completions.create(
            model="o3-mini",
            messages=[
                {"role": "user", "content": [{"type": "text", "text": map_prompt}]},
            ],
        )

        search_term = completion.choices[0].message.content
        print(f"{Colors.GREEN}Optimal search parameter identified: {search_term}{Colors.RESET}")

        print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}")
        map_response = app.map_url(url, params={"search": search_term})

        # Debug print to see the response structure
        print(f"{Colors.MAGENTA}Debug - Map response structure: {json.dumps(map_response, indent=2)}{Colors.RESET}")

        print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")

        # Normalize the response into a flat list of links, whatever shape it
        # arrived in: a dict keyed by 'urls'/'links', a JSON string, or a list.
        if isinstance(map_response, dict):
            links = map_response.get('urls', []) or map_response.get('links', [])
        elif isinstance(map_response, str):
            try:
                parsed = json.loads(map_response)
                links = parsed.get('urls', []) or parsed.get('links', [])
            except json.JSONDecodeError:
                links = []
        elif isinstance(map_response, list):
            links = map_response
        else:
            links = []

        print(f"{Colors.GREEN}Located {len(links)} relevant links.{Colors.RESET}")
        return links

    except Exception as e:
        print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}")
        return None
84
+
85
# Scrape the top 3 pages and see if the objective is met, if so return in json format else return None
def find_objective_in_top_pages(map_website, objective, app, client):
    """Scrape up to the first three mapped links and try to satisfy *objective*.

    Each page is scraped as markdown and handed to the LLM together with the
    objective. If the model judges the objective met, it is expected to reply
    with flat JSON, which is parsed and returned. Otherwise the next link is
    tried.

    Returns the parsed dict on success, or None when no page satisfies the
    objective or an error occurs.
    """
    try:
        # Get top 3 links from the map result
        if not map_website:
            print(f"{Colors.RED}No links found to analyze.{Colors.RESET}")
            return None

        top_links = map_website[:3]
        print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}")

        for link in top_links:
            print(f"{Colors.YELLOW}Initiating scrape of page: {link}{Colors.RESET}")
            # Scrape the page as markdown.
            scrape_result = app.scrape_url(link, params={'formats': ['markdown']})
            print(f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}")

            # Check if objective is met
            check_prompt = f"""
            Given the following scraped content and objective, determine if the objective is met.
            If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible.
            If the objective is not met with confidence, respond with 'Objective not met'.

            Objective: {objective}
            Scraped content: {scrape_result['markdown']}

            Remember:
            1. Only return JSON if you are confident the objective is fully met.
            2. Keep the JSON structure as simple and flat as possible.
            3. Do not include any explanations or markdown formatting in your response.
            """

            completion = client.chat.completions.create(
                model="o3-mini",
                messages=[
                    {"role": "user", "content": [{"type": "text", "text": check_prompt}]},
                ],
            )

            # Models frequently add surrounding whitespace or wrap JSON in
            # markdown code fences despite the prompt's instructions; clean the
            # reply so the sentinel comparison and json.loads can succeed.
            result = completion.choices[0].message.content.strip()
            if result.startswith("```"):
                result = result.strip("`").strip()
                if result.lower().startswith("json"):
                    result = result[4:].strip()

            if result != "Objective not met":
                print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}")
                try:
                    return json.loads(result)
                except json.JSONDecodeError:
                    # Not valid JSON after all — move on to the next candidate.
                    print(f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}")
            else:
                print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}")

        print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}")
        return None

    except Exception as e:
        print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}")
        return None
150
+
151
# Main function to execute the process
def main():
    """Prompt for a site and objective, then run the map-and-analyze pipeline."""
    # Gather user input.
    url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}")
    objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}")

    print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}")
    # Step 1: locate candidate pages for the objective.
    map_website = find_relevant_page_via_map(objective, url, app, client)

    if not map_website:
        print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")
        return

    print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis using o3-mini...{Colors.RESET}")
    # Step 2: scrape the top candidates and attempt extraction.
    result = find_objective_in_top_pages(map_website, objective, app, client)

    if result:
        print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}")
        print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}")
    else:
        print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}")
173
+
174
# Script entry point: run the interactive crawl workflow only when executed
# directly (not when imported as a module).
if __name__ == "__main__":
    main()