import asyncio
import os
import re
from typing import Dict, List

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CacheMode,
    CrawlerMonitor,
    CrawlerRunConfig,
    DefaultMarkdownGenerator,
    DisplayMode,
    MemoryAdaptiveDispatcher,
    RoundRobinProxyStrategy,
)


def load_proxies_from_env() -> List[Dict]:
    """Load proxies from the PROXIES environment variable.

    Expected format: comma-separated "ip:port:username:password" entries.
    """
    proxies = []
    try:
        proxy_list = os.getenv("PROXIES", "").split(",")
        for proxy in proxy_list:
            if not proxy:
                continue
            ip, port, username, password = proxy.split(":")
            proxies.append({
                "server": f"http://{ip}:{port}",
                "username": username,
                "password": password,
                "ip": ip,  # Store the original IP for verification
            })
    except Exception as e:
        print(f"Error loading proxies from environment: {e}")
    return proxies
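
# Usage sketch (hypothetical values, TEST-NET addresses) -- set PROXIES before
# running, e.g.:
#
#   export PROXIES="203.0.113.10:8080:user:pass,203.0.113.11:8080:user:pass"
#
# load_proxies_from_env() then returns one dict per proxy, each carrying the
# "server", "username", "password", and "ip" keys used by the demos below.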


async def demo_proxy_rotation():
    """
    Proxy Rotation Demo using RoundRobinProxyStrategy
    =================================================
    Demonstrates proxy rotation using the strategy pattern.
    """
    print("\n=== Proxy Rotation Demo (Round Robin) ===")
    # Load proxies and create rotation strategy
    proxies = load_proxies_from_env()
    if not proxies:
        print("No proxies found in environment. Set PROXIES env variable!")
        return

    proxy_strategy = RoundRobinProxyStrategy(proxies)

    # Create configs
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=proxy_strategy,
    )

    # Test URLs
    urls = ["https://httpbin.org/ip"] * len(proxies)  # Test each proxy once
    async with AsyncWebCrawler(config=browser_config) as crawler:
        for url in urls:
            result = await crawler.arun(url=url, config=run_config)
            if result.success:
                # Extract IP from response
                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                current_proxy = run_config.proxy_config if run_config.proxy_config else None
                if current_proxy:
                    print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0) if ip_match else 'Not found'}")
                    verified = ip_match and ip_match.group(0) == current_proxy['ip']
                    if verified:
                        print(f"✅ Proxy working! IP matches: {current_proxy['ip']}")
                    else:
                        print("❌ Proxy failed or IP mismatch!")
            else:
                print(f"Request failed: {result.error_message}")


async def demo_proxy_rotation_batch():
    """
    Proxy Rotation Demo with Batch Processing
    =========================================
    Demonstrates proxy rotation using arun_many with a memory-adaptive dispatcher.
    """
    print("\n=== Proxy Rotation Batch Demo ===")
    try:
        # Load proxies and create rotation strategy
        proxies = load_proxies_from_env()
        if not proxies:
            print("No proxies found in environment. Set PROXIES env variable!")
            return

        proxy_strategy = RoundRobinProxyStrategy(proxies)

        # Configurations
        browser_config = BrowserConfig(headless=True, verbose=False)
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            proxy_rotation_strategy=proxy_strategy,
            markdown_generator=DefaultMarkdownGenerator(),
        )

        # Test URLs - multiple requests to test rotation
        urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice

        print("\n📈 Initializing crawler with proxy rotation...")
        async with AsyncWebCrawler(config=browser_config) as crawler:
            monitor = CrawlerMonitor(
                max_visible_rows=10,
                display_mode=DisplayMode.DETAILED,
            )
            dispatcher = MemoryAdaptiveDispatcher(
                memory_threshold_percent=80.0,
                check_interval=0.5,
                max_session_permit=1,  # Raise to len(proxies) to match concurrent sessions to proxy count
                # monitor=monitor,  # Optional: pass the monitor above for a live progress display
            )
print("\n🚀 Starting batch crawl with proxy rotation...")
results = await crawler.arun_many(
urls=urls,
config=run_config,
dispatcher=dispatcher
)
# Verify results
success_count = 0
for result in results:
if result.success:
ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
current_proxy = run_config.proxy_config if run_config.proxy_config else None
if current_proxy and ip_match:
print(f"URL {result.url}")
print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0)}")
verified = ip_match.group(0) == current_proxy['ip']
if verified:
print(f"✅ Proxy working! IP matches: {current_proxy['ip']}")
success_count += 1
else:
print("❌ Proxy failed or IP mismatch!")
print("---")
print(f"\n✅ Completed {len(results)} requests with {success_count} successful proxy verifications")
except Exception as e:
print(f"\n❌ Error in proxy rotation batch demo: {str(e)}")


if __name__ == "__main__":
    async def run_demos():
        # await demo_proxy_rotation()  # Original single-request demo
        await demo_proxy_rotation_batch()  # Batch processing demo

    asyncio.run(run_demos())