import datetime
import random
import copy
from collections import OrderedDict

from crawlfrontier import Backend


class MemoryBackend(Backend):
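    """In-memory frontier backend that stores pages in an OrderedDict keyed
    by link fingerprint. Crawl order is delegated to _sort_pages, which each
    subclass overrides with its own policy (FIFO, LIFO, DFS, BFS, random).
    """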
    name = 'Memory Backend'

    def __init__(self, manager):
        self.manager = manager
        self.pages = OrderedDict()

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def add_seeds(self, links):
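        """Add seed links to the frontier, creating a page per new
        fingerprint, and return the updated pages."""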
        # Log
        self.manager.logger.backend.debug('ADD_SEEDS n_links=%s' % len(links))

        pages = []
        for link in links:
            # Get timestamp
            now = datetime.datetime.utcnow()

            # Get or create page from link
            page, created = self._get_or_create_page_from_link(link, now)

            # Update add fields
            page.n_adds += 1
            page.last_update = now
            pages.append(page)

        # Return updated pages
        return pages

    def page_crawled(self, page, links):
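        """Mark a page as crawled and create pages for its extracted links,
        one level deeper than the crawled page."""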
        # Log
        self.manager.logger.backend.debug('PAGE_CRAWLED page=%s status=%s links=%s' %
                                          (page, page.status, len(links)))

        # Process the crawled page
        backend_page = self._page_crawled(page)

        # Update crawled fields
        backend_page.n_crawls += 1
        backend_page.state = self.manager.page_model.State.CRAWLED

        # Create pages for extracted links
        for link in links:
            self.manager.logger.backend.debug('ADD_LINK link=%s' % link)
            link_page, link_created = self._get_or_create_page_from_link(link, datetime.datetime.utcnow())
            if link_created:
                link_page.depth = page.depth + 1

        # Return updated page
        return backend_page

    def page_crawled_error(self, page, error):
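        """Mark a page as errored after a failed crawl attempt."""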
        # Log
        self.manager.logger.backend.debug('PAGE_CRAWLED_ERROR page=%s error=%s' % (page, error))

        # Process the crawled page
        backend_page = self._page_crawled(page)

        # Update error fields
        backend_page.n_errors += 1
        backend_page.state = self.manager.page_model.State.ERROR

        # Return updated page
        return backend_page

    def get_next_pages(self, max_next_pages):
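        """Return the next pages to crawl, ordered by _sort_pages and capped
        at max_next_pages, marking each one as queued."""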
77
+ # Log
78
+ self .manager .logger .backend .debug ('GET_NEXT_PAGES max_next_pages=%s' % max_next_pages )
79
+
80
+ now = datetime .datetime .utcnow ()
81
+ pages = [page for page in self .pages .values () if page .state == self .manager .page_model .State .NOT_CRAWLED ]
82
+ pages = self ._sort_pages (pages )
83
+ if max_next_pages :
84
+ pages = pages [0 :max_next_pages ]
85
+ for page in pages :
86
+ page .state = self .manager .page_model .State .QUEUED
87
+ page .n_queued += 1
88
+ page .last_update = now
89
+ return pages
90
+
91
+ def get_page (self , link ):
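        """Return the stored page for the link's fingerprint, or None."""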
        return self.pages.get(link.fingerprint, None)

    def _page_crawled(self, page):
        # Get timestamp
        now = datetime.datetime.utcnow()

        # Get or create page from incoming page
        backend_page, created = self._get_or_create_page_from_page(page, now)

        # Update creation fields
        if created:
            backend_page.created_at = now

        # Update fields
        backend_page.last_update = now
        backend_page.status = page.status

        return backend_page

    def _get_or_create_page_from_link(self, link, now):
        fingerprint = link.fingerprint
        if fingerprint not in self.pages:
            new_page = self.manager.page_model.from_link(link)
            self.pages[fingerprint] = new_page
            new_page.created_at = now
            self.manager.logger.backend.debug('Creating page %s from link %s' % (new_page, link))
            return new_page, True
        else:
            page = self.pages[fingerprint]
            self.manager.logger.backend.debug('Page %s exists' % page)
            return page, False

    def _get_or_create_page_from_page(self, page, now):
        fingerprint = page.fingerprint
        if fingerprint not in self.pages:
            new_page = copy.deepcopy(page)
            self.pages[fingerprint] = new_page
            new_page.created_at = now
            self.manager.logger.backend.debug('Creating page %s from page %s' % (new_page, page))
            return new_page, True
        else:
            self.manager.logger.backend.debug('Page %s exists' % page)
            return self.pages[fingerprint], False

    def _sort_pages(self, pages):
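        """Order candidate pages for crawling; subclasses must override."""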
        raise NotImplementedError


class MemoryFIFOBackend(MemoryBackend):
    name = 'FIFO Memory Backend'

    def _sort_pages(self, pages):
        return sorted(pages, key=lambda p: p.created_at)


class MemoryLIFOBackend(MemoryBackend):
    name = 'LIFO Memory Backend'

    def _sort_pages(self, pages):
        return sorted(pages, key=lambda p: p.created_at, reverse=True)


class MemoryDFSBackend(MemoryBackend):
    name = 'DFS Memory Backend'

    def _sort_pages(self, pages):
        return sorted(pages, key=lambda p: p.depth, reverse=True)


class MemoryBFSBackend(MemoryBackend):
    name = 'BFS Memory Backend'

    def _sort_pages(self, pages):
        return sorted(pages, key=lambda p: p.depth)


class MemoryRandomBackend(MemoryBackend):
    name = 'RANDOM Memory Backend'

    def _sort_pages(self, pages):
        random.shuffle(pages)
        return pages


BASE = MemoryBackend
FIFO = MemoryFIFOBackend
LIFO = MemoryLIFOBackend
DFS = MemoryDFSBackend
BFS = MemoryBFSBackend
RANDOM = MemoryRandomBackend
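
# A minimal, illustrative sketch of how the ordering policies differ. The
# `_DemoPage` class below is a hypothetical stand-in for the frontier's real
# page model, exposing only the `depth` attribute that _sort_pages reads;
# None is passed as the manager purely for illustration, since _sort_pages
# never touches it.
if __name__ == '__main__':
    class _DemoPage(object):
        def __init__(self, depth):
            self.depth = depth

        def __repr__(self):
            return '<page depth=%s>' % self.depth

    demo = [_DemoPage(d) for d in (2, 0, 1)]
    # DFS prefers the deepest pages first, BFS the shallowest
    print(MemoryDFSBackend(None)._sort_pages(demo))  # depths 2, 1, 0
    print(MemoryBFSBackend(None)._sort_pages(demo))  # depths 0, 1, 2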