Commit 13fcff7

Initial commit from previous bitbucket repo

1 parent a61710f

File tree: 123 files changed, +7371 −0 lines changed


LICENSE (+27)

@@ -0,0 +1,27 @@
Copyright (c) Crawl Frontier developers.
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

3. Neither the name of Crawl Frontier nor the names of its contributors may
   be used to endorse or promote products derived from this software without
   specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

README.md (+5)

@@ -0,0 +1,5 @@
# Installation #

pip install -r requirements.txt --allow-external pydot --allow-unverified pydot

python setup.py install
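
After installing, a quick sanity check is to import the top-level names. This snippet is a sketch and not part of the commit; it assumes only the re-exports shown in the `crawlfrontier/__init__.py` diff below:

```python
# Hypothetical post-install check: these names come straight from the
# crawlfrontier/__init__.py diff in this commit.
from crawlfrontier import FrontierManager, Settings, Backend
```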

crawlfrontier/VERSION (+1)

@@ -0,0 +1 @@
0.7

crawlfrontier/__init__.py (+6)

@@ -0,0 +1,6 @@
```python
from core.manager import FrontierManager
from core.models import Model, Page, Link
from core.components import Backend, Middleware
from settings import Settings
from utils import graphs
from utils.tester import FrontierTester
```
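
The package's top level re-exports the pieces a crawler integration touches, with `Backend` as the extension point. As a hedged sketch of what a minimal custom backend looks like against this API (the hook names are taken from the `MemoryBackend` diff further down; the class itself is hypothetical and not part of the commit):

```python
from crawlfrontier import Backend

class NullBackend(Backend):
    """Hypothetical do-nothing backend illustrating the hooks that the
    memory backend below implements."""
    name = 'Null Backend'

    @classmethod
    def from_manager(cls, manager):
        return cls()

    def add_seeds(self, links):
        return []            # accept seeds, track nothing

    def page_crawled(self, page, links):
        return page          # ignore extracted links

    def page_crawled_error(self, page, error):
        return page

    def get_next_pages(self, max_next_pages):
        return []            # never schedules anything
```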

crawlfrontier/contrib/__init__.py

Whitespace-only changes.

crawlfrontier/contrib/backends/__init__.py

Whitespace-only changes.

crawlfrontier/contrib/backends/memory/__init__.py

Whitespace-only changes.
@@ -0,0 +1,180 @@
```python
import datetime
import random
import copy
from collections import OrderedDict

from crawlfrontier import Backend


class MemoryBackend(Backend):
    name = 'Memory Backend'

    def __init__(self, manager):
        self.manager = manager
        self.pages = OrderedDict()

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def add_seeds(self, links):
        # Log
        self.manager.logger.backend.debug('ADD_SEEDS n_links=%s' % len(links))

        pages = []
        for link in links:
            # Get timestamp
            now = datetime.datetime.utcnow()

            # Get or create page from link
            page, created = self._get_or_create_page_from_link(link, now)

            # Update add fields
            page.n_adds += 1
            page.last_update = now
            pages.append(page)

        # Return updated pages
        return pages

    def page_crawled(self, page, links):
        # Log
        self.manager.logger.backend.debug('PAGE_CRAWLED page=%s status=%s links=%s' %
                                          (page, page.status, len(links)))

        # Process page crawled
        backend_page = self._page_crawled(page)

        # Update crawled fields
        backend_page.n_crawls += 1
        backend_page.state = self.manager.page_model.State.CRAWLED

        # Create links
        for link in links:
            self.manager.logger.backend.debug('ADD_LINK link=%s' % link)
            link_page, link_created = self._get_or_create_page_from_link(link, datetime.datetime.utcnow())
            if link_created:
                link_page.depth = page.depth + 1

        # Return updated page
        return backend_page

    def page_crawled_error(self, page, error):
        # Log
        self.manager.logger.backend.debug('PAGE_CRAWLED_ERROR page=%s error=%s' % (page, error))

        # Process page crawled
        backend_page = self._page_crawled(page)

        # Update error fields
        backend_page.n_errors += 1
        backend_page.state = self.manager.page_model.State.ERROR

        # Return updated page
        return backend_page

    def get_next_pages(self, max_next_pages):
        # Log
        self.manager.logger.backend.debug('GET_NEXT_PAGES max_next_pages=%s' % max_next_pages)

        now = datetime.datetime.utcnow()
        pages = [page for page in self.pages.values() if page.state == self.manager.page_model.State.NOT_CRAWLED]
        pages = self._sort_pages(pages)
        if max_next_pages:
            pages = pages[0:max_next_pages]
        for page in pages:
            page.state = self.manager.page_model.State.QUEUED
            page.n_queued += 1
            page.last_update = now
        return pages

    def get_page(self, link):
        return self.pages.get(link.fingerprint, None)

    def _page_crawled(self, page):
        # Get timestamp
        now = datetime.datetime.utcnow()

        # Get or create page from incoming page
        backend_page, created = self._get_or_create_page_from_page(page, now)

        # Update creation fields
        if created:
            backend_page.created_at = now

        # Update fields
        backend_page.last_update = now
        backend_page.status = page.status

        return backend_page

    def _get_or_create_page_from_link(self, link, now):
        fingerprint = link.fingerprint
        if fingerprint not in self.pages:
            new_page = self.manager.page_model.from_link(link)
            self.pages[fingerprint] = new_page
            new_page.created_at = now
            self.manager.logger.backend.debug('Creating page %s from link %s' % (new_page, link))
            return new_page, True
        else:
            page = self.pages[fingerprint]
            self.manager.logger.backend.debug('Page %s exists' % page)
            return page, False

    def _get_or_create_page_from_page(self, page, now):
        fingerprint = page.fingerprint
        if fingerprint not in self.pages:
            new_page = copy.deepcopy(page)
            self.pages[fingerprint] = new_page
            new_page.created_at = now
            self.manager.logger.backend.debug('Creating page %s from page %s' % (new_page, page))
            return new_page, True
        else:
            self.manager.logger.backend.debug('Page %s exists' % page)
            return self.pages[fingerprint], False

    def _sort_pages(self, pages):
        raise NotImplementedError


class MemoryFIFOBackend(MemoryBackend):
    name = 'FIFO Memory Backend'

    def _sort_pages(self, pages):
        return sorted(pages, key=lambda p: p.created_at)


class MemoryLIFOBackend(MemoryBackend):
    name = 'LIFO Memory Backend'

    def _sort_pages(self, pages):
        return sorted(pages, key=lambda p: p.created_at, reverse=True)


class MemoryDFSBackend(MemoryBackend):
    name = 'DFS Memory Backend'

    def _sort_pages(self, pages):
        return sorted(pages, key=lambda p: p.depth, reverse=True)


class MemoryBFSBackend(MemoryBackend):
    name = 'BFS Memory Backend'

    def _sort_pages(self, pages):
        return sorted(pages, key=lambda p: p.depth)


class MemoryRandomBackend(MemoryBackend):
    name = 'RANDOM Memory Backend'

    def _sort_pages(self, pages):
        random.shuffle(pages)
        return pages


BASE = MemoryBackend
FIFO = MemoryFIFOBackend
LIFO = MemoryLIFOBackend
DFS = MemoryDFSBackend
BFS = MemoryBFSBackend
RANDOM = MemoryRandomBackend
```
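
The ordering strategy is the only thing the concrete subclasses change: `get_next_pages` filters to NOT_CRAWLED pages, delegates ordering to `_sort_pages`, and marks the returned slice QUEUED. A minimal sketch of a further strategy built the same way (this subclass is hypothetical and not part of the commit; it uses only `depth` and `created_at`, which the code above maintains):

```python
# Hypothetical example: BFS order with FIFO tie-breaking, built exactly
# like the subclasses above.
class MemoryBFSFIFOBackend(MemoryBackend):
    name = 'BFS/FIFO Memory Backend'

    def _sort_pages(self, pages):
        # Shallower pages first; among pages at the same depth,
        # the earliest-created page wins.
        return sorted(pages, key=lambda p: (p.depth, p.created_at))
```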
