4 """
5 This module provides classes for spidering websites.
6
7 @author: Johannes Schwenk
8 @copyright: 2010, Johannes Schwenk
9 @version: 1.0
10 @date: 2010-09-15
11
12
13 """
14
15 import sys
16
17
18 reload(sys)
19 sys.setdefaultencoding('utf-8')
20
21 from urlparse import urlsplit, urlunparse
22 from hashlib import sha256
23 from httplib import HTTPConnection, HTTPException
24
25 from tools.BeautifulSoup import BeautifulSoup, Comment
26 from log import BaseLogger
27 from tools.functions import parse_date
28 from request import BaseRequester
29 from config import USER_AGENT, LAST_QUERY_DEFAULT
33 """
34 URL stack for spidering websites.
35
36 This is a simple stack, that takes an additional argument: the level of
37 the parent document of the element to be pushed. If the level of the
38 new element would exceed the limit, it is not added. It also keeps track
39 of popped elements and refuses to add an element that was already on the
40 stack in the past.
41
42 """

    _urls_info = None
    """
    @ivar: A list of dictionaries of the form C{{u'url': url, u'level': level}}.
    @type: list of dict

    """

    _urls = None
    """
    @ivar: The stack of only the URLs, not the level information. Required to
        check if the URL has been on the stack before.
    @type: list

    """

    _checked_urls = None
    """
    @ivar: A set of URLs that have already been on the stack.
    @type: set

    """


    _max_level = -1
    """
    @ivar: The maximum depth to which URLs should be accepted on the stack.
    @type: int

    """


    def __init__(self, source_name, max_level):
        """
        Initialize the URLStack.

        @param source_name: The name of the source.
        @param max_level: The maximum level from which to accept new elements
            for the stack.

        @type source_name: string
        @type max_level: int

        """
        BaseLogger.__init__(self, source_name)
        self._urls = []
        self._urls_info = []
        self._checked_urls = set()
        self._max_level = max_level


    def push(self, url, parent_level):
        """
        Push the URL on top of the stack, if C{parent_level < max_level} and
        C{url} has not already been checked. Otherwise this function will
        refuse to push the URL on the stack.

        @param url: The URL to push on the stack.
        @param parent_level: The level of the document in which the link
            was found.

        @type url: string
        @type parent_level: int

        """
        # A false-y max_level (e.g. 0) disables the depth limit.
        if (url not in self._checked_urls
                and (not self._max_level
                     or parent_level < self._max_level)):
            self._urls.append(url)
            self._urls_info.append(
                {u'url': url, u'level': parent_level + 1})


    def pop(self):
        """
        Pop the top element from the stack.

        @return: The URL information of the popped element.
        @rtype: dict

        """
        self._checked_urls.add(self._urls.pop())
        return self._urls_info.pop()


    def __len__(self):
        """
        Make C{len()} work on C{URLStack} instances.

        @return: The size of the stack.
        @rtype: int

        """
        return len(self._urls)


    @property
    def checked_urls(self):
        """
        Return the set of checked URLs.

        @return: The set of already checked (popped) URLs.
        @rtype: set of string

        """
        return self._checked_urls

155 """
156 A minimalistic webcrawler.
157
158 The most important function is L{getNextPage}. See the documentation there
159 for a detailed description of the retrieval process of a page.
160
161 """

    _urls = None
    """
    @ivar: The URL stack.
    @type: L{URLStack}

    """

    _start_url = None
    """
    @ivar: The starting point for the crawl.
    @type: string

    """

    _base_url = None
    """
    @ivar: The base (scheme and host) of the start URL.
    @type: string

    """

    _last_update = None
    """
    @ivar: The date of the last run.
    @type: datetime

    """

    _mime_types = None
    """
    @ivar: A whitelist of MIME types to be returned.
    @type: set

    """

    _etags = None
    """
    @ivar: A set of ETags for pages.
    @type: set

    """

    _md5hashes = None
    """
    @ivar: A set of MD5 hashes of the pages' content.
    @type: set

    """

    _content_selectors = None
    """
    @ivar: A list of content selectors.
    @type: list

    """

    _headers = None
    """
    @ivar: Dictionary of headers to send to the server.
    @type: dict

    """

    _hashes = None
    """
    @ivar: A set of sha256 hashes of the pages' content.
    @type: set

    """


    def __init__(self, source_name, start_url, depth=2,
                 content_selectors=None, mime_types=None, etags=None,
                 md5hashes=None, hashes=None, headers=None,
                 last_update=LAST_QUERY_DEFAULT):
239 """
240 Initialize the C{IndexSpider} and its variables.
241
242 @param source_name: The sources name.
243 @param start_url: The URL from which to start the crawling.
244 @param depth: The maximum depth to which the spider should follow links
245 into the page.
246 @param content_selectors: A list of content selectors, which are used
247 to get the relevant content region from the page.
248 @param mime_types: A set of acceptable MIME types.
249 @param etags: A set of ETags.
250 @param md5hashes: A set of MD5 hashes.
251 @param hashes: A set of sha256 hashes.
252 @param headers: A dictionary of header fields to send to the server.
253 @param last_update: The date of the last run.
254
255 @type source_name: string
256 @type start_url: string
257 @type depth: int
258 @type content_selectors: list
259 @type mime_types: set
260 @type etags: set
261 @type md5hashes: set
262 @type hashes: set
263 @type headers: dict
264 @type last_update: datetime
265
266
267 """
        BaseRequester.__init__(self, source_name)
        # Use fresh containers when none are supplied, to avoid the shared
        # mutable default argument pitfall.
        self._content_selectors = (content_selectors
                                   if content_selectors is not None else [])
        self._headers = (headers if headers is not None
                         else {u'User-Agent': USER_AGENT})
        self._hashes = hashes if hashes is not None else set()
        self._mime_types = mime_types if mime_types is not None else set()
        self._etags = etags if etags is not None else set()
        self._md5hashes = md5hashes if md5hashes is not None else set()
        self._start_url = start_url
        self._urls = URLStack(source_name, depth)
        self._urls.push(self._start_url, -1)
        split_url = urlsplit(start_url)
        self._base_url = (u'%s://%s'
                          % (split_url.scheme, split_url.netloc))
        self._last_update = last_update


    def hasMorePages(self):
        """
        Are there still pages to crawl?

        @return: Returns C{True} if the spider has more pages to crawl,
            C{False} otherwise.
        @rtype: bool

        """
        return len(self._urls) > 0


    def getNextPage(self):
        """
        Get the next page.

        Pops the top URL from the L{URLStack}, retrieves the page, filters
        for content, finds all links in the content that do not leave the
        site, pushes them on the stack, and finally returns a five-tuple
        containing C{url}, C{soup}, C{content}, C{head} and C{data}.

        C{soup} is an instance of L{BeautifulSoup.BeautifulSoup}; C{content}
        is one as well, but only for the part of the page that is specified
        through the L{_content_selectors} variable, while C{head} contains the
        HTTP headers of the server's response. Finally, C{data} contains the
        raw data of the server's response.

        If the server redirects, the spider follows and tries to retrieve
        the redirection's target. It does so only once, to avoid loops.

        If the MIME type of the retrieved page is not in L{_mime_types}, if
        the content's hash value is in L{_hashes} or L{_md5hashes}, or if the
        ETag is in L{_etags}, the function returns C{False}. Also, if the
        value of the Last-Modified header is earlier than L{_last_update}, it
        returns C{False}.

        Checking for the correct MIME type, the ETag and the Last-Modified
        header is performed on the result of an HTTP HEAD request.

        @return: Five-tuple or C{False}, see above.
        @rtype: tuple or bool

        """
        if len(self._urls) > 0:
            url = self._urls.pop()

            # Check the headers of a HEAD request first, before fetching
            # the whole page.
            head = self._getHead(url[u'url'])

            if (head and head.getheader(u'Content-Type')
                    and head.getheader(u'Content-Type').split(u';')[0]
                        not in self._mime_types):
                self.logger.debug(u'%s not in mime type list'
                                  % head.getheader(u'Content-Type'))
                return False

            # Follow a redirect once, re-checking the new target's MIME type.
            if (head and head.getheader(u'Location')
                    and (300 <= head.status <= 303
                         or head.status == 307)):
                self.logger.debug(u'Redirect')
                url[u'url'] = head.getheader(u'Location')
                head = self._getHead(url[u'url'])

                if (head and head.getheader(u'Content-Type')
                        and head.getheader(u'Content-Type').split(u';')[0]
                            not in self._mime_types):
                    self.logger.debug(u'%s not in mime type list'
                                      % head.getheader(u'Content-Type'))
                    return False

            # Skip the page if the caches indicate it has not changed.
            if head and head.status in [200, 203]:
                last_modified = parse_date(head.getheader(u'Last-Modified'))
                if last_modified and last_modified <= self._last_update:
                    self.logger.debug(u'Content cache hit: Last-Modified')
                    return False
                if (head.getheader(u'ETag')
                        and head.getheader(u'ETag') in self._etags):
                    self.logger.debug(u'Content cache hit: ETag')
                    return False
                if (head.getheader(u'Content-MD5')
                        and head.getheader(u'Content-MD5') in self._md5hashes):
                    self.logger.debug(u'Content cache hit: MD5')
                    return False
            if not head:
                self.logger.debug(u'No headers')
                return False

            # Fetch the page itself and hash the selected content to detect
            # documents that have already been seen.
            soup, content, head, data = (None, None, None, None)
            page = self._getPage(url[u'url'])
            if page:
                soup, content, head, data = page

            if content and head:
                sha = sha256(str(content)).hexdigest()
                if sha in self._hashes:
                    self.logger.debug(u'Content cache hit: sha256')
                    return False
                else:
                    self._hashes.add(sha)
                    self.logger.debug(u'sha256: %s' % sha)
            else:
                self.logger.debug(u'No content %s' % url[u'url'])


            # Collect links from the selected content and push those that
            # stay on the site.
            links = content and content.findAll(u'a', {u'href': True})
            if links:
                for link in links:
                    href = urlsplit(link[u'href'])
                    if (href.scheme == u'ftp'
                            or href.scheme == u'javascript'
                            or href.scheme == u'mailto'):
                        continue
                    found_url = link[u'href']
                    if not href.scheme == u'http':
                        # Not an absolute http URL; build an absolute URL.
                        if not found_url.startswith(u'.'):
                            # Interpret the href relative to the site root.
                            found_url = u'%s/%s' % (self._base_url, found_url)
                        else:
                            # Interpret the href relative to the current page.
                            found_url = u'%s/%s' % (url[u'url'], found_url)
                    if (not found_url.startswith(self._base_url)
                            and not found_url.startswith(
                                self._base_url.replace(u'http', u'https'))):
                        # The absolute URL points outside the site and is not
                        # pushed on the stack.
                        continue

                    self.logger.debug(u'Found url %s' % found_url)
                    self._urls.push(found_url, url[u'level'])


            return (url[u'url'], soup, content, head, data)
        else:
            self.logger.debug(u'No more urls on the stack')
            return False


    def _getHead(self, url):
        """
        Issues a HEAD request to the given URL.

        @param url: The URL to request.
        @type url: string

        @return: The response to the HEAD request, or C{None} if an error
            occurred.
        @rtype: C{HTTPResponse} or C{None}

        """
        res = None
        split_url = urlsplit(url)
        try:
            conn = HTTPConnection(split_url.netloc)
        except Exception, e:
            self.logger.debug(u'Error connecting to %s : %s' %
                              (split_url.netloc, e))
        else:
            try:
                url = urlunparse((u'', u'', split_url.path, u'',
                                  split_url.query, split_url.fragment))
                conn.request("HEAD", url, headers=self._headers)
            except Exception, e:
                self.logger.debug(u'Error requesting %s : %s' % (url, e))
            else:
                try:
                    res = conn.getresponse()
                except Exception, e:
                    self.logger.debug(u'Error reading response : %s' % e)

        return res


    def _getPage(self, url):
        """
        Get a page as a L{BeautifulSoup.BeautifulSoup} instance.

        @param url: The URL of the page to retrieve.
        @type url: string

        @return: A four-tuple containing the page wrapped as a C{BeautifulSoup}
            instance, the content also as a C{BeautifulSoup} instance, the
            headers of the response, and the raw data; C{False} on error.
        @rtype: tuple or C{False}

        """
        response = self._requestURL(url, headers=self._headers)
        if not response:
            self.logger.debug(u'No response')
            return False

        data = response.read()

        if not data:
            self.logger.debug(u'No data')
            return False
        soup = BeautifulSoup(data)
        if not soup:
            self.logger.debug(u'No soup')
            return False
        # Narrow the soup down to the configured content region: each
        # selector list holds alternative (name, attrs) pairs, of which the
        # first match is kept.
        content = soup
        for selectorlist in self._content_selectors:
            if not content:
                break
            container = content
            content = None
            for selector in selectorlist:
                content = container.find(selector[0], selector[1])
                if content:
                    break
        return (soup, content, response.headers, data)