
Source Code for Module buildxml.xmlgetter.spider

#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
This module provides classes for spidering websites.

@author: Johannes Schwenk
@copyright: 2010, Johannes Schwenk
@version: 1.0
@date: 2010-09-15

"""

import sys

# Important! Reset the default encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

from urlparse import urlsplit, urlunparse
from hashlib import sha256
from httplib import HTTPConnection, HTTPException

from tools.BeautifulSoup import BeautifulSoup, Comment
from log import BaseLogger
from tools.functions import parse_date
from request import BaseRequester
from config import USER_AGENT, LAST_QUERY_DEFAULT


class URLStack(BaseLogger):
    """
    URL stack for spidering websites.

    This is a simple stack that takes an additional argument: the level of
    the parent document of the element to be pushed. If the level of the
    new element would exceed the limit, it is not added. It also keeps track
    of popped elements and refuses to add an element that was already on the
    stack in the past.

    """

    _urls_info = None
    """
    @ivar: A list of dictionaries of the form
        C{{u'url': url, u'level': level}}.
    @type: list of dict

    """

    _urls = None
    """
    @ivar: The stack of only the URLs, without the level information.
        Required to check whether a URL has been on the stack before.
    @type: list

    """

    _checked_urls = None
    """
    @ivar: A set of URLs that have already been on the stack.
    @type: set

    """

    _max_level = -1
    """
    @ivar: The maximum depth up to which URLs are accepted on the stack.
    @type: int

    """

    def __init__(self, source_name, max_level):
        """
        Initialize the URLStack.

        @param source_name: The name of the source.
        @param max_level: The maximum depth up to which new elements are
            accepted on the stack.

        @type source_name: string
        @type max_level: int

        """
        BaseLogger.__init__(self, source_name)
        self._urls = []
        self._urls_info = []
        self._checked_urls = set()
        self._max_level = max_level

    def push(self, url, parent_level):
        """
        Push the URL on top of the stack, if C{parent_level < max_level} and
        C{url} has not already been checked. Otherwise this function refuses
        to push the URL on the stack.

        @param url: The URL to push on the stack.
        @param parent_level: The level of the document in which the link
            was found.

        @type url: string
        @type parent_level: int

        """
        if url not in self._checked_urls and (parent_level < self._max_level
                or not self._max_level):
            #self.logger.debug(u'Parent level: %s' % parent_level)
            #self.logger.debug(u'Maximum level: %s' % self._max_level)
            self._urls.append(url)
            self._urls_info.append(
                    {u'url': url, u'level': parent_level + 1})

    def pop(self):
        """
        Pop the top element from the stack.

        @return: The URL information of the popped element.
        @rtype: dict

        """
        self._checked_urls.add(self._urls.pop())
        return self._urls_info.pop()

    def __len__(self):
        """
        Make C{len()} work on C{URLStack}s.

        @return: The size of the stack.
        @rtype: int

        """
        return len(self._urls)

    @property
    def checked_urls(self):
        """
        Return the set of checked URLs.

        @return: The set of already checked (popped) URLs.
        @rtype: set of string

        """
        return self._checked_urls
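

# Minimal usage sketch for URLStack (illustrative only; the source name and
# URLs are made-up examples):
def _example_urlstack_usage():
    stack = URLStack(u'example_source', max_level=1)
    stack.push(u'http://www.example.com/', -1)    # start URL enters at level 0
    info = stack.pop()                            # {u'url': ..., u'level': 0}
    stack.push(u'http://www.example.com/a', info[u'level'])  # level 1, accepted
    stack.push(u'http://www.example.com/b', 1)    # parent at max_level, refused
    stack.push(u'http://www.example.com/', 0)     # already popped, refused
    return len(stack)                             # one URL left on the stack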


class IndexSpider(BaseRequester):
    """
    A minimalistic web crawler.

    The most important function is L{getNextPage}. See the documentation
    there for a detailed description of the retrieval process of a page.

    """

    _urls = None
    """
    @ivar: The URL stack.
    @type: L{URLStack}

    """

    _start_url = None
    """
    @ivar: The starting point for the crawl.
    @type: string

    """

    _base_url = None
    """
    @ivar: The base of the start URL (scheme and network location).
    @type: string

    """

    _last_update = None
    """
    @ivar: The date of the last run.
    @type: datetime

    """

    _mime_types = None
    """
    @ivar: A white list of MIME types to be returned.
    @type: set

    """

    _etags = None
    """
    @ivar: A set of ETags for pages.
    @type: set

    """

    _md5hashes = None
    """
    @ivar: A set of MD5 hashes of page contents.
    @type: set

    """

    _content_selectors = None
    """
    @ivar: A list of content selectors.
    @type: list

    """

    _headers = None
    """
    @ivar: Dictionary of headers to send to the server.
    @type: dict

    """

    _hashes = None
    """
    @ivar: A set of sha256 hashes of page contents.
    @type: set

    """

    def __init__(self, source_name, start_url, depth=2,
            content_selectors=[], mime_types=set(), etags=set(),
            md5hashes=set(), hashes=set(),
            headers={u'User-Agent': USER_AGENT},
            last_update=LAST_QUERY_DEFAULT):
        """
        Initialize the C{IndexSpider} and its variables.

        @param source_name: The source's name.
        @param start_url: The URL from which to start the crawl.
        @param depth: The maximum depth to which the spider should follow
            links into the site.
        @param content_selectors: A list of content selectors, which are used
            to extract the relevant content region from the page.
        @param mime_types: A set of acceptable MIME types.
        @param etags: A set of ETags.
        @param md5hashes: A set of MD5 hashes.
        @param hashes: A set of sha256 hashes.
        @param headers: A dictionary of header fields to send to the server.
        @param last_update: The date of the last run.

        @type source_name: string
        @type start_url: string
        @type depth: int
        @type content_selectors: list
        @type mime_types: set
        @type etags: set
        @type md5hashes: set
        @type hashes: set
        @type headers: dict
        @type last_update: datetime

        """
        BaseRequester.__init__(self, source_name)
        self._content_selectors = content_selectors
        self._headers = headers
        self._hashes = hashes
        self._mime_types = mime_types
        self._etags = etags
        self._md5hashes = md5hashes
        self._start_url = start_url
        self._urls = URLStack(source_name, depth)
        self._urls.push(self._start_url, -1)
        split_url = urlsplit(start_url)
        self._base_url = (u'%s://%s'
                % (split_url.scheme, split_url.netloc))
        self._last_update = last_update

    def hasMorePages(self):
        """
        Are there still pages to crawl?

        @return: C{True} if the spider has more pages to crawl,
            C{False} otherwise.
        @rtype: bool

        """
        return len(self._urls) > 0

    def getNextPage(self):
        """
        Get the next page.

        Pops the next URL from the L{URLStack}, retrieves the page, filters
        it for content, finds all links in the content that do not leave the
        site and pushes them on the stack, and finally returns a five-tuple
        containing C{url}, C{soup}, C{content}, C{head} and C{data}.

        C{soup} is an instance of L{BeautifulSoup.BeautifulSoup}; C{content}
        is as well, but only for the part of the page that is specified
        through the L{_content_selectors} variable, while C{head} contains
        the HTTP headers of the server's response. Finally, C{data} contains
        the raw data of the server's response.

        If the server redirects, the spider follows and tries to retrieve
        the redirection's target. It does so only once, to avoid loops.

        If the MIME type of the retrieved page is not in L{_mime_types}, if
        the content's hash value is in L{_hashes} or L{_md5hashes}, or if
        the ETag is in L{_etags}, the function returns C{False}. It also
        returns C{False} if the value of the Last-Modified header is not
        later than L{_last_update}.

        The checks for the MIME type, the ETag and the Last-Modified header
        are performed on the result of an HTTP HEAD request.

        @return: Five-tuple or C{False}, see above.
        @rtype: tuple or bool

        """
        if len(self._urls) > 0:
            url = self._urls.pop()

            # Do a HEAD request first, to minimize the load on the server.
            head = self._getHead(url[u'url'])

            if (head and head.getheader(u'Content-Type')
                    and not head.getheader(u'Content-Type').split(u';')[0]
                    in self._mime_types):
                self.logger.debug(u'%s not in mime type list'
                        % head.getheader(u'Content-Type'))
                return False

            # If the header redirects, follow once only.
            if (head and head.getheader(u'Location')
                    and (head.status >= 300
                        and head.status <= 303
                        or head.status == 307)):
                self.logger.debug(u'Redirect')
                url[u'url'] = head.getheader(u'Location')
                head = self._getHead(url[u'url'])

                if (head and head.getheader(u'Content-Type')
                        and not head.getheader(u'Content-Type').split(u';')[0]
                        in self._mime_types):
                    self.logger.debug(u'%s not in mime type list'
                            % head.getheader(u'Content-Type'))
                    return False

            if head and head.status in [200, 203]:
                last_modified = parse_date(head.getheader(u'Last-Modified'))
                if last_modified and last_modified <= self._last_update:
                    self.logger.debug(u'Content cache hit: Last-Modified')
                    return False
                if (head.getheader(u'ETag')
                        and head.getheader(u'ETag') in self._etags):
                    self.logger.debug(u'Content cache hit: ETag')
                    return False
                if (head.getheader(u'Content-MD5')
                        and head.getheader(u'Content-MD5') in self._md5hashes):
                    self.logger.debug(u'Content cache hit: MD5')
                    return False
            if not head:
                self.logger.debug(u'No headers')
                return False

            soup, content, head, data = (None, None, None, None)
            page = self._getPage(url[u'url'])
            if page:
                soup, content, head, data = page

            if content and head:
                sha = sha256(str(content)).hexdigest()
                if sha in self._hashes:
                    self.logger.debug(u'Content cache hit: sha256')
                    return False
                else:
                    self._hashes.add(sha)
                    self.logger.debug(u'sha256: %s' % sha)
            else:
                self.logger.debug(u'No content %s' % url[u'url'])

            # Harvest links and push them on the stack.
            links = content and content.findAll(u'a', {u'href': True})
            if links:
                for link in links:
                    href = urlsplit(link[u'href'])
                    if (href.scheme == u'ftp'
                            or href.scheme == u'javascript'
                            or href.scheme == u'mailto'):
                        continue
                    found_url = link[u'href']
                    if href.scheme not in (u'http', u'https'):
                        # Not an absolute URL.
                        if not found_url.startswith(u'.'):
                            # Link relative to the site root encountered.
                            found_url = u'%s/%s' % (self._base_url, found_url)
                        else:
                            # Link relative to the current URL encountered.
                            found_url = u'%s/%s' % (url[u'url'], found_url)
                    if (not found_url.startswith(self._base_url)
                            and not found_url.startswith(
                                self._base_url.replace(u'http', u'https'))):
                        # The absolute URL points outside the site, so it is
                        # not pushed on the stack.
                        continue

                    self.logger.debug(u'Found url %s' % found_url)
                    self._urls.push(found_url, url[u'level'])

            # We have data to return.
            return (url[u'url'], soup, content, head, data)
        else:
            self.logger.debug(u'No more urls on the stack')
            return False

    def _getHead(self, url):
        """
        Issue a HEAD request to the given URL.

        @return: The response to the HEAD request, or C{None} if an error
            occurred.
        @rtype: C{HTTPResponse} or C{None}

        """
        res = None
        split_url = urlsplit(url)
        try:
            conn = HTTPConnection(split_url.netloc)
        except Exception, e:
            self.logger.debug(u'Error connecting to %s : %s' %
                    (split_url.netloc, e))
        else:
            try:
                url = urlunparse((u'', u'', split_url.path, u'',
                        split_url.query, split_url.fragment))
                conn.request("HEAD", url, headers=self._headers)
            except Exception, e:
                self.logger.debug(u'Error requesting %s : %s' % (url, e))
            else:
                try:
                    res = conn.getresponse()
                except Exception, e:
                    self.logger.debug(u'Error reading response : %s' % e)

        return res

    def _getPage(self, url):
        """
        Get a page as a L{BeautifulSoup.BeautifulSoup} instance.

        @return: A four-tuple containing the page wrapped as a
            C{BeautifulSoup} instance, the content also as a
            C{BeautifulSoup} instance, the headers of the response, and the
            raw data.
        @rtype: tuple or C{False}

        """
        response = self._requestURL(url, headers=self._headers)
        if not response:
            self.logger.debug(u'No response')
            return False

        data = response.read()

        if not data:
            self.logger.debug(u'No data')
            return False
        soup = BeautifulSoup(data)
        if not soup:
            self.logger.debug(u'No soup')
            return False
        content = soup
        for selectorlist in self._content_selectors:
            if not content:
                break
            # Try the alternative selectors in this list until one matches.
            found = None
            for selector in selectorlist:
                found = content.find(selector[0], selector[1])
                if found:
                    break
            content = found
        return (soup, content, response.headers, data)
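

# Illustrative sketch of the content selector format consumed by _getPage
# (the tag names and attributes below are made-up examples): the outer list
# narrows the document step by step, and each inner list holds alternative
# (tag, attributes) pairs that are tried until one matches.
EXAMPLE_CONTENT_SELECTORS = [
    # First narrow to the page's main wrapper - either variant may match.
    [(u'div', {u'id': u'main'}), (u'div', {u'class': u'main'})],
    # Then narrow to the article body inside the wrapper.
    [(u'div', {u'class': u'article'})],
]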
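

# Minimal crawl-loop sketch for IndexSpider (illustrative only; the source
# name, start URL, MIME type and content selector are made-up examples):
def _example_crawl():
    spider = IndexSpider(
            u'example_source',
            u'http://www.example.com/index.html',
            depth=2,
            content_selectors=[[(u'div', {u'id': u'content'})]],
            mime_types=set([u'text/html']))
    pages = []
    while spider.hasMorePages():
        page = spider.getNextPage()
        if not page:
            # The page was filtered out: wrong MIME type, cache hit, or a
            # request error.
            continue
        url, soup, content, head, data = page
        pages.append((url, content))
    return pages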