
Source Code for Module buildxml.plugins.studentenwerk

#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Contains a very simple plugin that spiders the webpage of the Studentenwerk
starting from the sitemap under
C{http://www.studentenwerk.uni-freiburg.de/index.php?id=272} .

@author: Johannes Schwenk
@copyright: 2010, Johannes Schwenk
@version: 2.0
@date: 2010-09-15


"""

import sys
import shutil

# Important! Re-enable sys.setdefaultencoding, which site.py deletes
# during interpreter startup (Python 2 idiom).
reload(sys)
sys.setdefaultencoding('utf-8')

from urlparse import urlsplit
from string import Template
from datetime import datetime, date

from tools.BeautifulSoup import Comment
from tools.functions import unescape
from xmlgetter.plugin import BaseSyncPlugin
from xmlgetter.xml import XMLEntry
from xmlgetter.spider import IndexSpider


# Config:
DEPTH = 3


class SyncPlugin_studentenwerk(BaseSyncPlugin):
    """
    Spiders the webpage of the Studentenwerk starting from the sitemap under
    C{http://www.studentenwerk.uni-freiburg.de/index.php?id=272} .

    Finds all URLs in the content region of the sitemap's HTML and recurses
    to a level of L{DEPTH}. It currently generates an L{XMLEntry} only for
    pages that have a C{content-type} of u'text/html' - indexing of PDF, DOC,
    PS, XSL and PPT files is not yet supported.

    """

    _index_spider = None
    """
    @ivar: An instance of L{IndexSpider}, which will handle the spidering.

    """

    def __init__(self, source_name, url, NO_NET=False):
        """
        Initialize the plugin.

        An L{IndexSpider} instance with the appropriate initial values is
        created and assigned to L{_index_spider}. During indexing we only
        want to consider the C{div} with an id of C{middle}, for content
        gathering as well as for link extraction.

        """
        BaseSyncPlugin.__init__(self, source_name, url, NO_NET)
        self._index_spider = IndexSpider(source_name, url,
                depth=DEPTH,
                content_selectors=[
                    [(u'div', {u'id': u'middle'}), ],
                ],
                mime_types=set([u'text/html']))

    def _getData(self):
        """
        We request our page data from the L{IndexSpider}, until it runs out
        of new pages. A page coming from the spider is guaranteed to have a
        MIME type we have requested and to be unique (no duplicates). So we
        can just focus on the extraction of the content and the generation
        of an L{XMLEntry} per page.

        @return: C{False}, if an error occurred and the data is unusable,
            C{True} otherwise.

        """

        # Initialize statistics.
        self._stats.entries = 0
        self._stats.new_entries = 0

        # Loop until we get no more pages from C{_index_spider}.
        while self._index_spider.hasMorePages():
            page = self._index_spider.getNextPage()
            if not page:
                continue  # Continue with the next page if we have no data.
            url, soup, content, headers, data = page
            if not headers or not content:
                continue  # Nothing to index, continue.

            # Extract the document's name on the server.
            document = urlsplit(url).path.split(u'/')[-1]
            if soup:
                e_title = u''.join(soup.find(u'title') or document)
            else:
                e_title = document

            # Only index content if the page was HTML. Other content would
            # need special attention here, and additional entries in the
            # mime_types list of L{_index_spider}.
            e_content = u' '.join([e
                    for e in content.recursiveChildGenerator()
                    if isinstance(e, unicode) and e != u'\n'
                    and not isinstance(e, Comment)
                    and e.parent.name not in [u'style', u'script']])

            # Unescape HTML entities, then escape the CDATA end sequence
            # so it cannot break the XML output.
            e_content = unescape(e_content)
            e_content = e_content.replace(u']]>', u']]&gt;')
            entry = XMLEntry(url, e_title, e_content,
                    created=str(datetime.now()),
                    creator=u'studentenwerk',
                    portal_type='#studentenwerk_entry',
                    sources=[u'studentenwerk'])
            self._entries.append(entry)

            # Update statistics.
            self._stats.entries += 1
            self._stats.static_entries += 1

            # Keep the memory profile low by flushing entries regularly.
            if len(self._entries) % 3 == 0:
                self._writeEntries()

        self.logger.debug(u'Writing remaining entries')
        self._writeEntries()

        if self._stats.entries == 0:
            self.logger.warn(u'No entries!')
            self._stats.messages.append(u'WARNING: No entries! This could be'
                    u' due to the webpage being unreachable!')
            self._stats.status = u'W'
            return False

        shutil.move(self._intermediate_temp_filename, self._temp_filename)
        return True
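
For reference, a minimal, hypothetical driver: the constructor signature
matches C{__init__} above, but how the buildxml framework actually
instantiates and runs its plugins is an assumption here.

    # Hypothetical driver - the framework normally runs plugins itself;
    # the source name and start URL are taken from this module's docstring.
    plugin = SyncPlugin_studentenwerk(
            u'studentenwerk',
            u'http://www.studentenwerk.uni-freiburg.de/index.php?id=272')
    if plugin._getData():
        print u'Spidering finished and entries written.'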