buildxml.plugins.studentenwerk

40 """ 41 Spiders the webpage of the Studentenwerk starting from the sitemap under 42 C{http://www.studentenwerk.uni-freiburg.de/index.php?id=272} . 43 44 Finds all URLs in the content region of the sitemap's HTML and recurses 45 to a level of L{DEPTH}. It will currently generate a L{XMLEntry} only for 46 pages that have a C{content-type} of u'text/html' - indexing of PDF, DOC, 47 PS, XSL, PPT is not yet supported. 48 49 """ 50 51 _index_spider = None 52 """ 53 @ivar: A instance of L{IndexSpider}, which will handle the spidering. 54 55 """ 56 57

58 - def __init__(self, source_name, url, NO_NET=False):

59 """ 60 Initialize the plugin. 61 62 An L{IndexSpider} instance with the appropriate initial values is 63 created and assigned to L{_index_spider}. During indexing we only 64 want to consider the C{div} with an id of C{middle} - for content 65 gathering as for link extraction alike. 66 67 """ 68 BaseSyncPlugin.__init__(self, source_name, url, NO_NET) 69 self._index_spider = IndexSpider(source_name, url, 70 depth=DEPTH, content_selectors=[ 71 [(u'div', {u'id': u'middle'}),], 72 ], 73 mime_types=set([u'text/html']))

74 75 76

77 - def _getData(self):

78 """ 79 We request our page data from the L{IndexSpider}, until it runs out of 80 new pages. The page coming from the spider is guaranteed to have a 81 MIME type we have requested and to be unique (no duplicates). So we 82 can just focus on the extraction of the content and the generation 83 of an L{XMLEntry} per page. 84 85 @return: C{False}, if an error occured and the data is unusable, 86 C{True} otherwise. 87 88 """ 89 90 # Initialize statistics. 91 self._stats.entries = 0 92 self._stats.new_entries = 0 93 94 95 # Loop until we get no page from C{_index_spider} 96 while self._index_spider.hasMorePages(): 97 page = self._index_spider.getNextPage() 98 if not page: 99 continue # Continue with next page, if we do not have data. 100 url, soup, content, headers, data = page 101 if not headers or not content: 102 continue # Nothing to index, continue. 103 104 # Extract the documents name on the server. 105 document = urlsplit(url).path.split(u'/')[-1] 106 if soup: 107 e_title = u''.join(soup.find(u'title') or document) 108 else: 109 e_title = document 110 111 112 """ 113 Only indexing content if page was html. 114 Other content would need special attention here, and additional 115 entries in the mime_types list of L{_index_spider}. 116 """ 117 e_content = u' '.join([e 118 for e in content.recursiveChildGenerator() 119 if isinstance(e, unicode) and e != u'\n' 120 and not isinstance(e, Comment) 121 and not e.parent.name in [u'style', u'script']]) 122 123 124 # Unescape HTML-Entities 125 e_content = unescape(e_content) 126 e_content = e_content.replace(u']]>', u']]>') 127 entry = XMLEntry(url, e_title, e_content, 128 created=str(datetime.now()), 129 creator=u'studentenwerk', 130 portal_type='#studentenwerk_entry', 131 sources=[u'studentenwerk']) 132 self._entries.append(entry) 133 134 # Update statistics. 135 self._stats.entries = self._stats.entries + 1 136 self._stats.static_entries = self._stats.static_entries + 1 137 138 # Keep memory profile low. 139 if len(self._entries) % 3 == 0: 140 self._writeEntries() 141 142 self.logger.debug(u'Writing remaining entries') 143 self._writeEntries() 144 145 if self._stats.entries == 0: 146 self.logger.warn(u'No entries!') 147 self._stats.messages.append(u'WARNING: No entries! This could be' 148 u' due to the webpage beeing unreachable!') 149 self._stats.status = u'W' 150 return False 151 152 shutil.move(self._intermediate_temp_filename, self._temp_filename) 153 return True

Source Code for Module buildxml.plugins.studentenwerk