


"""
Contains a very simple plugin that spiders the webpage of the Studentenwerk
starting from the sitemap under
C{http://www.studentenwerk.uni-freiburg.de/index.php?id=272} .

@author: Johannes Schwenk
@copyright: 2010, Johannes Schwenk
@version: 2.0
@date: 2010-09-15


"""

import sys
import shutil


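# Python 2: reload sys to regain setdefaultencoding() and force UTF-8 as the
# process-wide default string encoding.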
reload(sys)
sys.setdefaultencoding('utf-8')

from urlparse import urlsplit
from string import Template
from datetime import datetime, date

from tools.BeautifulSoup import Comment
from tools.functions import unescape
from xmlgetter.plugin import BaseSyncPlugin
from xmlgetter.xml import XMLEntry
from xmlgetter.spider import IndexSpider



DEPTH = 3
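"""
@var: Maximum recursion depth for the L{IndexSpider} when following links
    from the sitemap.

"""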


class StudentenwerkPlugin(BaseSyncPlugin):  # class name assumed (original declaration missing)
    """
    Spiders the webpage of the Studentenwerk starting from the sitemap under
    C{http://www.studentenwerk.uni-freiburg.de/index.php?id=272} .

    Finds all URLs in the content region of the sitemap's HTML and recurses
    to a level of L{DEPTH}. It currently generates an L{XMLEntry} only for
    pages that have a C{content-type} of u'text/html' - indexing of PDF, DOC,
    PS, XSL and PPT is not yet supported.

    """

    _index_spider = None
    """
    @ivar: An instance of L{IndexSpider}, which handles the spidering.

    """


    def __init__(self, source_name, url, NO_NET=False):
        """
        Initialize the plugin.

        An L{IndexSpider} instance with the appropriate initial values is
        created and assigned to L{_index_spider}. During indexing we only
        consider the C{div} with an id of C{middle} - for content
        gathering as well as for link extraction.

        """
        BaseSyncPlugin.__init__(self, source_name, url, NO_NET)
        self._index_spider = IndexSpider(source_name, url,
            depth=DEPTH, content_selectors=[
                [(u'div', {u'id': u'middle'}), ],
            ],
            mime_types=set([u'text/html']))


78 """
        We request our page data from the L{IndexSpider} until it runs out of
        new pages. Each page coming from the spider is guaranteed to have a
        MIME type we have requested and to be unique (no duplicates), so we
        can focus on extracting the content and generating an
        L{XMLEntry} per page.

        @return: C{False} if an error occurred and the data is unusable,
            C{True} otherwise.

        """

        self._stats.entries = 0
        self._stats.new_entries = 0


        while self._index_spider.hasMorePages():
            page = self._index_spider.getNextPage()
            if not page:
                continue
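            # A page is a tuple of (url, soup, content, headers, data).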
            url, soup, content, headers, data = page
            if not headers or not content:
                continue

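            # Derive a title from the page's <title> tag, falling back to the
            # last path segment of the URL.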
            document = urlsplit(url).path.split(u'/')[-1]
            if soup:
                e_title = u''.join(soup.find(u'title') or document)
            else:
                e_title = document


            """
            Only index content if the page was HTML.
            Other content would need special handling here, as well as
            additional entries in the mime_types list of L{_index_spider}.
            """
            e_content = u' '.join([e
                for e in content.recursiveChildGenerator()
                if isinstance(e, unicode) and e != u'\n'
                and not isinstance(e, Comment)
                and not e.parent.name in [u'style', u'script']])

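            # Unescape HTML entities and neutralise literal u']]>' sequences,
            # presumably so they cannot terminate a CDATA section in the
            # generated XML.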
            e_content = unescape(e_content)
            e_content = e_content.replace(u']]>', u']]&gt;')
            entry = XMLEntry(url, e_title, e_content,
                created=str(datetime.now()),
                creator=u'studentenwerk',
                portal_type='#studentenwerk_entry',
                sources=[u'studentenwerk'])
            self._entries.append(entry)


            self._stats.entries = self._stats.entries + 1
            self._stats.static_entries = self._stats.static_entries + 1

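            # Write out the collected entries in small batches.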
            if len(self._entries) % 3 == 0:
                self._writeEntries()

        self.logger.debug(u'Writing remaining entries')
        self._writeEntries()

        if self._stats.entries == 0:
            self.logger.warn(u'No entries!')
            self._stats.messages.append(u'WARNING: No entries! This could be'
                u' due to the webpage being unreachable!')
            self._stats.status = u'W'
            return False

        shutil.move(self._intermediate_temp_filename, self._temp_filename)
        return True