# -*- coding: utf-8 -*-

"""
Contains a plugin to get all publications from the Forschungsdatenbank
at C{http://forschdb.verwaltung.uni-freiburg.de} .


@author: Johannes Schwenk
@copyright: 2010, Johannes Schwenk
@version: 1.0
@date: 2010-09-15


"""

import sys
import shutil


reload(sys)
sys.setdefaultencoding('utf-8')

import codecs

from string import Template
from datetime import datetime

from tools.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
from xmlgetter.plugin import BaseSyncPlugin
from xmlgetter.xml import XMLEntry


class ForschDBPlugin(BaseSyncPlugin):  # class name assumed
    """
    Finds all publications in the Forschungsdatenbank from all 11 faculties
    and the Universitätsklinikum.

    The plugin needs a template url entry in L{config.PLUGINS}::

        ...
        {u'name': u'forschdb',
         u'url':
            (u'http://forschdb.verwaltung.uni-freiburg.de/servuni/'
             u'forschdbuni.fdbfbr1?Fakultaet=${fac}&Dokumentart='
             u'Publikation&Ausgabeart=xml&Jahr=1900-${to_year}')},
        ...

    It will then replace C{${to_year}} with the current year and generate a
    list of 13 URLs, replacing C{${fac}} once with 99 and the other 12 times
    with the values 0 through 11. These URLs are then queried, each resulting
    in an XML document with all publication entries for the faculty C{fac}
    from the year C{1900} until now.

    The content of each C{<publication>} entry is then parsed with
    L{BeautifulSoup} and an L{XMLEntry} is produced. The content of the
    XMLEntry is formatted according to common citation rules, which presently
    distinguish five different types of publications:

        - "Buchbeitrag"
        - "Monografie und Herausgeberschrift"
        - "Edition und Uebersetzung"
        - "Sonstiges"
        - "Artikel"

    "Artikel" is the catch-all for unknown types of publications, of which
    there are presently none.

    @todo:
        Have a look at memory consumption and optimize!

    @note:
        All authors of a publication are listed in the content of the
        L{XMLEntry} to increase findability. This does not, however, conform
        to standard citation rules. Since the Forschungsdatenbank also uses
        this citation style for authors, it increases coherence.

    @note:
        The function L{_getData} uses German variable names to be coherent
        with the naming of the XML elements it processes.

    """
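
    # Example: with the template URL from L{config.PLUGINS} above,
    #   Template(url).substitute(fac=u'99', to_year=2010)
    # yields
    #   http://forschdb.verwaltung.uni-freiburg.de/servuni/forschdbuni.fdbfbr1?Fakultaet=99&Dokumentart=Publikation&Ausgabeart=xml&Jahr=1900-2010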

    def _extractTagData(self, tag, tagname=None):
        """
        Extracts data from a C{BeautifulSoup.Tag} instance.

        @param tag: The root tag from which to search for the data.
        @param tagname: The name of the tag that contains the data. If
            C{tagname} is C{None}, the data will be extracted from
            C{tag} itself.
        @type tag: BeautifulSoup.Tag
        @type tagname: string
        @return: A string representing the found data, or C{None} if no data
            could be found.

        """
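        # Example (hypothetical markup): for a fragment like
        #   <erscheinung><ort>Freiburg</ort></erscheinung>
        # parsed with BeautifulStoneSoup, _extractTagData(erscheinung, u'ort')
        # returns u'Freiburg'; a missing or empty <ort> tag yields None.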
        ntag = tag
        if tagname:
            ntag = ntag and ntag.find(tagname)
        ntag = (ntag
                and ntag.contents
                and len(ntag.contents) > 0
                and ntag.contents[0]
                or None)
        if not isinstance(ntag, basestring):
            ntag = None
        return ntag


    def _extractAuthor(self, tag):
        """
        Extracts author data from a C{BeautifulSoup.Tag} instance.

        @param tag: The root tag from which to search for the data.
        @type tag: BeautifulSoup.Tag
        @return: All authors of the publication, concatenated and separated
            by ', '. If no author could be found, C{None} is returned.

        """
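        # Example (hypothetical markup): for
        #   <autor>Mustermann, E.</autor><autor>Musterfrau, A.</autor>
        # this returns u'Mustermann, E., Musterfrau, A.'; if no <autor> tag
        # is present, None is returned.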
        author = tag.findAll(u'autor')
        author = (author and u', '.join([
                        a.contents
                        and len(a.contents) > 0
                        and isinstance(a.contents[0], basestring)
                        and a.contents[0]
                        or u''
                        for a in author])
                  or None)
        return author



    def _getData(self):
        """
        Gets the data from the Forschungsdatenbank.

        Retrieves all publications for each faculty and the university
        hospital.

        Uses German variable names to be coherent with the XML data
        retrieved.

        @return: C{False} if an error or warning occurred, C{True} otherwise.

        """
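        # Rough flow of this method: build one request URL per faculty,
        # fetch and parse each XML response, turn every <publikation> tag
        # into an XMLEntry with a citation-style content string, and write
        # the collected entries out in batches.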

        # Template for the link to a publication's detail page; ${lfdnr} is
        # filled in with the publication's running number further below.
        url_link = (u'http://forschdb.verwaltung.uni-freiburg.de/servukl/'
                    u'forschdbukl.recherche0?xmldokumentart=Publikation&'
                    u'lfdnr=${lfdnr}&sprache=D&Layout=uni&Ausgabeart=bs&'
                    u'Rahmen=1&CSS=http://info.verwaltung.uni-freiburg.de/'
                    u'uni2002/content.css&Variante=2')


        this_year = datetime.now().year

        # Generate a list of URLs: one for each faculty from 0 to 11 and one
        # for fac=99, which is the Universitätsklinikum.
        url_list = [Template(self._url).substitute(fac=u'99',
                                                   to_year=this_year), ]
        for i in range(0, 12):
            url_list.append(
                Template(self._url).substitute(fac=i, to_year=this_year))

        retval = True


        for xml_url in url_list:
            self.logger.debug(u'Requesting URL %s' % xml_url)
            response = self._requestURL(xml_url)
            if not response:
                self._stats.messages.append(u'WARNING: No response for url'
                                            u' %s !' % xml_url)
                self._stats.status = u'W'
                retval = False
                continue
            soup = BeautifulStoneSoup(response.read())
            forschb = (soup.forschungsbericht and soup.forschungsbericht
                       or None)
            if not forschb:
                self._stats.messages.append(u'WARNING: No "Forschungsbericht"'
                                            u' for url %s !' % xml_url)
                self._stats.status = u'W'
                self.logger.warn(u'No "Forschungsbericht" !!')
                retval = False
                continue
            num_pub = 0
            publ = u''
            # Initialized so the per-faculty debug message below cannot raise
            # a NameError when a response contains no publications.
            fakultaet = u''
            self.logger.debug(u'Response XML parsed')
            for publ in forschb.findAll(u'publikation'):
                num_pub = num_pub + 1


                typ = publ[u'typ']
                lfdnr = publ[u'lfdnr']
                indruck = publ[u'indruck']
                fakultaet = publ[u'fakultaet']
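                # Map the 'indruck' attribute to a human-readable status:
                # 'Ja' -> 'in Druck', 'Begutachtung' ->
                # 'im Begutachtungsverfahren', 'Eingereicht' -> 'eingereicht';
                # anything else becomes None and is left out of the citation.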
                indruck = ((indruck == u'Ja') and u'in Druck' or
                           (indruck == u'Begutachtung')
                               and u'im Begutachtungsverfahren' or
                           (indruck == u'Eingereicht') and u'eingereicht'
                           or None)

                autor = self._extractAuthor(publ)
                titel_tag = publ.find(u'titel')
                titel = self._extractTagData(titel_tag, u'pubtitel')
                url = self._extractTagData(publ, u'url')
                abstract = self._extractTagData(publ, u'abstract')
                stichworte = [s.contents
                                  and len(s.contents) > 0
                                  and isinstance(s.contents[0], basestring)
                                  and s.contents[0]
                                  or u''
                              for s in publ.findAll(u'stichwort')]

                erscheinung = publ.find(u'erscheinung')
                herausgeber = self._extractTagData(erscheinung,
                                                   u'herausgeber')
                ort = self._extractTagData(erscheinung, u'ort')
                pjahr = self._extractTagData(erscheinung, u'jahr')
                auflage = self._extractTagData(erscheinung, u'auflage')
                verlag = self._extractTagData(erscheinung, u'name')
                tagungsname = self._extractTagData(erscheinung,
                                                   u'tagungsname')


                reihe = publ.find(u'reihe')
                if reihe:
                    reihename = self._extractTagData(reihe, u'reihename')
                    rjahr = self._extractTagData(reihe, u'reihejahrgang')


                band = publ.find(u'band')
                if band:
                    volume = self._extractTagData(band, u'volume')
                    issue = self._extractTagData(band, u'issue')
                    svon = self._extractTagData(band, u'seitevon')
                    sbis = self._extractTagData(band, u'seitebis')
                    suppl = self._extractTagData(band, u'supplement')
                    online = self._extractTagData(band, u'online')
                    online = online and (online == u'Ja') and True or None

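                # Build the citation string according to the publication
                # type; the final 'else' branch is the "Artikel" catch-all
                # described in the class docstring.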
                if typ == u'Buchbeitrag':

                    buchtitel = self._extractTagData(titel_tag, u'buchtitel')

                    content = (u'%s (%s): %s - In: %s%s%s%s%s%s%s%s%s%s%s%s' %
                               (autor or u'!NO AUTHOR!',
                                pjahr or u'o.J.',
                                titel or u'!NO TITLE!',
                                herausgeber and u'%s (Hrsg.): ' % herausgeber
                                    or u'',
                                buchtitel or u'!NO BOOKTITLE!',
                                reihe and reihename
                                    and u', (%s%s)'
                                        % (reihename,
                                           rjahr and u', Jahrg. %s' % rjahr
                                           or band and volume
                                               and u', Bd. %s' % volume
                                           or u'')
                                    or u'',
                                auflage and u', (%s. Aufl.)' % auflage or u'',
                                verlag and u', %s' % verlag or u'',
                                ort and u', %s' % ort or u', o.O.',
                                band and suppl and u', Suppl. %s' % suppl
                                    or u'',
                                band and online and u', (online)' or u'',
                                band and svon and u', S. %s' % svon or u'',
                                band and sbis and u'-%s' % sbis or u'',
                                indruck and u', (%s)' % indruck or u'',
                                url and u', %s' % url or u''))


                elif typ in [u'Monographie', u'Herausgeberschrift']:
                    content = (u'%s (%s): %s - %s%s%s%s%s%s%s%s' %
                               (autor or u'!NO AUTHOR!',
                                pjahr or u'o.J.',
                                titel or u'!NO TITLE!',
                                herausgeber and u'%s (Hrsg.): ' % herausgeber
                                    or u'',
                                reihe and reihename
                                    and u', (%s%s)'
                                        % (reihename,
                                           rjahr and u', Jahrg. %s' % rjahr
                                           or band and volume
                                               and u', Bd. %s' % volume
                                           or u'')
                                    or u'',
                                auflage and u', (%s. Aufl.)' % auflage or u'',
                                verlag and u', %s' % verlag or u'',
                                ort and u', %s' % ort or u', o.O.',
                                band and online and u', (online)' or u'',
                                indruck and u', (%s)' % indruck or u'',
                                url and u', %s' % url or u''))


                elif typ in [u'Edition', u'Uebersetzung']:

                    autor_orig = self._extractTagData(publ, u'autororiginal')

                    content = (u'%s (Hrsg.), %s (%s): %s - %s%s%s%s%s%s%s' %
                               (autor or u'!NO AUTHOR!',
                                autor_orig or u'',
                                pjahr or u'o.J.',
                                titel or u'!NO TITLE!',
                                reihe and reihename
                                    and u', (%s%s)'
                                        % (reihename,
                                           rjahr and u', Jahrg. %s' % rjahr
                                           or band and volume
                                               and u', Bd. %s' % volume
                                           or u'')
                                    or u'',
                                auflage and u', (%s. Aufl.)' % auflage or u'',
                                verlag and u', %s' % verlag or u'',
                                ort and u', %s' % ort or u', o.O.',
                                band and online and u', (online)' or u'',
                                indruck and u', (%s)' % indruck or u'',
                                url and u', %s' % url or u''))


                elif typ == u'Sonstiges':

                    zeitschrift = verlag

                    content = (u'%s (%s): %s - %s%s%s%s%s%s%s%s%s%s%s' %
                               (autor or u'!NO AUTHOR!',
                                pjahr or u'o.J.',
                                titel or u'!NO TITLE!',
                                zeitschrift or u'',
                                band and volume and u', %s' % volume or u'',
                                band and issue and u', (%s)' % issue or u'',
                                auflage and u', (%s. Aufl.)' % auflage or u'',
                                band and suppl and u', Suppl. %s' % suppl
                                    or u'',
                                tagungsname and u', (%s)' % tagungsname or u'',
                                band and online and u', (online)' or u'',
                                band and svon and u', S. %s' % svon or u'',
                                band and sbis and u'-%s' % sbis or u'',
                                indruck and u', (%s)' % indruck or u'',
                                url and u', %s' % url or u''))


                else:

                    zeitschrift = verlag

                    content = (u'%s (%s): %s - %s%s%s%s%s%s%s%s%s%s%s' %
                               (autor or u'!NO AUTHOR!',
                                pjahr or u'o.J.',
                                titel or u'!NO TITLE!',
                                herausgeber and u'%s (Hrsg.): ' % herausgeber
                                    or u'',
                                zeitschrift or u'',
                                band and volume and u', %s' % volume or u'',
                                band and issue and u', (%s)' % issue or u'',
                                band and suppl and u', Suppl. %s' % suppl
                                    or u'',
                                tagungsname and u', (%s)' % tagungsname or u'',
                                band and online and u', (online)' or u'',
                                band and svon and u', S. %s' % svon or u'',
                                band and sbis and u'-%s' % sbis or u'',
                                indruck and u', (%s)' % indruck or u'',
                                url and u', %s' % url or u''))


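                # Link the entry to its detail page (by substituting the
                # publication's lfdnr into url_link) and wrap the collected
                # data in an XMLEntry.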
                url = Template(url_link).substitute(lfdnr=lfdnr)
                entry = XMLEntry(url,
                                 titel,
                                 content,
                                 description=abstract or u'',
                                 created=pjahr,
                                 portal_type=u'_publication',
                                 tags=stichworte,
                                 sources=[self._source_name, ])
                self._entries.append(entry)


                self._stats.entries = self._stats.entries + 1
                self._stats.static_entries = self._stats.static_entries + 1


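                # Flush the entries gathered so far via _writeEntries() every
                # 1000 entries; this presumably keeps memory consumption
                # bounded (cf. the @todo in the class docstring).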
                if len(self._entries) % 1000 == 0:
                    self._writeEntries()

            self.logger.debug(u'%s publications for faculty %s' % (num_pub,
                                                                   fakultaet))

        self.logger.debug(u'Writing remaining entries')
        self._writeEntries()

        if self._stats.entries == 0:
            self.logger.warn(u'No entries!')
            self._stats.messages.append(u'WARNING: No entries! This could be'
                                        u' due to the webpage being'
                                        u' unreachable!')
            self._stats.status = u'W'
            return False

        shutil.move(self._intermediate_temp_filename, self._temp_filename)
        return retval
410