
Source Code for Module buildxml.plugins.forschdb

#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Contains a plugin that retrieves all publications from the Forschungsdatenbank
at C{http://forschdb.verwaltung.uni-freiburg.de}.


@author: Johannes Schwenk
@copyright: 2010, Johannes Schwenk
@version: 1.0
@date: 2010-09-15


"""

import sys
import shutil

# Important! Make UTF-8 the default encoding before any further imports.
reload(sys)
sys.setdefaultencoding('utf-8')

import codecs

from string import Template
from datetime import datetime

from tools.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
from xmlgetter.plugin import BaseSyncPlugin
from xmlgetter.xml import XMLEntry
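
# Illustrative sketch (not part of the original module) of the URL generation
# performed by the SyncPlugin_forschdb class below. The template string is the
# example entry from L{config.PLUGINS}; C{Template.substitute} fills in
# C{${fac}} and C{${to_year}}, yielding one URL for the Universitätsklinikum
# (fac=99) and one per faculty code 0-11:
#
#   >>> from string import Template
#   >>> from datetime import datetime
#   >>> tmpl = (u'http://forschdb.verwaltung.uni-freiburg.de/servuni/'
#   ...         u'forschdbuni.fdbfbr1?Fakultaet=${fac}&Dokumentart='
#   ...         u'Publikation&Ausgabeart=xml&Jahr=1900-${to_year}')
#   >>> urls = [Template(tmpl).substitute(fac=u'99',
#   ...                                   to_year=datetime.now().year)]
#   >>> urls += [Template(tmpl).substitute(fac=i,
#   ...                                    to_year=datetime.now().year)
#   ...          for i in range(0, 12)]
#   >>> len(urls)
#   13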


class SyncPlugin_forschdb(BaseSyncPlugin):
    """
    Finds all publications in the Forschungsdatenbank from all 11 faculties
    and the Universitätsklinikum.

    The plugin needs a template url entry in L{config.PLUGINS}::

        ...
        {u'name': u'forschdb',
         u'url':
            (u'http://forschdb.verwaltung.uni-freiburg.de/servuni/'
             u'forschdbuni.fdbfbr1?Fakultaet=${fac}&Dokumentart='
             u'Publikation&Ausgabeart=xml&Jahr=1900-${to_year}')},
        ...

    It will then replace C{${to_year}} with the current year and generate a
    list of 13 URLs, replacing C{${fac}} once with C{99} and twelve times with
    the values C{0} to C{11}. These URLs are then queried, each resulting in
    an XML document with all publication entries for the faculty C{fac} from
    the year C{1900} until now.

    The content of each C{<publikation>} entry is then parsed with
    L{BeautifulSoup} and an L{XMLEntry} is produced. The content of the
    XMLEntry is formatted according to common citation rules, which presently
    distinguish five different types of publications:

        - "Buchbeitrag"
        - "Monografie und Herausgeberschrift"
        - "Edition und Uebersetzung"
        - "Sonstiges"
        - "Artikel"

    "Artikel" is the catch-all for unknown types of publications, of which
    there are presently none.

    @todo:
        Have a look at memory consumption and optimize!

    @note:
        All authors of a publication are listed in the content of the
        L{XMLEntry} to increase findability. This does not conform to standard
        citation rules, but since the Forschungsdatenbank itself uses this
        citation style for authors, it increases coherence.

    @note:
        The method L{_getData} uses German variable names to stay consistent
        with the naming of the XML elements it processes.

    """

    def _extractTagData(self, tag, tagname=None):
        """
        Extracts data from a C{BeautifulSoup.Tag} instance.

        @param tag: The root tag from which to search for the data.
        @param tagname: The name of the tag that contains the data. If
            C{tagname} is C{None}, the data will be extracted from C{tag}
            itself.
        @type tag: BeautifulSoup.Tag
        @type tagname: string
        @return: A string representing the found data, or C{None} if no data
            could be found.

        """
        ntag = tag
        if tagname:
            ntag = ntag and ntag.find(tagname)
        # Take the first child node, if there is one.
        ntag = (ntag
                and ntag.contents
                and len(ntag.contents) > 0
                and ntag.contents[0]
                or None)
        # Only accept plain text content, not nested tags.
        if not isinstance(ntag, basestring):
            ntag = None
        return ntag
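
    # Illustrative use of _extractTagData (a sketch, not part of the original
    # module), assuming C{plugin} is an already configured
    # SyncPlugin_forschdb instance and the fragment below stands in for a
    # piece of the Forschungsdatenbank XML:
    #
    #   >>> from tools.BeautifulSoup import BeautifulStoneSoup
    #   >>> frag = BeautifulStoneSoup(
    #   ...     u'<titel><pubtitel>Ein Titel</pubtitel></titel>')
    #   >>> print plugin._extractTagData(frag.titel, u'pubtitel')
    #   Ein Titel
    #   >>> print plugin._extractTagData(frag.titel, u'buchtitel')
    #   None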

    def _extractAuthor(self, tag):
        """
        Extracts author data from a C{BeautifulSoup.Tag} instance.

        @param tag: The root tag from which to search for the data.
        @type tag: BeautifulSoup.Tag
        @return: All authors of the publication, concatenated and separated
            by C{', '}. If no author could be found, C{None} is returned.

        """
        author = tag.findAll(u'autor')

        # Should we cut the list down to 5 authors like APA citation style?
        # A matter of style and findability...
        #et_al = len(author) > 5 and u'et al.' or u''

        # Join the text content of all <autor> tags.
        author = (author and u', '.join([
                      a.contents
                      and len(a.contents) > 0
                      and isinstance(a.contents[0], basestring)
                      and a.contents[0]
                      or u''
                      for a in author])
                  or None)
        #author = author and u'%s %s' % (author, et_al)
        return author
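
    # Illustrative use of _extractAuthor (a sketch, not part of the original
    # module), again assuming C{plugin} is a configured instance:
    #
    #   >>> frag = BeautifulStoneSoup(
    #   ...     u'<publikation><autor>Meier, A.</autor>'
    #   ...     u'<autor>Schmidt, B.</autor></publikation>')
    #   >>> print plugin._extractAuthor(frag.publikation)
    #   Meier, A., Schmidt, B.
    #   >>> print plugin._extractAuthor(BeautifulStoneSoup(u'<publikation/>'))
    #   None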

    def _getData(self):
        """
        Gets the data from the Forschungsdatenbank.

        Retrieves all publications for each faculty and the university
        hospital.

        Uses German variable names to stay consistent with the XML data
        retrieved.

        @return: C{False} if an error or warning occurred, C{True} otherwise.

        """

        url_link = (u'http://forschdb.verwaltung.uni-freiburg.de/servukl/'
                    u'forschdbukl.recherche0?xmldokumentart=Publikation&'
                    u'lfdnr=${lfdnr}&sprache=D&Layout=uni&Ausgabeart=bs&'
                    u'Rahmen=1&CSS=http://info.verwaltung.uni-freiburg.de/'
                    u'uni2002/content.css&Variante=2')

        # Query up to this year.
        this_year = datetime.now().year

        # Generate a list of URLs: one for each faculty from 0 to 11 and one
        # for fac=99, which is the Universitätsklinikum.
        url_list = [Template(self._url).substitute(fac=u'99',
                                                   to_year=this_year), ]
        for i in range(0, 12):
            url_list.append(
                Template(self._url).substitute(fac=i, to_year=this_year))

        retval = True

        # Retrieve the XML for every entry in url_list.
        for xml_url in url_list:
            self.logger.debug(u'Requesting URL %s' % xml_url)
            response = self._requestURL(xml_url)
            if not response:
                self._stats.messages.append(u'WARNING: No response for url'
                                            u' %s !' % xml_url)
                self._stats.status = u'W'
                retval = False
                continue
            soup = BeautifulStoneSoup(response.read())
            forschb = (soup.forschungsbericht and soup.forschungsbericht
                       or None)
            if not forschb:
                self._stats.messages.append(u'WARNING: No "Forschungsbericht"'
                                            u' for url %s !' % xml_url)
                self._stats.status = u'W'
                self.logger.warn(u'No "Forschungsbericht" !!')
                retval = False
                continue
            num_pub = 0
            publ = u''
            self.logger.debug(u'Response XML parsed')
            for publ in forschb.findAll(u'publikation'):
                num_pub = num_pub + 1

                # Fill common variables.
                typ = publ[u'typ']  # Type of publication.
                lfdnr = publ[u'lfdnr']  # ID number of the publication.
                indruck = publ[u'indruck']  # Preprint status.
                fakultaet = publ[u'fakultaet']  # Faculty.
                indruck = ((indruck == u'Ja') and u'in Druck' or
                           (indruck == u'Begutachtung')
                           and u'im Begutachtungsverfahren' or
                           (indruck == u'Eingereicht') and u'eingereicht'
                           or None)

                autor = self._extractAuthor(publ)
                titel_tag = publ.find(u'titel')
                titel = self._extractTagData(titel_tag, u'pubtitel')
                url = self._extractTagData(publ, u'url')
                abstract = self._extractTagData(publ, u'abstract')
                stichworte = [s.contents
                              and len(s.contents) > 0
                              and isinstance(s.contents[0], basestring)
                              and s.contents[0]
                              or u''
                              for s in publ.findAll(u'stichwort')]

                erscheinung = publ.find(u'erscheinung')
                herausgeber = self._extractTagData(erscheinung,
                                                   u'herausgeber')
                ort = self._extractTagData(erscheinung, u'ort')
                pjahr = self._extractTagData(erscheinung, u'jahr')
                auflage = self._extractTagData(erscheinung, u'auflage')
                verlag = self._extractTagData(erscheinung, u'name')
                tagungsname = self._extractTagData(erscheinung,
                                                   u'tagungsname')

                # And now some variables specific to some types...

                # Reihe and/or Band
                reihe = publ.find(u'reihe')
                if reihe:
                    reihename = self._extractTagData(reihe, u'reihename')
                    rjahr = self._extractTagData(reihe, u'reihejahrgang')

                # Band
                band = publ.find(u'band')
                if band:
                    volume = self._extractTagData(band, u'volume')
                    issue = self._extractTagData(band, u'issue')
                    svon = self._extractTagData(band, u'seitevon')
                    sbis = self._extractTagData(band, u'seitebis')
                    suppl = self._extractTagData(band, u'supplement')
                    online = self._extractTagData(band, u'online')
                    online = online and (online == u'Ja') and True or None

                if typ == u'Buchbeitrag':

                    buchtitel = self._extractTagData(titel_tag, u'buchtitel')

                    content = (u'%s (%s): %s - In: %s%s%s%s%s%s%s%s%s%s%s%s' %
                               (autor or u'!NO AUTHOR!',
                                pjahr or u'o.J.',
                                titel or u'!NO TITLE!',
                                herausgeber
                                and u'%s (Hrsg.): ' % herausgeber or u'',
                                buchtitel or u'!NO BOOKTITLE!',
                                reihe and reihename
                                and u', (%s%s)'
                                % (reihename,
                                   rjahr and u', Jahrg. %s' % rjahr
                                   or band and volume
                                   and u', Bd. %s' % volume
                                   or u'') or u'',
                                auflage and u', (%s. Aufl.)' % auflage or u'',
                                verlag and u', %s' % verlag or u'',
                                ort and u', %s' % ort or u', o.O.',
                                band and suppl
                                and u', Suppl. %s' % suppl or u'',
                                band and online and u', (online)' or u'',
                                band and svon and u', S. %s' % svon or u'',
                                band and sbis and u'-%s' % sbis or u'',
                                indruck and u', (%s)' % indruck or u'',
                                url and u', %s' % url or u''))

                elif typ in [u'Monographie', u'Herausgeberschrift']:

                    content = (u'%s (%s): %s - %s%s%s%s%s%s%s%s' %
                               (autor or u'!NO AUTHOR!',
                                pjahr or u'o.J.',
                                titel or u'!NO TITLE!',
                                herausgeber
                                and u'%s (Hrsg.): ' % herausgeber or u'',
                                reihe and reihename
                                and u', (%s%s)'
                                % (reihename,
                                   rjahr and u', Jahrg. %s' % rjahr
                                   or band and volume
                                   and u', Bd. %s' % volume
                                   or u'') or u'',
                                auflage and u', (%s. Aufl.)' % auflage or u'',
                                verlag and u', %s' % verlag or u'',
                                ort and u', %s' % ort or u', o.O.',
                                band and online and u', (online)' or u'',
                                indruck and u', (%s)' % indruck or u'',
                                url and u', %s' % url or u''))

                elif typ in [u'Edition', u'Uebersetzung']:

                    autor_orig = self._extractTagData(publ, u'autororiginal')

                    content = (u'%s (Hrsg.), %s (%s): %s - %s%s%s%s%s%s%s' %
                               (autor or u'!NO AUTHOR!',
                                autor_orig or u'',
                                pjahr or u'o.J.',
                                titel or u'!NO TITLE!',
                                reihe and reihename
                                and u', (%s%s)'
                                % (reihename,
                                   rjahr and u', Jahrg. %s' % rjahr
                                   or band and volume
                                   and u', Bd. %s' % volume
                                   or u'') or u'',
                                auflage and u', (%s. Aufl.)' % auflage or u'',
                                verlag and u', %s' % verlag or u'',
                                ort and u', %s' % ort or u', o.O.',
                                band and online and u', (online)' or u'',
                                indruck and u', (%s)' % indruck or u'',
                                url and u', %s' % url or u''))

                elif typ == u'Sonstiges':

                    zeitschrift = verlag

                    content = (u'%s (%s): %s - %s%s%s%s%s%s%s%s%s%s%s' %
                               (autor or u'!NO AUTHOR!',
                                pjahr or u'o.J.',
                                titel or u'!NO TITLE!',
                                zeitschrift or u'',
                                band and volume and u', %s' % volume or u'',
                                band and issue and u', (%s)' % issue or u'',
                                auflage and u', (%s. Aufl.)' % auflage or u'',
                                band and suppl
                                and u', Suppl. %s' % suppl or u'',
                                tagungsname
                                and u', (%s)' % tagungsname or u'',
                                band and online and u', (online)' or u'',
                                band and svon and u', S. %s' % svon or u'',
                                band and sbis and u'-%s' % sbis or u'',
                                indruck and u', (%s)' % indruck or u'',
                                url and u', %s' % url or u''))

                else:  # Treat as article...

                    zeitschrift = verlag

                    content = (u'%s (%s): %s - %s%s%s%s%s%s%s%s%s%s%s' %
                               (autor or u'!NO AUTHOR!',
                                pjahr or u'o.J.',
                                titel or u'!NO TITLE!',
                                herausgeber
                                and u'%s (Hrsg.): ' % herausgeber or u'',
                                zeitschrift or u'',
                                band and volume and u', %s' % volume or u'',
                                band and issue and u', (%s)' % issue or u'',
                                band and suppl
                                and u', Suppl. %s' % suppl or u'',
                                tagungsname
                                and u', (%s)' % tagungsname or u'',
                                band and online and u', (online)' or u'',
                                band and svon and u', S. %s' % svon or u'',
                                band and sbis and u'-%s' % sbis or u'',
                                indruck and u', (%s)' % indruck or u'',
                                url and u', %s' % url or u''))

                url = Template(url_link).substitute(lfdnr=lfdnr)
                entry = XMLEntry(url,
                                 titel,
                                 content,
                                 description=abstract or u'',
                                 created=pjahr,
                                 portal_type=u'_publication',
                                 tags=stichworte,
                                 sources=[self._source_name, ])
                self._entries.append(entry)

                # Update statistics.
                self._stats.entries = self._stats.entries + 1
                self._stats.static_entries = self._stats.static_entries + 1

                # Keep the memory profile low by flushing entries to disk.
                if len(self._entries) % 1000 == 0:
                    self._writeEntries()

            self.logger.debug(u'%s publications for faculty %s' % (num_pub,
                                                                   fakultaet))

        self.logger.debug(u'Writing remaining entries')
        self._writeEntries()

        if self._stats.entries == 0:
            self.logger.warn(u'No entries!')
            self._stats.messages.append(u'WARNING: No entries! This could be'
                                        u' due to the webpage being'
                                        u' unreachable!')
            self._stats.status = u'W'
            return False

        shutil.move(self._intermediate_temp_filename, self._temp_filename)
        return retval
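

# Illustrative sketch (not part of the original module) of the citation
# assembly used in _getData. Each optional fragment relies on Python 2's
# C{x and y or z} idiom, so missing values collapse to u'' instead of
# appearing as u'None' in the citation:
#
#   >>> autor, pjahr, titel = u'Meier, A.', u'2009', u'Ein Aufsatz'
#   >>> zeitschrift, volume, svon, sbis = u'Testzeitschrift', u'12', u'1', u'9'
#   >>> issue = indruck = url = None
#   >>> print (u'%s (%s): %s - %s%s%s%s%s%s%s' %
#   ...        (autor, pjahr, titel,
#   ...         zeitschrift or u'',
#   ...         volume and u', %s' % volume or u'',
#   ...         issue and u', (%s)' % issue or u'',
#   ...         svon and u', S. %s' % svon or u'',
#   ...         sbis and u'-%s' % sbis or u'',
#   ...         indruck and u', (%s)' % indruck or u'',
#   ...         url and u', %s' % url or u''))
#   Meier, A. (2009): Ein Aufsatz - Testzeitschrift, 12, S. 1-9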