Source Code for Module buildxml.plugins.stb

#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Contains a plugin that harvests all vacancies from the Stellenbörse at
C{http://www.uni-freiburg.de/universitaet/organisation/stellenboerse}.

@author: Johannes Schwenk
@copyright: 2010, Johannes Schwenk
@version: 1.0
@date: 2010-09-15
"""

import sys
import shutil

# Important!
reload(sys)
sys.setdefaultencoding('utf-8')

from tools.BeautifulSoup import BeautifulStoneSoup
from xmlgetter.plugin import BaseSyncPlugin
from xmlgetter.xml import XMLEntry


class SyncPlugin_stb(BaseSyncPlugin):
    """
    A plugin that harvests all vacancies from the Stellenbörse at
    C{http://www.uni-freiburg.de/universitaet/organisation/stellenboerse}.

    There is no need for consolidation, so L{_getData} is the only method
    that is overridden.
    """

    def _getData(self):
        """
        Get all vacancies from the Stellenbörse.

        It loads all vacancies from the URL provided in L{config.PLUGINS} as
        XML and then fetches the details for each entry separately.
        """

        # Initialize statistics.
        self._stats.entries = 0
        self._stats.new_entries = 0

        # Request the URL with all links to vacancies ...
        response = self._requestURL(self._url)
        if not response:
            return False

        # Get all "stelle" tags ...
        soup = BeautifulStoneSoup(response.read())
        soup = soup.findAll(u'stelle')
        if not soup:
            self.logger.warn(u'No soup!')
            return False

        # ... and construct for every entry the URL to the full description
        # of the vacancy.
        for line in soup:
            url = u'http://info.verwaltung.uni-' + \
                u'freiburg.de/servuni/stellenuni.abfr1?ausgabeart=xml&' + \
                u'stellenid=%s&layout=v3' % line[u'stellenid']
            response = self._requestURL(url)
            if not response:
                self.logger.warn(u'Could not retrieve entry!')
                continue
            detail_soup = BeautifulStoneSoup(response.read())

            # Extract detailed information.
            descr = detail_soup.beschreibung and \
                detail_soup.beschreibung.string or u''
            pubdat = detail_soup.publikationsdatum and \
                detail_soup.publikationsdatum.string or u''
            creator = detail_soup.ansprechpartner and \
                detail_soup.ansprechpartner.string or u''
            kopfz = detail_soup.kopfzeile and \
                detail_soup.kopfzeile.string or u''
            kurzbeschr = detail_soup.kurzbeschreibung and \
                detail_soup.kurzbeschreibung.string or u''
            bewerba = detail_soup.bewerbungsadresse and \
                detail_soup.bewerbungsadresse.string or u''
            rueckfr = detail_soup.rueckfragen and \
                detail_soup.rueckfragen.string or u''
            rueckm = detail_soup.rueckemail and \
                detail_soup.rueckemail.string or u''
            email = detail_soup.email and detail_soup.email.string or u''
            tel = detail_soup.telefon and detail_soup.telefon.string or u''
            title = detail_soup.titel and detail_soup.titel.string or u''
            content = u'%s %s %s %s %s %s %s' \
                % (kopfz, kurzbeschr, bewerba, rueckfr, rueckm, email, tel)

            entry = XMLEntry(url, title, content, descr, created=pubdat,
                             creator=creator, portal_type=u'_stb_entry',
                             sources=[self._source_name, ])

            # Update statistics.
            self._stats.entries += 1
            self._stats.static_entries += 1

            # Write out every ten entries to keep the memory profile low.
            self._entries.append(entry)
            if len(self._entries) % 10 == 0:
                self._writeEntries()

        self.logger.debug(u'Writing remaining entries')
        self._writeEntries()

        if self._stats.entries == 0:
            self.logger.warn(u'No entries!')
            self._stats.messages.append(u'WARNING: No entries! This could be'
                                        u' due to the webpage being unreachable!')
            self._stats.status = u'W'
            return False

        shutil.move(self._intermediate_temp_filename, self._temp_filename)
        return True
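
For readers who want to try the extraction idiom in isolation, here is a minimal sketch of the "tag and tag.string or u''" fallback used in _getData. It assumes BeautifulSoup 3 is importable as BeautifulSoup.BeautifulStoneSoup (the plugin itself bundles a copy under tools.BeautifulSoup); the sample XML and the helper text_or_empty are illustrative only and are not part of the plugin or the live feed.

    # -*- coding: utf-8 -*-
    # Minimal sketch only. Assumption: BeautifulSoup 3 is installed and
    # importable as BeautifulSoup.BeautifulStoneSoup; the sample XML merely
    # mirrors the tag names read by _getData and is not real feed data.
    from BeautifulSoup import BeautifulStoneSoup

    SAMPLE_DETAIL = u'''
    <stelle stellenid="12345">
      <titel>Beispielstelle</titel>
      <beschreibung>Beispielbeschreibung</beschreibung>
      <publikationsdatum>2010-09-15</publikationsdatum>
    </stelle>
    '''

    def text_or_empty(tag):
        # Same fallback idiom as in _getData: u'' when the tag is missing
        # or has no usable string content.
        return tag and tag.string or u''

    detail_soup = BeautifulStoneSoup(SAMPLE_DETAIL)
    print text_or_empty(detail_soup.titel)                   # Beispielstelle
    print repr(text_or_empty(detail_soup.kurzbeschreibung))  # u'' (tag absent)

The idiom yields u'' not only when a tag is missing but also when its string content is empty or not a single text node, which is the behaviour _getData relies on for optional fields in a detail record.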