Package buildxml :: Package tools :: Module removeInvalidUTF8
[hide private]
[frames] | no frames]

Source Code for Module buildxml.tools.removeInvalidUTF8

  1  #!/usr/bin/python 
  2  # -*- coding: utf-8 -*- 
  3   
  4  """ 
  5  This program removes invalid UTF-8 multibyte sequences from a file. It does 
  6  so line by line. It also shrinks sequences of whitespace to one space and 
  7  replaces some invalid XHTML entities. 
  8   
  9  Usage:: 
 10  $./removeInvalidUTF8.py [options] 
 11   
 12  Command line options:: 
 13     -i <file> | --input=<file>  File with invalid utf-8 control characters 
 14     -o <file> | --output=<file> Output file 
 15     -h        | --help          This text 
 16   
 17   
 18  @author: Johannes Schwenk 
 19  @copyright: 2010, Johannes Schwenk 
 20  @version: 2.0 
 21  @date: 2010-09-15 
 22   
 23   
 24  """ 
 25   
 26   
 27   
 28  import codecs 
 29  import getopt 
 30  import sys 
 31  import re 
 32   
 33   
 34   
def main(argv):
    """
    Command line entry point.

    Reads the input and output file names from C{argv} and hands them to
    L{removeInvalidUTF8FromFile}, which does all the real work. Without
    options the files C{in.txt} and C{out.txt} are used. An unknown option
    prints the usage text and exits with status 2; C{-h}/C{--help} prints
    the usage text and exits normally.

    @param argv: Command line arguments without the program name.
    """
    try:
        parsed, _positional = getopt.getopt(
            argv, "i:o:h", ["input=", "output=", "help"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    # Defaults, used when the corresponding option is not given.
    source_path = "in.txt"
    target_path = "out.txt"

    for flag, value in parsed:
        if flag in ("-h", "--help"):
            usage()
            sys.exit()
        if flag in ("-i", "--input"):
            source_path = value
        elif flag in ("-o", "--output"):
            target_path = value

    removeInvalidUTF8FromFile(source_path, target_path)
60 61
def removeInvalidUTF8FromFile(infile, outfile):
    """
    Copy C{infile} to C{outfile}, cleaning up the text line by line.

    Both files are opened as UTF-8 with C{errors='replace'}, so an invalid
    multibyte sequence in the input is decoded as C{'\\ufffd'} instead of
    raising an error. Every line is then processed in this order:

      - sequences of two or more spaces/tabs are replaced with a single
        space, using a regular expression;
      - the invalid XHTML entity C{&#x;} is dropped;
      - the control characters U+0001 to U+0008 and U+000B to U+001F are
        removed. Tab (U+0009) and line feed (U+000A) are kept, and so is
        NUL (U+0000).

    C{outfile} is truncated as soon as it is opened, so it must not name
    the same file as C{infile}.

    @param infile: Path of the file to read.
    @param outfile: Path of the file to write.
    @raise IOError: If one of the files cannot be opened.
    """
    whitespace_run = re.compile(r'[ \t]{2,}')
    # NOTE(review): NUL (U+0000) is not part of this range, exactly as in the
    # previous character-by-character check - confirm that this is intended.
    control_chars = re.compile(r'[\x01-\x08\x0b-\x1f]')

    # The input is only read, so open it with 'r'. The former mode 'rw' is
    # not a valid mode string and is rejected outright by Python 3.
    # The with blocks close both files even if an error occurs while copying.
    with codecs.open(infile, 'r', 'UTF-8', errors='replace') as f:
        with codecs.open(outfile, 'w', 'UTF-8', errors='replace') as nf:
            for line in f:
                line = whitespace_run.sub(u' ', line)
                line = line.replace('&#x;', '')  # portal www.mw.uni... TODO
                # Strip the control characters last, in a single pass.
                nf.write(control_chars.sub('', line))
84 85 86
def version():
    """
    Displays version information for this program.

    The banner is written to standard output.
    """
    # NOTE(review): the module docstring declares "@version: 2.0" while this
    # banner says 1.0 - confirm which number is current.
    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("""=== removeInvalidUTF8.py - Version 1.0 ===

Removes invalid control characters from utf-8 encoded files.""")
95 96
def usage():
    """
    Display help on program usage and version information.

    Calls L{version} first and then writes the list of command line
    options to standard output.
    """
    version()
    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("""Usage:
$./removeInvalidUTF8.py [options]

Command line options:
   -i <file> | --input=<file>  File with invalid utf-8 control characters
   -o <file> | --output=<file> Output file
   -h        | --help          This text""")
# Script entry point: when this file is executed directly (rather than
# imported), pass the command line arguments, without the program name,
# to main().
if __name__ == "__main__":
    main(sys.argv[1:])