1
2
3
4 """
5 This program removes invalid UTF-8 multibyte sequences from a file. It does
6 so line by line. It also shrinks sequences of whitespace to one space and
7 replaces some invalid XHTML entities.
8
9 Usage::
10 $./removeInvalidUTF8.py [options]
11
12 Command line options::
13 -i <file> | --input=<file> File with invalid utf-8 control characters
14 -o <file> | --output=<file> Output file
15 -h | --help This text
16
17
18 @author: Johannes Schwenk
19 @copyright: 2010, Johannes Schwenk
20 @version: 2.0
21 @date: 2010-09-15
22
23
24 """
25
26
27
28 import codecs
29 import getopt
30 import sys
31 import re
32
33
34
def main(argv):
    """
    Entry point for command line use.

    Reads the C{-i}/C{--input}, C{-o}/C{--output} and C{-h}/C{--help}
    options from C{argv} and then hands the selected file names to
    L{removeInvalidUTF8FromFile}, which does all the real work.

    An unknown option prints the usage text and exits with status 2;
    C{-h}/C{--help} prints the usage text and exits normally.

    @param argv: Command line arguments without the program name.

    """
    try:
        parsed, _positional = getopt.getopt(
            argv, "i:o:h", ["input=", "output=", "help"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    # Defaults used when the corresponding option is not given.
    source_path = "in.txt"
    target_path = "out.txt"

    for flag, value in parsed:
        if flag in ("-h", "--help"):
            usage()
            sys.exit()
        if flag in ("-i", "--input"):
            source_path = value
        elif flag in ("-o", "--output"):
            target_path = value

    removeInvalidUTF8FromFile(source_path, target_path)
60
61
def removeInvalidUTF8FromFile(infile, outfile):
    """
    Copy C{infile} to C{outfile} line by line, cleaning each line on the way.

    Both files are opened as UTF-8 with C{errors='replace'}, so an invalid
    multibyte sequence in the input is decoded as the replacement character
    U+FFFD instead of raising an error. Every line is then processed in
    this order:

      1. runs of two or more spaces and/or tabs are replaced with a single
         space,
      2. the empty XHTML character reference C{&#x;} is removed,
      3. the control characters U+0001-U+0008 and U+000B-U+001F are removed.

    Tab (U+0009), line feed (U+000A) and NUL (U+0000) are left untouched.
    Carriage return (U+000D) lies in the removed range, so CRLF line
    endings come out as LF.

    @param infile: Path of the file to read.
    @param outfile: Path of the file to write; created or truncated.

    """
    whitespace_run = re.compile(r'[ \t]{2,}')
    control_chars = re.compile(r'[\x01-\x08\x0b-\x1f]')

    # Plain 'r' is the portable read mode; 'rw' is not a valid mode string.
    # The with-blocks close both files even if processing fails part-way.
    with codecs.open(infile, 'r', 'UTF-8', errors='replace') as source:
        with codecs.open(outfile, 'w', 'UTF-8', errors='replace') as target:
            for line in source:
                line = whitespace_run.sub(u' ', line)
                line = line.replace('&#x;', '')
                target.write(control_chars.sub('', line))
84
85
86
def version():
    """
    Print the program banner: name, version and a one-line description.

    """
    # Version number follows the @version field of the module docstring.
    # The parenthesized single-argument form prints identically on
    # Python 2 and Python 3.
    print("=== removeInvalidUTF8.py - Version 2.0 ===\n"
          "\n"
          "Removes invalid control characters from utf-8 encoded files.")
95
96
def usage():
    """
    Display help on program usage and version information.

    Prints the version banner first (via L{version}) and then the list of
    supported command line options.

    """
    version()
    # The parenthesized single-argument form prints identically on
    # Python 2 and Python 3.
    print("Usage:\n"
          "$./removeInvalidUTF8.py [options]\n"
          "\n"
          "Command line options:\n"
          "-i <file> | --input=<file> File with invalid utf-8 control characters\n"
          "-o <file> | --output=<file> Output file\n"
          "-h | --help This text")
111
112
113
if __name__ == "__main__":
    # Run only when executed as a script; drop the program name so that
    # main() receives just the option arguments.
    main(sys.argv[1:])
116