Attachment 'encdet.py'
Download
Toggle line numbers
1 #!/usr/bin/env python
2 # -*- encoding: japanese.ms932 -*-
3
4 # encdet.py - An encoding detector
5 # by Yusuke Shinyama
6 # * public domain *
7
8 import sys, re
9
10
11 ## EncodingRecognizer
12 ## - a finite automaton which receives octets
13 ##
class EncodingRecognizer:
    """Base class of the per-encoding automata that score a byte stream.

    Subclasses implement ``feed(c)``, which receives one octet (an int) at a
    time and calls ``accept()`` once a complete character has been collected,
    or ``die()`` when the octet is illegal in that encoding.

    State convention used throughout the file:
      0  -- the recognizer has given up and ignores further input
      1  -- waiting for the first byte of a character
      >1 -- in the middle of a multi-byte character

    Scoring: consecutive characters of the same kind form a "chunk" whose
    scores are summed in ``partial_score``; when the chunk ends its *square*
    is added to ``total_score``, so long homogeneous runs are rewarded.
    """

    SCORE_DEFAULT = 0.5        # score of a character matching no CHARSET entry
    DEATH_PENALTY = -100       # added to total_score on every illegal octet
    GIVEUP_THRESHOLD = -1000   # at or below this total the recognizer stops

    # character sets: must be exclusive!
    # Each entry is (score per character, pattern, chunk flags).  Characters
    # whose flags share a bit are counted as belonging to the same chunk.
    #
    # The Japanese ranges are spelled with \u escapes so that they do not
    # depend on the source file's encoding declaration.
    # NOTE(review): the original cp932 literals were damaged; the ranges
    # below are reconstructed from the surviving trail bytes.  The upper
    # bound of the katakana range and the lowercase full-width bounds were
    # lost completely and are assumed (U+30F6, U+FF41-U+FF5A) -- confirm
    # against the upstream file.
    CHARSET = [
        # zenkaku-kana
        (1.5, re.compile(u"[\u3041-\u3093]"), 0x01),    # hiragana
        (1.5, re.compile(u"[\u30a1-\u30f6]"), 0x02),    # katakana
        # prolonged sound mark and kana iteration marks; flag 0x03 lets
        # them continue either a hiragana or a katakana chunk
        (1.0, re.compile(u"[\u30fc\u30fd\u30fe\u309d\u309e]"), 0x03),

        # hankaku latin
        (1.2, re.compile(u"[a-zA-Z0-9]"), 0x04),
        (0.0, re.compile(u"[\u00c0-\u00ff]"), 0x04),

        # hankaku-kana
        (0.8, re.compile(u"[\uff66-\uff9d]"), 0x08),

        # zenkaku-alphanum
        (1.2, re.compile(u"[\uff21-\uff3a\uff41-\uff5a\uff10-\uff19]"), 0x10),

        # kanji
        (1.0, re.compile(u"[\u4e00-\u9fff]"), 0x20),
    ]

    def __init__(self, encoding):
        """Create a recognizer that decodes characters with codec *encoding*."""
        self.encoding = encoding
        self.ch = ""               # bytes of the character being collected
        self.state = 1
        self.partial_score = 0.0   # score of the chunk in progress
        self.total_score = 0.0     # accumulated score of finished chunks
        self.chunk_type = 0        # CHARSET flags of the chunk in progress
        return

    def __repr__(self):
        # Both scores are formatted with %d, so fractions are truncated.
        return "<EncodingRecognizer: %s, state=%d, chunk_type=%s, partial_score=%d, total_score=%d>" % \
               (self.encoding, self.state, self.chunk_type, self.partial_score, self.total_score)

    def die(self):
        """Penalize an illegal octet; give up for good once the total is too low."""
        self.total_score += self.DEATH_PENALTY
        if self.total_score <= self.GIVEUP_THRESHOLD:
            # game is over...
            self.state = 0
        else:
            # try again, starting over at a character boundary
            self.state = 1
        self.partial_score = 0.0
        self.ch = ""
        return

    def flush(self):
        """Close the current chunk: add the square of its score to the total."""
        self.total_score += self.partial_score * self.partial_score
        self.partial_score = 0.0
        return

    def accept(self, s):
        """Score one complete character.

        *s* is the byte string of a single character in ``self.encoding``.
        A string that cannot be decoded is treated as an unclassified
        character (it gets SCORE_DEFAULT), not as a fatal error.
        """
        try:
            # For byte strings this is the same as unicode(s, encoding).
            c = s.decode(self.encoding)
        except UnicodeError:
            c = u""
        for (score, pat, flags) in self.CHARSET:
            if pat.match(c):
                # A new chunk starts when no chunk is open or when the
                # character kind shares no flag bit with the open chunk.
                if self.chunk_type == 0 or not (self.chunk_type & flags):
                    self.flush()
                    self.chunk_type = flags
                self.partial_score += score
                break
        else:
            # no pattern matched: close the chunk and count a default score
            self.flush()
            self.chunk_type = 0
            self.partial_score += self.SCORE_DEFAULT
        return

    def finish(self):
        """Signal end of input: close the last chunk and penalize a truncated character."""
        self.flush()
        if 1 < self.state:
            self.die()
        return
97
98
99 ## CHARACTER SETS
100
101
102 ## ISO-8859-*
103 ##
class ISO8859_Recognizer(EncodingRecognizer):
    """Recognizer for single-byte text, decoded with the iso8859_1 codec."""

    def __init__(self):
        EncodingRecognizer.__init__(self, "iso8859_1")

    def feed(self, c):
        """Consume one octet *c* (an int).

        Octets below 0x7f and octets in 0xa0-0xff are scored as characters;
        anything else (0x7f-0x9f) is penalized through die().
        """
        # State 1 is the only live state of this automaton; state 0 means
        # the recognizer has already given up.
        if self.state != 1:
            return

        legal = c < 0x7f or 0xa0 <= c <= 0xff
        if not legal:
            self.die()
            return

        self.state = 1
        self.accept(chr(c))
        return
122
123
124 ## EUC-JP
125 ##
class EUCJP_Recognizer(EncodingRecognizer):
    """Recognizer for EUC-JP: ASCII plus two-byte characters."""

    def __init__(self):
        # True while the pending two-byte character is a half-width kana.
        # Nothing sets it to True at present (see feed).
        self.hankaku = False
        EncodingRecognizer.__init__(self, "japanese.euc_jp")

    def feed(self, c):
        """Consume one octet *c* (an int) and advance the automaton."""
        state = self.state
        if state == 0:
            # already dead
            return

        if state == 1:
            # first byte of a character
            if c < 0x7f:
                # ascii: a complete character on its own
                self.state = 1
                self.accept(chr(c))
                self.ch = ""
            elif 0xa1 <= c <= 0xfe:
                # first byte of a two-byte (kanji) character
                self.state = 2
                self.ch = chr(c)
                self.hankaku = False
            else:
                # The half-width kana lead byte 0x8e is deliberately NOT
                # handled (original note: "IGNORE EUC-JP hankaku chars, no
                # one is using"), so it is penalized like any other
                # illegal octet.
                self.die()
            return

        if state == 2:
            # second byte: the legal range depends on the kind of character
            if self.hankaku:
                upper = 0xdf
            else:
                upper = 0xfe
            if 0xa1 <= c <= upper:
                self.ch += chr(c)
                self.accept(self.ch)
                self.state = 1
                self.ch = ""
            else:
                self.die()

        return
175
176
177 ## CP932
178 ##
class CP932_Recognizer(EncodingRecognizer):
    """Recognizer for CP932: ASCII, one-byte kana and two-byte characters."""

    def __init__(self):
        EncodingRecognizer.__init__(self, "japanese.ms932")

    def feed(self, c):
        """Consume one octet *c* (an int) and advance the automaton."""
        if self.state == 0:
            # already dead
            return

        if self.state == 1:
            # first byte of a character
            if c < 0x7f or 0xa1 <= c <= 0xdf:
                # ascii or half-width kana: a complete one-byte character
                self.state = 1
                self.accept(chr(c))
                self.ch = ""
            elif 0x81 <= c <= 0x9f or 0xe0 <= c <= 0xee or 0xfa <= c <= 0xfc:
                # first byte of a two-byte (kanji) character
                self.state = 2
                self.ch = chr(c)
            else:
                self.die()

        elif self.state == 2:
            # second byte: 0x40-0xfc with the exception of 0x7f
            if c == 0x7f or not (0x40 <= c <= 0xfc):
                self.die()
            else:
                self.accept(self.ch + chr(c))
                self.state = 1
                self.ch = ""

        return
219
220
221 ## UTF-8
222 ##
class UTF8_Recognizer(EncodingRecognizer):
    """Recognizer for UTF-8 sequences of one to four bytes.

    Only lead bytes that can start a well-formed sequence under RFC 3629
    are accepted: 0xC2-0xDF (2 bytes), 0xE0-0xEF (3 bytes) and 0xF0-0xF4
    (4 bytes).  The obsolete 5-byte form (0xF8-0xFB) and the lead bytes
    0xC0, 0xC1 and 0xF5-0xF7, which never occur in valid UTF-8, are
    penalized through die() instead of being scored as characters.
    """

    def __init__(self):
        self.left = 0   # continuation bytes still expected
        return EncodingRecognizer.__init__(self, "utf8")

    def feed(self, c):
        """Consume one octet *c* (an int) and advance the automaton."""
        if self.state == 0:
            # already dead
            return

        if self.state == 1:
            # first byte of a character
            if c <= 0x7f:
                # 0xxxxxxx: a complete one-byte character
                self.state = 1
                self.accept(chr(c))
                self.ch = ""
                return

            if 0xc2 <= c <= 0xdf:      # 110xxxxx: 2 bytes
                left = 1
            elif 0xe0 <= c <= 0xef:    # 1110xxxx: 3 bytes
                left = 2
            elif 0xf0 <= c <= 0xf4:    # 11110xxx: 4 bytes
                left = 3
            else:
                left = 0               # not a legal lead byte

            if left:
                self.state = 2
                self.left = left
                self.ch = chr(c)
            else:
                self.die()
            return

        # n-th byte (where 2 <= n)
        if c & 0xc0 == 0x80:
            # 10xxxxxx: continuation byte
            self.state += 1
            self.left -= 1
            self.ch += chr(c)
            if self.left == 0:
                # sequence complete
                self.state = 1
                self.accept(self.ch)
                self.ch = ""
        else:
            self.die()

        return
281
282
283 # guess
def guess(s):
    """Return the codec name of the recognizer that scores highest on *s*.

    Every octet of the byte string *s* is fed to all recognizers.  On equal
    scores the recognizer listed first wins.
    """
    candidates = [
        EUCJP_Recognizer(),
        CP932_Recognizer(),
        ISO8859_Recognizer(),
        UTF8_Recognizer(),
    ]

    for ch in s:
        octet = ord(ch)
        for candidate in candidates:
            candidate.feed(octet)

    # Pick the first recognizer with the maximal score; this is the element
    # a stable descending sort would put in front.
    best = None
    for candidate in candidates:
        candidate.finish()
        if best is None or best.total_score < candidate.total_score:
            best = candidate
    return best.encoding
299
300 # test suite
def test(s0, test_encodings):
    """Encode the unicode string *s0* in each codec and print what guess() says.

    Codecs that cannot encode *s0* are skipped.  For every other known
    codec that happens to decode the same bytes to plain ASCII, a
    "could be" line is printed as well.
    """
    false_encodings = ["japanese.euc_jp", "japanese.ms932", "utf8", "iso8859_1"]

    for enc1 in test_encodings:
        try:
            s = s0.encode(enc1)
        except UnicodeError:
            continue

        hexdump = " ".join(["%02x" % ord(c) for c in s])
        print("try '%s' in %s (%s)" % (s0, enc1, hexdump))

        for enc2 in false_encodings:
            if enc1 == enc2:
                continue
            try:
                # str() fails for anything outside ASCII, so only
                # ASCII-clean alternative readings are reported.
                x = str(unicode(s, enc2))
                print(" (could be: '%s' in %s)" % (x, enc2))
            except UnicodeError:
                pass

        genc = guess(s)
        if genc == enc1:
            print(" CORRECT: %s" % genc)
        else:
            print(" ! INCORRECT: %s" % genc)
        print("")
    return
323
def test_suite():
    """Run test() on a handful of sample strings.

    The samples are written with \\u escapes so that they do not depend on
    the encoding declaration of this source file.
    NOTE(review): the original cp932 literals were damaged; the strings
    below are reconstructed from the surviving trail bytes -- confirm
    against the upstream file.
    """
    # kana only ("konnichiwa" in hiragana)
    test(u"\u3053\u3093\u306b\u3061\u306f",
         ["japanese.euc_jp", "japanese.ms932", "utf8"])
    # kana + alphanum
    test(u"A\u306fB\u3068C\u3067\u3042\u308b",
         ["japanese.euc_jp", "japanese.ms932", "utf8"])
    # kana + kanji
    # NOTE(review): the original string had one more leading character that
    # could not be recovered (cp932 trail byte 0xa9); it is omitted here.
    test(u"\u65b0\u805e\u30cb\u30e5\u30fc\u30b9",
         ["japanese.euc_jp", "japanese.ms932", "utf8"])
    # kanji + hankakukana
    # NOTE(review): the small half-width "yu" (U+FF6D, cp932 0xad) is not
    # visible in the damaged literal and is assumed.
    test(u"\u7121\u984c\uff84\uff9e\uff77\uff6d\uff92\uff9d\uff84",
         ["japanese.ms932", "utf8"])
    # iso8859-1
    test(u"Enzyklop\u00e4die", ["utf8", "iso8859_1"])
    return
336
# main
# The self-test used to run unconditionally at module level and then called
# sys.exit(0), which terminated any program importing this module and made
# the filter loop below unreachable.  It now runs only on request.
if __name__ == "__main__":
    if sys.argv[1:2] == ["--test"]:
        test_suite()
        sys.exit(0)
    # Filter mode: print the guessed codec name for every input line
    # (files named on the command line, or standard input).
    import fileinput
    for s in fileinput.input():
        print(guess(s))
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily. You are not allowed to attach a file to this page.