提取文档中的图像标识信息
Shuguang Yang <[email protected]> reply-to [email protected] to python-cn`CPyUG`华蟒用户组 <[email protected]> date Wed, Aug 20, 2008 at 15:50 subject [CPyUG:62863] 用python提取文档中的图象,图象的标识信息
应用 Python 解决一些实际问题:
提取嵌入在文档中的图像部分
"""Extract images embedded in an arbitrary (binary) document.

The document is scanned for the byte signatures of JPEG/JFIF, GIF and PNG
files.  Every hit is treated as the start of an image, and the bytes from
that position up to the next hit (or the end of the file for the last hit)
are written to ``imgnameNN.<ext>``.  This is a heuristic: a signature that
merely happens to occur in the document's own data also produces a file,
and trailing non-image bytes before the next hit are included in the image.
"""
import sys
import os
import string  # not used below; retained so the existing import list stays intact

# (signature, offset, extension).  ``offset`` is how many bytes of the image
# file precede the signature, i.e. the signature's position inside the image:
#   JFIF: 6 -- FF D8 (SOI) + FF E0 (APP0 marker) + 2-byte segment length
#   GIF:  0 -- the file begins with "GIF87a" / "GIF89a"
#   PNG:  1 -- the file begins with the byte 0x89 followed by "PNG"
# Subtracting the offset from the signature position gives the image start.
headers = [(b'JFIF', 6, 'jpg'), (b'GIF', 0, 'gif'), (b'PNG', 1, 'png')]
filename = '/path/to/a/file'


def find_markers(fid):
    """Scan a binary file object for image signatures.

    Args:
        fid: file-like object opened in binary mode; it is iterated from
            its current position line by line.

    Returns:
        A ``(marker, size)`` tuple.  ``marker`` is a list of
        ``(start_position, extension)`` tuples sorted by position and
        ``size`` is the total number of bytes read.
    """
    marker = []
    s = 0  # number of bytes consumed before the current line
    for line in fid:
        for flag, offset, ext in headers:
            # find() returns -1 when absent, so 0 is a valid hit; keep
            # searching so that several images on one "line" are all seen.
            index = line.find(flag)
            while index >= 0:
                pos = s + index - offset
                # A hit closer to the file start than its offset cannot be
                # a real image header (and would make seek() fail).
                if pos >= 0:
                    marker.append((pos, ext))
                index = line.find(flag, index + 1)
        s += len(line)
    # Hits are collected per signature, not in file order; the extraction
    # step needs ascending positions to compute non-negative gaps.
    marker.sort()
    return marker, s


def extract_images(fid, marker, end, outdir=''):
    """Write every marked span of ``fid`` to its own image file.

    Args:
        fid: seekable binary file object the markers refer to.
        marker: sorted ``(start_position, extension)`` list as returned by
            ``find_markers``.
        end: position at which the last image stops (the file size).
        outdir: directory for the output files; the default writes to the
            current working directory.

    Returns:
        The number of image files written.
    """
    imgnum = 0
    for i, (thispos, thisext) in enumerate(marker):
        # An image runs until the next marker, the last one until ``end``.
        nextpos = marker[i + 1][0] if i + 1 < len(marker) else end
        fid.seek(thispos)
        data = fid.read(nextpos - thispos)
        imgname = 'imgname%02d.%s' % (i, thisext)
        with open(os.path.join(outdir, imgname), 'wb') as fid1:
            fid1.write(data)
        imgnum += 1
    return imgnum


def main(path=None):
    """Extract all images from ``path`` and report how many were written.

    ``path`` defaults to the first command-line argument and, failing
    that, to the module-level ``filename``.  Exits with status 1 when the
    file cannot be opened or contains no image signature.
    """
    if path is None:
        path = sys.argv[1] if len(sys.argv) > 1 else filename
    try:
        fid = open(path, 'rb')
    except IOError as err:
        sys.stderr.write('Cannot open %s: %s\n' % (path, err))
        sys.exit(1)

    with fid:
        marker, size = find_markers(fid)
        if not marker:
            print('No images included in this document')
            sys.exit(1)
        imgnum = extract_images(fid, marker, size)

    print('%02d images have been extracted' % imgnum)


if __name__ == '__main__':
    main()
反馈
创建 by -- ZoomQuiet [DateTime(2008-08-20T07:55:11Z)]