⇤ ← Revision 1 as of 2008-08-20 07:55:11
Size: 2007
Comment:
|
Size: 1915
Comment:
|
Deletions are marked like this. | Additions are marked like this. |
Line 29: | Line 29: |
# headers 中的offset为什么分别是6, 0, 1,是标识前面的数据长度吗?是什么数据? | |
Line 31: | Line 30: |
filename = '/path/to/a/file' | filename = '/path/to/your/file' |
Line 49: | Line 48: |
j = len(marker) | |
Line 51: | Line 49: |
if j == 0: | if len(marker) == 0: |
Line 55: | Line 53: |
for i in range(j): info = marker[i] |
for info in marker: |
Line 59: | Line 56: |
if i == j-1: | index = marker.index(info) try: nextinfo = marker[index + 1] nextpos = nextinfo[0] gap = nextpos - thispos except IndexError: |
Line 61: | Line 63: |
gap = nextpos - thispos else: nextinfo = marker[i+1] nextpos = nextinfo[0] |
|
Line 68: | Line 66: |
imgname = 'imgname%02d.%s' % (i, thisext) | imgname = 'imgname%02d.%s' % (index, thisext) |
Line 75: | Line 73: |
print '%02d images have been extracted' % imgnum | print '%02d images have been extracted' % imgnum |
提取文档中的图象标识信息
Shuguang Yang <[email protected]> reply-to [email protected] to python-cn`CPyUG`华蟒用户组 <[email protected]> date Wed, Aug 20, 2008 at 15:50 subject [CPyUG:62863] 用python提取文档中的图象,图象的标识信息
应用 Python 解决一些实际问题:提取嵌入在文档中的图像部分
import os
import string
import sys

# Each entry is (signature, offset, extension):
#   signature -- byte string searched for in the document,
#   offset    -- number of bytes between the start of the image and the
#                signature, i.e. how far to step back from a match,
#   extension -- file extension used for the extracted image.
# The offsets follow from the file formats themselves:
#   JPEG/JFIF: FF D8 FF E0 <2-byte length> 'JFIF' -> signature at byte 6
#   GIF:       'GIF87a' / 'GIF89a'                -> signature at byte 0
#   PNG:       89 'P' 'N' 'G' 0D 0A 1A 0A         -> signature at byte 1
headers = [(b'JFIF', 6, 'jpg'), (b'GIF', 0, 'gif'), (b'PNG', 1, 'png')]
filename = '/path/to/your/file'


def _find_markers(fid):
    """Scan the binary file object *fid* for image signatures.

    The file is read "line" by "line" (split on newline bytes) and, as in
    the original script, only the first occurrence of each signature per
    line is recorded.

    Returns a tuple ``(marker, size)`` where ``marker`` is a list of
    ``(start_position, extension)`` tuples sorted by position and ``size``
    is the total number of bytes read.
    """
    marker = []
    s = 0  # number of bytes consumed before the current line
    for line in fid:
        for flag, offset, ext in headers:
            index = line.find(flag)
            # find() reports "not found" as -1; 0 is a valid match at the
            # very start of the line and must not be discarded.
            if index < 0:
                continue
            pos = s + index - offset
            # A signature closer to the start of the file than its own
            # offset cannot belong to a real image header.
            if pos >= 0:
                marker.append((pos, ext))
        s += len(line)
    # Matches are collected in `headers` order within a line; sort them so
    # that each image ends where the next one in the file begins.
    marker.sort()
    return marker, s


def _extract_images(fid, marker, size, outdir=''):
    """Write every span described by *marker* to its own image file.

    Each image runs from its own start position up to the start of the
    next marker; the last one runs up to *size* (the end of the document).
    Files are named ``imgnameNN.<ext>`` and created inside *outdir* (the
    current directory by default).  Returns the number of files written.
    """
    imgnum = 0
    for index, (thispos, thisext) in enumerate(marker):
        if index + 1 < len(marker):
            nextpos = marker[index + 1][0]
        else:
            nextpos = size
        fid.seek(thispos)
        data = fid.read(nextpos - thispos)
        imgname = 'imgname%02d.%s' % (index, thisext)
        with open(os.path.join(outdir, imgname), 'wb') as fid1:
            fid1.write(data)
        imgnum += 1
    return imgnum


def main(path=filename, outdir=''):
    """Extract the images embedded in the document at *path*.

    Exits with status 1 when the document cannot be opened or contains no
    recognizable image signature.  Otherwise prints a summary line and
    returns the number of images written to *outdir*.
    """
    try:
        fid = open(path, 'rb')
    except IOError as err:
        sys.stderr.write('Cannot open %s: %s\n' % (path, err))
        sys.exit(1)

    # `with` guarantees the document is closed even on the sys.exit() path.
    with fid:
        marker, size = _find_markers(fid)
        if not marker:
            print('No images included in this document')
            sys.exit(1)
        imgnum = _extract_images(fid, marker, size, outdir)

    print('%02d images have been extracted' % imgnum)
    return imgnum


if __name__ == '__main__':
    main()
反馈
创建 by -- ZoomQuiet [DateTime(2008-08-20T07:55:11Z)]