|
Size: 2207
Comment:
|
Size: 2630
Comment:
|
| Deletions are marked like this. | Additions are marked like this. |
| Line 3: | Line 3: |
| ''' 含有章节索引的 *PUG 文章通用模板 ''' ::-- ["hoxide"] [[[DateTime(2006-04-29T09:12:35Z)]]] [[TableOfContents]] | ''' 含有章节索引的 *PUG 文章通用模板 ''' ::-- ["jigloo"] [[TableOfContents]] |
| Line 9: | Line 9: |
| ''简述'' 网易blog的相册比原来的网易相册复杂许多,一层一层的javascript分析头都大了,所以就用pywin32调用IE来干这个活了。 | ''简述'' 网易blog的相册比原来的网易相册复杂许多,一层一层的javascript分析头都大了,所以就用pywin32调用IE来干这个活了。又改了改,速度快了点。 |
| Line 20: | Line 20: |
| def __init__(self, name, nextword=u'下一页'): | def __init__(self, name, nextword=u'下一页', percount=8): |
| Line 23: | Line 23: |
| self.percount = percount self.curpage = 1 self.pages = 100 #we will modify it's value in self.connect() |
|
| Line 24: | Line 27: |
| def __index__(self): | def __home__(self): |
| Line 26: | Line 30: |
| def __indexloaded__(self): return any([True for x in self.ie.Document.links if x.href.find('prev')] >= 0) def visible(self): self.ie.Visible = True |
def __pageloaded__(self): return self.curpage >= self.pages and True or len(self.imgs_href()) >= self.percount def __pages__(self): url = [x.href for x in self.ie.Document.links if x.href.find('.blog.163.com/album/#p') >= 0][-2:][0] return int(url[url.find('#p')+2:]) def visible(self, v=True): self.ie.Visible = v |
| Line 31: | Line 41: |
| Line 32: | Line 43: |
| self.ie.Navigate2(self.__index__()) | self.ie.Navigate2(self.__home__()) |
| Line 36: | Line 47: |
| while not self.__indexloaded__(): | while not self.__pageloaded__(): |
| Line 38: | Line 49: |
| self.pages = self.__pages__() | |
| Line 39: | Line 51: |
| Line 40: | Line 53: |
| link = ([x for x in self.ie.Document.links if x.innerText.find(self.nextword) >= 0])[0] | self.curpage += 1 if self.curpage > self.pages: return False link = [x for x in self.ie.Document.links if x.innerText.find(self.nextword) >= 0][0] |
| Line 42: | Line 58: |
| time.sleep(2) return self |
while not self.__pageloaded__(): time.sleep(1) return True |
| Line 47: | Line 65: |
| return [urlconv(x.href) for x in self.ie.Document.links if x.href.find(u'prevPhoto') >=0] | return [urlconv(u) for u in set([x.href for x in self.ie.Document.links if x.href.find(u'prevPhoto') >=0])] |
| Line 50: | Line 69: |
| imgurls = [] ab = Album163blog(name, u'下一页') |
#url = 'http://dwl2981332.blog.163.com/album/#p1' ab = Album163blog(name) |
| Line 53: | Line 72: |
| Line 55: | Line 75: |
| if len(links) == 0: | print '\n'.join(links) if not ab.next(): |
| Line 57: | Line 78: |
| print '\n'.join(links) ab.next() |
|
| Line 60: | Line 79: |
| == 提醒 == 如果地址导出不完全的话是网速较慢引起的,增加程序中翻页函数(next)的sleep时间即可。 == 反馈 == |
含有章节索引的 *PUG 文章通用模板 ::-- ["jigloo"] TableOfContents
1. 导出网易博客相册的图片地址
简述 网易blog的相册比原来的网易相册复杂许多,一层一层的javascript分析头都大了,所以就用pywin32调用IE来干这个活了。又改了改,速度快了点。
1.1. 代码
# coding:cp936
import win32com.client
import time
import re
import sys
class Album163blog(object):
    """Collect photo download URLs from a 163.com blog album by driving IE.

    The album pages are rendered by JavaScript, so an Internet Explorer
    instance is automated through COM (pywin32) and the links are read from
    the live document.

    Attributes:
        name: blog name, i.e. the sub-domain of ``<name>.blog.163.com``.
        nextword: text looked for inside a link to recognise "next page".
        percount: number of distinct photo links a non-final page must show
            before it is treated as loaded.
        timeout: upper bound in seconds for each wait; ``None`` waits forever.
        curpage: 1-based index of the page currently shown.
        pages: total page count; a placeholder until ``connect()`` sets it.
        ie: the ``InternetExplorer.Application`` COM object.
    """

    # Value of IE's ReadyState property once a document is fully loaded.
    READYSTATE_COMPLETE = 4

    def __init__(self, name, nextword=u'下一页', percount=8, timeout=60):
        self.name = name
        self.nextword = nextword
        self.percount = percount
        self.timeout = timeout
        self.curpage = 1
        self.pages = 100  # placeholder; its value is replaced in connect()
        self.ie = win32com.client.Dispatch('InternetExplorer.Application')

    def _wait_until(self, ready, poll=1):
        """Poll ``ready()`` every ``poll`` seconds until true or timed out.

        Returns True if ``ready()`` became true, False if ``self.timeout``
        seconds elapsed first. Never sleeps when ``ready()`` is already true.
        """
        deadline = None
        if self.timeout is not None:
            deadline = time.time() + self.timeout
        while not ready():
            if deadline is not None and time.time() >= deadline:
                return False
            time.sleep(poll)
        return True

    def __home__(self):
        """Return the URL of the album index for ``self.name``."""
        return 'http://%s.blog.163.com/album/' % self.name

    def __pageloaded__(self):
        """Return True when the current page counts as loaded.

        The final page may hold fewer than ``percount`` photos, so it is
        accepted unconditionally; any other page must expose at least
        ``percount`` distinct photo links.
        """
        if self.curpage >= self.pages:
            return True
        return len(self.imgs_href()) >= self.percount

    def __pages__(self):
        """Return the total number of album pages read from the pager links.

        The second-to-last ``#p<N>`` link is used (or the only one when there
        is just one) -- presumably the last pager entry is the "next page"
        link; verify against the live page. An album without any pager link
        is treated as a single page instead of raising IndexError.
        """
        pager = [x.href for x in self.ie.Document.links
                 if x.href.find('.blog.163.com/album/#p') >= 0]
        if not pager:
            return 1
        url = pager[-2:][0]
        return int(url[url.find('#p') + 2:])

    def visible(self, v=True):
        """Show or hide the IE window; returns ``self`` for chaining."""
        self.ie.Visible = v
        return self

    def connect(self):
        """Open the album index, wait for it to load and read the page count.

        Returns ``self`` for chaining. Each wait gives up after
        ``self.timeout`` seconds and processing continues with whatever the
        document contains at that point.
        """
        self.ie.Navigate2(self.__home__())
        time.sleep(1)  # give IE a moment to start the navigation
        # Keep waiting while IE is busy OR the document is not complete;
        # requiring both at once would end the wait too early.
        self._wait_until(
            lambda: not self.ie.Busy
            and self.ie.ReadyState == self.READYSTATE_COMPLETE)
        self._wait_until(self.__pageloaded__)
        self.pages = self.__pages__()
        return self

    def next(self):
        """Advance to the next album page.

        Returns True after the "next page" link was clicked and the wait for
        the new page finished, False when already on the last page or when no
        link containing ``nextword`` exists. ``curpage`` is only incremented
        when a page turn actually happens.
        """
        if self.curpage >= self.pages:
            return False
        candidates = [x for x in self.ie.Document.links
                      if x.innerText.find(self.nextword) >= 0]
        if not candidates:
            return False
        self.curpage += 1
        candidates[0].click()
        # NOTE(review): if the previous page's photo links are still in the
        # document right after click(), this check can pass before the new
        # page is rendered -- confirm against the live site.
        self._wait_until(self.__pageloaded__)
        return True

    def imgs_href(self):
        """Return download URLs of the photos linked from the current page.

        Every distinct link whose href contains ``prevPhoto`` is rewritten
        from ``prevPhoto.do?`` to ``prevPhDownload.do?host=<name>&``.
        Duplicates are dropped and document order is preserved.
        """
        replacement = 'prevPhDownload.do?host=%s&' % self.name
        seen = set()
        urls = []
        for link in self.ie.Document.links:
            href = link.href
            if href.find(u'prevPhoto') < 0 or href in seen:
                continue
            seen.add(href)
            urls.append(href.replace('prevPhoto.do?', replacement))
        return urls
if __name__ == '__main__':
    # Usage: <script> <blog-name>
    # <blog-name> is the sub-domain in http://<blog-name>.blog.163.com/album/.
    # Prints one photo download URL per line, page by page.
    if len(sys.argv) < 2:
        # Give a usage hint instead of an IndexError traceback.
        sys.stderr.write('usage: %s <blog-name>\n' % sys.argv[0])
        sys.exit(1)
    name = sys.argv[1]
    ab = Album163blog(name)
    ab.visible().connect()
    while True:
        links = ab.imgs_href()
        if links:
            # A page without photo links would otherwise emit a blank line.
            # print(...) with one argument behaves the same on Python 2 and 3.
            print('\n'.join(links))
        if not ab.next():
            break