含有章节索引的 *PUG 文章通用模板 ::-- jigloo
Contents
1. 导出网易博客相册的图片地址
简述：网易 blog 的相册比原来的网易相册复杂许多，一层一层地分析 JavaScript 让人头都大了，所以就用 pywin32 调用 IE 来干这个活了。后来又改了改，速度快了一点。
1.1. 代码
# coding:cp936
"""Export the image download URLs of a NetEase (163.com) blog album.

The album pages are built by JavaScript, so instead of parsing them this
script drives Internet Explorer through pywin32 COM automation and reads the
links from the live document.

Usage: python <script> <blog-name>
"""
import re
import sys
import time

try:
    import win32com.client
except ImportError:
    # Keep the module importable where pywin32 is absent; constructing
    # Album163blog still fails with a clear error in that case.
    win32com = None

# Value of ReadyState that the wait loop treats as "document fully loaded".
READYSTATE_COMPLETE = 4
_POLL_SECONDS = 1
_PAGER_MARKER = '.blog.163.com/album/#p'
# The dot is escaped so only the literal "prevPhoto.do?" is rewritten.
_PREVIEW_RE = re.compile(r'prevPhoto\.do\?')


class Album163blog(object):
    """Walk the album pages of one 163.com blog inside Internet Explorer.

    Args:
        name: blog account name, used as the sub-domain of ``blog.163.com``.
        nextword: text of the "next page" link that ``next()`` clicks.
        percount: number of distinct preview links that marks a page as
            loaded.
        timeout: seconds to wait for a page before giving up; ``None``
            (the default) waits indefinitely.

    Raises:
        ImportError: if pywin32 (``win32com``) is not installed.
    """

    def __init__(self, name, nextword=u'\u4e0b\u4e00\u9875', percount=8,
                 timeout=None):
        if win32com is None:
            raise ImportError(
                'pywin32 (win32com) is required to drive Internet Explorer')
        self.name = name
        self.nextword = nextword
        self.percount = percount
        self.timeout = timeout
        self.curpage = 1
        # Placeholder only; connect() replaces it with the real page count.
        self.pages = 100
        self.ie = win32com.client.Dispatch('InternetExplorer.Application')

    def __home__(self):
        """Return the URL of the blog's album index page."""
        return 'http://%s.blog.163.com/album/' % self.name

    def __pageloaded__(self):
        """Return True once the current page shows enough preview links.

        NOTE(review): on the last page this returns True immediately without
        checking the links, so its images may not all be present yet.
        """
        return (self.curpage >= self.pages
                or len(self.imgs_href()) >= self.percount)

    def __pages__(self):
        """Return the page count read from the pager links.

        The second-to-last pager link is used (or the only one, if there is
        just one). When the document has no pager links at all, 1 is returned
        -- presumably a single-page album; verify against the live site.
        """
        pager = [link.href for link in self.ie.Document.links
                 if _PAGER_MARKER in link.href]
        if not pager:
            return 1
        url = pager[-2:][0]
        return int(url[url.find('#p') + 2:])

    def _wait_until(self, ready):
        """Poll ``ready()`` until it is true or ``self.timeout`` elapses.

        Returns True if ``ready()`` became true, False on timeout.
        """
        deadline = None
        if self.timeout is not None:
            deadline = time.time() + self.timeout
        while not ready():
            if deadline is not None and time.time() >= deadline:
                return False
            time.sleep(_POLL_SECONDS)
        return True

    def visible(self, v=True):
        """Show or hide the browser window; returns self for chaining."""
        self.ie.Visible = v
        return self

    def connect(self):
        """Open the album index, wait for it to load and read the page count.

        Returns self for chaining.
        """
        self.ie.Navigate2(self.__home__())
        time.sleep(_POLL_SECONDS)
        # Keep waiting while IE is busy OR the document is not complete; the
        # previous "and" stopped waiting as soon as either condition cleared.
        self._wait_until(
            lambda: not self.ie.Busy
            and self.ie.ReadyState == READYSTATE_COMPLETE)
        self._wait_until(self.__pageloaded__)
        self.pages = self.__pages__()
        return self

    def next(self):
        """Advance to the next album page.

        Returns False when the last page has already been visited or no
        "next page" link can be found, True after the link has been clicked.
        """
        self.curpage += 1
        if self.curpage > self.pages:
            return False
        for link in self.ie.Document.links:
            text = link.innerText
            # innerText may be empty or None for image-only links.
            if text and self.nextword in text:
                link.click()
                break
        else:
            return False
        self._wait_until(self.__pageloaded__)
        return True

    def imgs_href(self):
        """Return download URLs for the previews on the current page.

        Each distinct link containing ``prevPhoto`` is listed once, in
        document order, with ``prevPhoto.do?`` rewritten to
        ``prevPhDownload.do?host=<name>&``.
        """
        download = 'prevPhDownload.do?host=%s&' % self.name
        seen = set()
        urls = []
        for link in self.ie.Document.links:
            href = link.href
            if 'prevPhoto' in href and href not in seen:
                seen.add(href)
                # A function replacement keeps backslashes in the blog name
                # from being interpreted as regex escapes.
                urls.append(_PREVIEW_RE.sub(lambda _match: download, href))
        return urls

    def close(self):
        """Quit the Internet Explorer instance started by this object."""
        self.ie.Quit()


def main(argv=None):
    """Print every image URL of the album named by ``argv[1]``.

    Returns the process exit status: 0 on success, 2 on a usage error.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        prog = argv[0] if argv else 'album163blog.py'
        sys.stderr.write('usage: %s <blog-name>\n' % prog)
        return 2
    album = Album163blog(argv[1])
    try:
        album.visible().connect()
        while True:
            print('\n'.join(album.imgs_href()))
            if not album.next():
                break
    finally:
        # Do not leave an orphaned IE process behind.
        album.close()
    return 0


if __name__ == '__main__':
    sys.exit(main())