##language:zh #pragma section-numbers on ''' 网页抓取实作 ''' ::-- ZoomQuiet [<>] <> ## 默许导航,请保留 <> = 献给买基金的朋友 = == Jun Tsai == {{{Jun Tsai hide details 3:53 pm (30 minutes ago) reply-to python-chinese@lists.python.cn to python-chinese@lists.python.cn date Aug 2, 2007 3:53 PM subject [python-chinese] 献给买基金的朋友. 今天用python写的一个脚本,来自动抓取今日基金的收益情况(懒得去网站看),刚学习python,错误的地方多指正. }}} {{{#!python # -*- coding: UTF-8 -*- import httplib, urllib,re import datetime FUND_CODE = "161706" SALE_DATE="2007-06-22" SALE_MONEY = 5000.0 TODAY_DATE=datetime.date.today() PANEL = "biz.finance.sina.com.cn" USERAGENT = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv: 1.8.0.1) Gecko/20060111 Firefox/1.5.0.1' PATH="/fundinfo/open/lsjz.php?fund_code=" """ 用来自动抓取基金的值,获取对应的利润情况. author:jun tsai revision:$Revision: 3191 $ since:0.1 """ def get_found_value(fund_code,sale_date,sale_money): """自动抓取基金净值的脚本程序,通过给定的基金代码,买基金的日期,以及投入使用的钱, 来自动抓取基金的净值,以及利润 """ params = urllib.urlencode({"startdate1":sale_date,"enddate1":TODAY_DATE}) headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", 'Referer' :'https://'+PANEL+PATH+fund_code, 'User-Agent':USERAGENT } conn = httplib.HTTPConnection(PANEL) conn.request("POST", PATH+fund_code, params, headers) response = conn.getresponse() data = response.read() data=data.decode ("gb2312") conn.close(); pattern = '(.+)\('+fund_code+'\)' all_matches = re.findall(pattern,data); fund_name = all_matches[0].encode("utf-8") # print all_matches[0] pattern='<a href=\'./lsjz_dwjz.php\?jzrq=(.*)\'[\s]+target=_blank>(.*)</a>' all_matches=re.findall(pattern,data); today_value = float(all_matches[0][1]) sale_value = float(all_matches[len(all_matches)-1][1]) sale_count = sale_money/sale_value value=(today_value-sale_value)*sale_count print "|"+construct_block(10,fund_code)+"|"+construct_head_block(20,fund_name)+"|" +construct_block(10,sale_value.__str__())+"|" +construct_block(20,sale_count.__str__())+"|" +construct_block(10,today_value.__str__())+"|" +construct_block(20,value.__str__())+"|" def construct_block(length,str): r=' '+str while(length>len(r)): r+=' ' return r def construct_head_block(length,str): r=' '+str head_str_len=len( str.decode("utf-8")) while(length>(len(r)-head_str_len)): r+=' ' return r print "+-----------------------------------------------------------------------------------------------+" print "|"+construct_head_block(10,"代码")+"|"+construct_head_block(20,"名称")+"|"+construct_head_block(10,"购买净值")+"|"+construct_head_block(20,"购买数")+"|"+construct_head_block(10,"今日净值")+"|"+construct_head_block(20,"利润")+"|" print "+-----------------------------------------------------------------------------------------------+" get_found_value("161706","2007-06-22",5000.0) get_found_value("260110","2007-06-10", 5000.0) get_found_value("070011","2007-06-23",5000.0) print "+-----------------------------------------------------------------------------------------------+" }}} == shily escape == {{{shily escape <jelly1982@gmail.com> hide details 4:01 pm (21 minutes ago) reply-to python-chinese@lists.python.cn to python-chinese@lists.python.cn date Aug 2, 2007 4:01 PM subject Re: [python-chinese] 献给买基金的朋友. 呵呵, 巧了. 我这两天也写了一个. 不过没有算利润的这些东西 }}} {{{#!python #!/usr/bin/env python # -*- coding: utf-8 -*- import re import urllib import time from sqlalchemy import * class Fund: def __init__(self, code): self.code = code self.attributes = {} def __setitem__(self, key, value): self.attributes[key] = value def __getitem__(self, key): return self.attributes[key] fund_all = {} fund_url = 'http://my.fund.163.com/stock/rankkfs.htm' u_sock = urllib.urlopen(fund_url) fund_str = u_sock.read().decode('gb2312') fund_str = fund_str.encode('utf-8') print fund_str u_sock.close() r_item_pattern = re.compile(r'<tr align="center" bgcolor="(#EFEFEF|#E7F3FE)" class="bzi">(.*?)</tr>', re.DOTALL) r_anchor_pattern = re.compile(r'<td><a href=.*?>(.*?)</a></td>') r_normal_pattern = re.compile(r'<td>([-0-9]*\.*\d*)</td>') fund_list = r_item_pattern.findall(fund_str) file_name = time.strftime('%Y%m%d') + '.html' f = open( file_name, 'w') db = BoundMetaData("mysql://root:clhclh@localhost/testcase?charset=utf8", echo=True) funds = Table('funds', db, autoload=True) for item in fund_list: i = funds.insert() s = item[1] f.write(s) anchor_tuple = r_anchor_pattern.findall(s) fund = Fund(anchor_tuple[0]) fund['name'] = anchor_tuple[1] fund['company'] = anchor_tuple[2] normal_tuple = r_normal_pattern.findall(s) fund['date'] = normal_tuple[0] fund['util'] = normal_tuple[1] fund['total'] = normal_tuple[2] fund['rate'] = normal_tuple[3] funds.insert().execute({'name':fund['name'],'code':fund.code,'date':fund['date'],'util':fund['util'],'total':fund['total'],'rate':fund['rate'],'company':fund['company']}) # sql = "insert into funds(name, code, `date`, util, total, rate, company) values('%s', '%s', '%s', '%s', '%s', '%s', '%s')" \ # % (fund['name'], fund.code, fund['date'], fund['util'], fund['total'], fund['rate'], fund['company']) # try: # print sql # cursor.execute(sql) # except Exception, e: g # print e # fund_all[fund.code] = fund # f.write(s) #conn.commit() #cursor.close() #conn.close() f.close() }}} === ruby版: === {{{ #!/usr/bin/env ruby # Time-stamp: <2007-08-02[星期四] 14:15:47 dongsheng> require 'net/http' require 'iconv' url = URI.parse('http://my.fund.163.com/stock/rankkfs.htm') req = Net::HTTP::Get.new (url.path) res = Net::HTTP.start(url.host, url.port) do |http| http.request(req) end content = res.body #content = Iconv.iconv('utf-8', 'gb2312', content) #puts content re_items = /\<tr align="center" bgcolor="(#EFEFEF|#E7F3FE)" class="bzi">(.*?)\<\/tr\>/im re_anchor = /\<td\>\<a href=.*?\>(.*?)\<\/a\>\<\/td\>/im re_normal = /\<td\>([0-9-]*\.*\d*)\<\/td\>/im content.scan(re_items) do |x,y| anchors = y.scan(re_anchor) puts "code: #{Iconv.iconv('utf-8', 'gb2312', anchors[0].to_s)}" puts "name: #{Iconv.iconv('utf-8', 'gb2312', anchors[1].to_s)}" puts "company: #{Iconv.iconv('utf-8', 'gb2312', anchors[2].to_s)}" normal = y.scan(re_normal) puts "date: #{normal[0]}" puts "util: #{normal[1]}" puts "total: #{normal[2]}" puts "rate: #{normal[3]}" end }}} == 反馈 ==