作者ericsk (认真的艾瑞克)
看板Python
标题[范例] 无名小站相簿 grabber
时间Mon Dec 26 17:21:46 2005
使用方法: *.py <帐号> <相簿编号>
目前的版本是搭配系统中的 wget 来抓 URL
如果大家有什么建议改进的地方都可以提出来讨论 ^^; 互相学习啰
#!/usr/bin/python
import os
import re
import subprocess
import sys
import urllib2
# Base URL of the album site. It is prepended to every page path and is also
# sent as the Referer header on each request made by this script.
# (A `global` statement is meaningless at module level, so it was dropped; the
# literal is kept on one line so the URL does not start with a newline.)
addr = "http://www.wretch.cc/album"
def get_pic(path):
    """Download the image displayed on a single photo page.

    Fetches ``addr + path``, scans the response for the line containing the
    ``<img id='DisplayImage' ...></a>`` markup, takes the first ``http...``
    URL on that line that is terminated by a single quote, and passes it to
    ``/usr/bin/wget``.

    Args:
        path: page path appended to ``addr``; grab() supplies the href of a
            thumbnail link with its first character removed.

    Returns:
        None. A failure to launch wget is reported on stderr and skipped.
    """
    # The posted code replaced '&' with '&', which does nothing. Presumably
    # the original unescaped '&amp;' and the entity was decoded when the post
    # was rendered -- TODO confirm against real page markup.
    url = (addr + path).replace('&amp;', '&')

    request = urllib2.Request(url)
    request.add_header('Referer', addr)
    request.add_header('User-Agent', 'Mozilla 5.0')
    response = urllib2.urlopen(request)
    try:
        page_lines = response.readlines()
    finally:
        # Release the connection even if reading fails.
        response.close()

    for line in page_lines:
        if not re.match('.*<img id=\'DisplayImage\'.*?></a>', line):
            continue
        image_urls = re.findall('(http.*?)\'', line)
        if not image_urls:
            continue
        # Pass an argument list instead of a shell string: the URL comes from
        # a remote page and must never be interpreted by a shell.
        command = [
            '/usr/bin/wget',
            '--header=Referer: http://www.wretch.cc/album',
            '--header=User-Agent: Mozilla 5.0',
            image_urls[0],
        ]
        try:
            subprocess.call(command)
        except OSError as exc:
            # os.system() never raised when wget was missing; stay
            # best-effort but make the failure visible.
            sys.stderr.write('cannot run wget: %s\n' % exc)
def grab(lines):
    """Download every photo linked from one album index page.

    Each line mentioning ``show.php`` is searched for thumbnail anchors of
    the form ``<a href=".PATH" ><...</a>``; every PATH that itself contains
    ``show.php`` is handed to get_pic().

    Args:
        lines: iterable of HTML source lines of the index page.

    Returns:
        True if no line mentions ``show.php`` (the caller uses this as the
        signal to stop paging), False otherwise.
    """
    found_photo_link = False
    for line in lines:
        # Literal substring test: the old pattern ".*show.php.*" left its
        # dots unescaped, so it also matched text such as "showXphp".
        if 'show.php' not in line:
            continue
        found_photo_link = True
        # Follow every thumbnail anchor on the line, not only the first one,
        # so pages that emit several thumbnails per line are fully fetched.
        for href in re.findall('<a href=".(.*?)" ><.*?</a>', line):
            if 'show.php' in href:
                get_pic(href)
    return not found_photo_link
if __name__ == "__main__":
    # Usage: <script> <account> <album number>
    if len(sys.argv) < 3:
        # Fail with a readable message instead of an IndexError traceback.
        sys.stderr.write("usage: %s <account> <album-id>\n" % sys.argv[0])
        sys.exit(1)

    user = sys.argv[1]
    book = sys.argv[2]
    url = addr + "/album.php?id=" + user + "&book=" + book

    # Walk the album index pages starting at page 1 until grab() reports a
    # page that contains no photo links.
    page_no = 1
    while True:
        req = urllib2.Request(url + "&page=" + str(page_no))
        req.add_header('Referer', addr)
        req.add_header('User-Agent', 'Mozilla 5.0')
        page = urllib2.urlopen(req)
        try:
            page_lines = page.readlines()
        finally:
            # Close each response before requesting the next page.
            page.close()
        if grab(page_lines):
            break
        page_no += 1
--
※ 发信站: 批踢踢实业坊(ptt.cc)
◆ From: 140.112.31.143
1F:推 wawawa:我觉得看别人的范例是学习最快的方法了:) 12/26 17:42