Python爬取妹子图

chunguang · 发表于 2018-7-19 15:48:36

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

看了小甲鱼的视频，在煎蛋网上试着爬取妹子图，结果出现下列错误：
Traceback (most recent call last):
  File "C:/Users/lenovo/Desktop/爬虫(妹子图).py", line 43, in <module>
download_mm()
  File "C:/Users/lenovo/Desktop/爬虫(妹子图).py", line 40, in download_mm
img_addrs=find_imgs(page_url)
  File "C:/Users/lenovo/Desktop/爬虫(妹子图).py", line 15, in find_imgs
html=url_open(url).decode('utf-8')
  File "C:/Users/lenovo/Desktop/爬虫(妹子图).py", line 6, in url_open
response=urllib.request.urlopen(url)
  File "E:\Python\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
  File "E:\Python\lib\urllib\request.py", line 531, in open
response = meth(req, response)
  File "E:\Python\lib\urllib\request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
  File "E:\Python\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
  File "E:\Python\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
  File "E:\Python\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found

代码如下：
import urllib.request
import os
def url_open(url):
    """Fetch ``url`` and return the raw response body.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address to fetch.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for an
            HTTP error status such as 404).
    """
    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    )
    # Open the prepared Request rather than the bare URL; otherwise the
    # header added above is never sent.
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_page(url):
    """Return the current comment-page number shown on the page at ``url``.

    The page is searched for the text ``current-comment-page``; the value
    returned is the text between that marker's bracket pair.

    Args:
        url: Address of the page to inspect.

    Returns:
        The page number as a string (not converted to ``int``).

    Raises:
        ValueError: If the marker or the closing ``]`` is not found.
    """
    marker = 'current-comment-page'
    html = url_open(url).decode('utf-8')
    pos = html.find(marker)
    if pos == -1:
        raise ValueError('%r not found in %s' % (marker, url))
    # 23 = the 20-character marker plus 3 more characters
    # (presumably the `">[` that follows it -- confirm against the markup).
    start = pos + 23
    end = html.find(']', start)
    if end == -1:
        raise ValueError('no closing "]" after %r in %s' % (marker, url))
    return html[start:end]
def find_imgs(url):
    """Collect the ``.jpg`` addresses referenced by ``<img src=...>`` tags.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of address strings in document order; empty if none found.
    """
    # The original searched for 'img scr=', which never matches.
    tag = 'img src='
    html = url_open(url).decode('utf-8')
    img_addrs = []
    pos = html.find(tag)
    while pos != -1:
        # Only accept a '.jpg' that appears within 255 characters of the tag.
        end = html.find('.jpg', pos, pos + 255)
        if end != -1:
            # Skip the 8-character tag text plus the opening quote (9 total)
            # and keep the 4-character '.jpg' extension.
            img_addrs.append(html[pos + 9:end + 4])
        else:
            # No .jpg near this tag: resume searching just past it.
            end = pos + 9
        pos = html.find(tag, end)
    return img_addrs
def save_imgs(folder, img_addrs):
    """Download every address in ``img_addrs`` into the current directory.

    Each file is named after the last ``/``-separated component of its
    address.

    Args:
        folder: Not used by this function; files are written relative to
            the current working directory.
        img_addrs: Iterable of image addresses to download.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Download before opening the file so that a failed request does
        # not leave a zero-byte file behind.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
def download_mm(folder='OOXX', pages=10):
    """Download the images from the newest ``pages`` comment pages.

    Creates ``folder`` if needed, changes the working directory into it,
    reads the current page number and then walks backwards one page at a
    time.

    Args:
        folder: Directory that receives the images.
        pages: Maximum number of pages to process, newest first.
    """
    # exist_ok lets the script be re-run without failing on the folder.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    url = "https://jandan.net/ooxx/"
    newest = int(get_page(url))
    for i in range(pages):
        # Derive each page from the fixed starting number. The original
        # `page_num -= i` subtracted cumulatively and skipped pages
        # (N, N-1, N-3, N-6, ...).
        page_num = newest - i
        if page_num < 1:
            break
        page_url = url + 'page-' + str(page_num) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)
# Script entry point: run the crawler only when executed directly.
if '__main__' == __name__:
    download_mm()

BngThea · 发表于 2018-7-19 16:32:06

煎蛋网貌似挂了？

拉了盏灯 · 发表于 2018-7-19 16:40:35

404没找到页面，估计是链接出问题了。

chunguang · 发表于 2018-7-20 16:49:12

拉了盏灯发表于 2018-7-19 16:40
404没找到页面，估计是链接出问题了。

并没有呀

咕咕鸡鸽鸽 · 发表于 2018-7-20 21:40:56

煎蛋网变为动态加载图片=。=
参考方法：https://blog.csdn.net/yxwb1253587469/article/details/52233562

一路吧 · 发表于 2018-7-20 22:33:35

煎蛋那个早就更新换代了

titanss · 发表于 2018-7-22 23:30:10

http://bbs.fishc.org/thread-118812-1-4.html
这个可以，但是下载了60+也报错了

新人 · 发表于 2018-7-23 08:59:29

爬虫都是有时效性的，网站更新很快的。

mongoole · 发表于 2018-7-23 09:28:28

titanss 发表于 2018-7-22 23:30
http://bbs.fishc.org/thread-118812-1-4.html
这个可以，但是下载了60+也报错了

60+是什么意思？什么报错信息贴出来呗，我好改进一下程序嘛~

titanss · 发表于 2018-7-24 00:18:24

mongoole 发表于 2018-7-23 09:28
60+是什么意思？什么报错信息贴出来呗，我好改进一下程序嘛~

我下载到第三页就报错了，断开连接

titanss · 发表于 2018-7-24 00:25:24

mongoole 发表于 2018-7-23 09:28
60+是什么意思？什么报错信息贴出来呗，我好改进一下程序嘛~

Traceback (most recent call last):
  File "C:\Users\Administrator\Desktop\new.py", line 101, in <module>
get_response(url,headers)
  File "C:\Users\Administrator\Desktop\new.py", line 17, in get_response
get_hash(html)
  File "C:\Users\Administrator\Desktop\new.py", line 24, in get_hash
get_url(hashs)
  File "C:\Users\Administrator\Desktop\new.py", line 49, in get_url
get_imgurl(i,r)
  File "C:\Users\Administrator\Desktop\new.py", line 70, in get_imgurl
download_img(url)
  File "C:\Users\Administrator\Desktop\new.py", line 79, in download_img
urllib.request.urlretrieve(url,'D:\\jimg\\' + filename)
  File "C:\Python36\lib\urllib\request.py", line 248, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
  File "C:\Python36\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
  File "C:\Python36\lib\urllib\request.py", line 526, in open
response = self._open(req, data)
  File "C:\Python36\lib\urllib\request.py", line 544, in _open
'_open', req)
  File "C:\Python36\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
  File "C:\Python36\lib\urllib\request.py", line 1346, in http_open
return self.do_open(http.client.HTTPConnection, req)
  File "C:\Python36\lib\urllib\request.py", line 1321, in do_open
r = h.getresponse()
  File "C:\Python36\lib\http\client.py", line 1331, in getresponse
response.begin()
  File "C:\Python36\lib\http\client.py", line 297, in begin
version, status, reason = self._read_status()
  File "C:\Python36\lib\http\client.py", line 258, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Python36\lib\socket.py", line 586, in readinto
return self._sock.recv_into(b)
TimeoutError: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。

cotexA8 · 发表于 2018-7-24 11:56:24

你可以参考一下我的代码，刚学习，刚写的，爬了300多张没问题

import urllib.request
import os
import base64
import re
import random
from urllib.error import URLError
def handle_base64(hd_object):
    """Decode a Base64-encoded value and return it as text.

    The decoded bytes are interpreted as UTF-8. (Slicing ``str(bytes)``
    returns the repr instead, which turns non-ASCII bytes and backslashes
    into escape sequences.)

    Args:
        hd_object: Base64 data accepted by ``base64.b64decode``.

    Returns:
        The decoded text.

    Raises:
        binascii.Error: If ``hd_object`` is incorrectly padded.
        UnicodeDecodeError: If the decoded bytes are not valid UTF-8.
    """
    return base64.b64decode(hd_object).decode('utf-8')
def get_iplist(url):
    """Fetch a proxy-list page and return its proxies as ``ip:port`` strings.

    A table row is accepted when an IPv4 address cell is followed by a
    numeric port cell, one further cell, a ``country`` cell containing
    ``透明`` and a cell whose text starts with ``HTTP``.

    Args:
        url: Address of the proxy-list page.

    Returns:
        A list of ``'ip:port'`` strings in page order; empty if no row
        matches.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4092.1 '
        'Safari/537.36',
    )
    with urllib.request.urlopen(req) as response:
        html = response.read().decode('utf-8')
    # Only the address and the port are captured. The octet class is
    # `[01]`; the original `[0,1]` also accepted a literal comma.
    rows = re.findall(
        r'((?:(?:[01]?\d?\d|2[0-4]\d|25[0-5])\.){3}'
        r'(?:[01]?\d?\d|2[0-4]\d|25[0-5]))'
        r'</td>\s+<td>(\d+)</td>\s+<td>[^"]+'
        r'</td>\s+<td class="country">透明</td>\s+<td>HTTP',
        html,
    )
    return [ip + ':' + port for ip, port in rows]
def proxy_ip(iplist):
    """Install a randomly chosen proxy from ``iplist`` as urllib's opener.

    The chosen ``ip:port`` is printed, then an opener that routes ``http``
    requests through it is installed globally.

    Args:
        iplist: Non-empty sequence of ``'ip:port'`` strings.
    """
    chosen = random.choice(iplist)
    print("当前使用的IP：%s" % chosen)
    handler = urllib.request.ProxyHandler({'http': chosen})
    urllib.request.install_opener(urllib.request.build_opener(handler))
def url_open(url, iplist, max_retries=10):
    """Fetch ``url`` and return the raw response body, rotating proxies on failure.

    Each failed attempt installs another random proxy from ``iplist`` and
    tries again. The original recursed without limit, so a permanently
    failing URL ended in ``RecursionError``; the number of retries is now
    bounded.

    Args:
        url: Address to fetch.
        iplist: ``'ip:port'`` proxy candidates used after a failure.
        max_retries: How many times to switch proxy and retry before
            giving up.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If every attempt fails, or the first
            attempt fails and ``iplist`` is empty.
    """
    failures = 0
    while True:
        # Build a fresh Request per attempt so that no state from a
        # previous proxy attempt is carried over.
        req = urllib.request.Request(url)
        # Identify as a desktop browser.
        req.add_header(
            'User-Agent',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4092.1 '
            'Safari/537.36',
        )
        try:
            response = urllib.request.urlopen(req)
        except URLError:
            failures += 1
            # Give up when out of retries or when there is no proxy to
            # switch to.
            if failures > max_retries or not iplist:
                raise
            print("Ip不能用,重新选择IP\n")
            proxy_ip(iplist)
        else:
            with response:
                return response.read()
def get_page(url, iplist):
    """Return the current comment-page number shown on the page at ``url``.

    Looks for ``current-comment-page`` followed, on the same line, by a
    bracketed value and returns the text inside the brackets.

    Args:
        url: Address of the page to inspect.
        iplist: Proxy candidates passed through to ``url_open``.

    Returns:
        The bracketed text as a string (not converted to ``int``).

    Raises:
        ValueError: If the page does not contain the expected marker.
    """
    html = url_open(url, iplist).decode('utf-8')
    # Raw string: `\[` is an invalid escape in a normal string literal.
    match = re.search(r'current-comment-page.+\[(.+)\]', html)
    if match is None:
        raise ValueError('current-comment-page marker not found in %s' % url)
    return match.group(1)
def find_imgs(page_url, iplist):
    """Return the image addresses encoded on the page at ``page_url``.

    Every ``"img-hash">...<`` span is collected, Base64-decoded with
    ``handle_base64`` and prefixed with ``http:``.

    Args:
        page_url: Address of the page to scan.
        iplist: Proxy candidates passed through to ``url_open``.

    Returns:
        A list of image addresses in page order.
    """
    html = url_open(page_url, iplist).decode('utf-8')
    print('正在访问：%s' % page_url)
    # Example of the markup being matched:
    # "img-hash">Ly93dzMuc2luYWltZy5jbi9tdzYwMC8wMDczb2I2UGd5MWZ0amo1cW5zMWdqMzB6azBucXRkMi5qcGc=</span><br />
    hashes = re.findall('"img-hash">(.+?)<', html)
    # Decode each hash to recover the real image address.
    return ['http:' + handle_base64(encoded) for encoded in hashes]
def save_imgs(folder, img_addr, iplist):
    """Download every address in ``img_addr`` into the current directory.

    Each file is named after the last ``/``-separated component of its
    address; addresses whose file already exists are skipped.

    Args:
        folder: Not used by this function; files are written relative to
            the current working directory.
        img_addr: Iterable of image addresses to download.
        iplist: Proxy candidates passed through to ``url_open``.
    """
    for each in img_addr:
        filename = each.split('/')[-1]
        if os.path.isfile(filename):
            continue
        # Download before opening the file. Otherwise a failed request
        # leaves an empty file that the check above skips on later runs.
        img = url_open(each, iplist)
        with open(filename, 'wb') as f:
            f.write(img)
def download_jiandan(folder='picture', pages=10):
    """Download the images from up to ``pages`` comment pages.

    Creates ``folder`` if needed, changes the working directory into it,
    fetches a proxy list, reads the current page number and then walks
    backwards, starting from the page before the current one.

    Args:
        folder: Directory that receives the images.
        pages: Maximum number of pages to process.
    """
    # A single call replaces the exists()/mkdir() pair and its race.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    url = 'http://jandan.net/ooxx'
    urlip = 'http://www.xicidaili.com'
    iplist = get_iplist(urlip)
    # No proxy is installed up front: requests start on the local IP.
    page_num = int(get_page(url, iplist))
    for _ in range(pages):
        page_num -= 1
        # Stop instead of requesting page-0 or negative page numbers.
        if page_num < 1:
            break
        page_url = url + '/page-' + str(page_num) + '#comments'
        img_addr = find_imgs(page_url, iplist)
        save_imgs(folder, img_addr, iplist)
# Script entry point: run the crawler only when executed directly.
if '__main__' == __name__:
    download_jiandan()

复制代码

mongoole · 发表于 2018-8-3 09:46:34

本帖最后由 mongoole 于 2018-8-3 10:01 编辑

titanss 发表于 2018-7-24 00:25
Traceback (most recent call last):
File "C:\Users\Administrator\Desktop\new.py", line 101, in <module>
...

有可能是你网速问题呢！

titanss · 发表于 2018-8-3 14:13:16

mongoole 发表于 2018-8-3 09:46
有可能是你网速问题呢！

好像真的是呢，谢谢了，学习了

程序员的救赎 · 发表于 2018-9-6 00:51:38

titanss 发表于 2018-8-3 14:13
好像真的是呢，谢谢了，学习了

你可以考虑下将帮他的回复设置为最佳答案。

账号		自动登录	找回密码
密码			立即注册