Yesterday I watched this lesson by 小甲鱼 and tried it myself, only to find that the approach no longer works at all. Inspecting the page source of jandan.net/ooxx/ shows that the real image addresses are not in the source; instead, the browser runs JS that rewrites the page, and the real addresses only show up in the DevTools (F12) view.
After a lot of digging, tracing the request packets, and manually working out the logic of the JS being invoked, I separated out the JS that actually does the work and run it from Python through the execjs module.
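For anyone unfamiliar with execjs, here is a minimal sketch of the pattern the scripts below rely on: compile a string of JS, then call a function defined in it. The add function is just a toy stand-in; the real decoder lives in func1.js and is named jandan_load_img.

import execjs

## compile a JS snippet, then invoke a function defined inside it
ctx = execjs.compile("function add(a, b) { return a + b; }")
print(ctx.call("add", 1, 2))  ## -> 3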
The working code is as follows:
from bs4 import BeautifulSoup
import urllib.request
import execjs
import time
import multiprocessing as mp

headers = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
url = 'http://jandan.net/ooxx'
## current_page used to be a module-level global here, but child processes
## that read this kind of global defined in the parent behave strangely
## (the globals a child sees are the ones captured when the worker was
## forked/spawned, not the parent's live values XD), so task() now
## computes it locally

def main():
    t1 = time.time()
    pool = mp.Pool(4)
    for _offset in range(10):
        pool.apply_async(func=task, args=(_offset,))
    pool.close()
    pool.join()
    temp = time.time() - t1
    line = "Jandan process pool, time for 10 pages: " + str(temp)
    with open("jandan_pools.txt", 'w') as txt:
        txt.write(line)

def task(_offset):
    global headers, url  ## never modified, so the child/parent global-variable problem does not apply
    opener = urllib.request.build_opener()
    ## set the User-Agent properly (assigning a bare UA string to a made-up
    ## add_handler attribute would silently do nothing)
    opener.addheaders = [('User-Agent', headers)]
    html = opener.open(url).read()
    soup = BeautifulSoup(html, 'html5lib')
    current_page = int(soup.find(class_='current-comment-page').text[1:-1])

    my_page = current_page - _offset
    html = opener.open(''.join([url, '/page-', str(my_page)])).read()
    print("Open")
    soup = BeautifulSoup(html, 'html5lib')
    ol = soup.find(name='ol', class_='commentlist')
    li = ol.find_all(name='li')
    count = 0
    for each in li:
        if each.has_attr('id') and not each.has_attr('class'):
            img = each.find(name='img')
            img_hash = img.next_sibling.text
            ## run the extracted JS to turn the hash into the real image URL
            with open("./func1.js", encoding='UTF-8') as f:
                js_code = f.read()
            ctx = execjs.compile(js_code)
            img_url = ctx.call("jandan_load_img", img_hash)

            with open(''.join([r"C:/Users/ezio7/Desktop/img/", str(_offset), '_', str(count), img_url[-4:]]), 'wb') as f:
                print("T" + str(_offset))
                f.write(opener.open(''.join(['http:', img_url])).read())
            count += 1
    return 1


if __name__ == "__main__":
    main()
This is the multiprocess (process-pool) version; crawling 10 pages takes 80-90 seconds.
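As an aside, here is a minimal toy sketch (my own example, not part of the crawler) of the global-variable pitfall mentioned in the comment near the top of the script. On Windows, multiprocessing starts workers with "spawn": each child re-imports the module, so it only sees the value a global had at import time.

import multiprocessing as mp

counter = 0  ## module-level global

def show(_):
    ## a spawned worker re-imports this module, so it sees counter == 0
    return counter

if __name__ == "__main__":
    counter = 99  ## rebinds the global in the parent process only
    with mp.Pool(2) as pool:
        print(pool.map(show, range(2)))  ## [0, 0] under spawn; fork on Linux would give [99, 99]

For comparison, the single-process version: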
from bs4 import BeautifulSoup
import urllib.request
import execjs
import time

headers = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
url = 'http://jandan.net/ooxx'

def main():
    t1 = time.time()
    func()
    temp = time.time() - t1
    line = "Jandan single process, time for 10 pages: " + str(temp)
    with open("jandan_time.txt", 'w') as txt:
        txt.write(line)

def func():
    global headers, url
    count = 0
    page = 0
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', headers)]  ## same User-Agent fix as above
    html = opener.open(url).read()
    soup = BeautifulSoup(html, 'html5lib')
    current_page = int(soup.find(class_='current-comment-page').text[1:-1])

    while page < 10:
        ol = soup.find(name='ol', class_='commentlist')
        li = ol.find_all(name='li')
        for each in li:
            if each.has_attr('id') and not each.has_attr('class'):
                img = each.find(name='img')
                img_hash = img.next_sibling.text
                ## run the extracted JS to decode the image hash
                with open("./func1.js", encoding='UTF-8') as f:
                    js_code = f.read()
                ctx = execjs.compile(js_code)
                img_url = ctx.call("jandan_load_img", img_hash)
                with open(''.join([r"C:/Users/ezio7/Desktop/img", '/_', str(count), img_url[-4:]]), 'wb') as f:
                    f.write(opener.open(''.join(['http:', img_url])).read())
                count += 1
        current_page -= 1
        page += 1
        print(current_page, '\t', page)
        html = opener.open(''.join([url, '/page-', str(current_page)])).read()
        soup = BeautifulSoup(html, 'html5lib')


if __name__ == "__main__":
    main()
This is the single-process version; it takes 130+ seconds.
The JS I analyzed and modified is in the attachment; to replace certain external functions the original JS depended on, I wrote in functions with the same behavior.
This is pure brute-force crawling and extremely likely to get blocked or fall over; it exists only to measure the speed difference between the process-pool and plain approaches (setting up proxies and delays felt like too much hassle XD, though the sketch below shows how simple a basic delay can be).
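If you do want a delay, a minimal throttle could look like the following. polite_open is my own hypothetical helper, not part of the attached code.

import random
import time

## hypothetical helper: sleep a random 1-3 s before each request to be
## gentler on the server (not in the attached code)
def polite_open(opener, target_url):
    time.sleep(random.uniform(1.0, 3.0))
    return opener.open(target_url).read()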
Put the .py file and the .js file in the same directory, and change the download path to suit your own machine.
煎蛋妹子OOXX图片爬取测试代码.zip
(8.1 KB, 26 downloads)
Treasure every site that hands out goodies like this one, and please go easy with brute-force crawlers.
One more thing: I don't know why other people's crawlers are I/O-bound, where multithreading works nicely, while this one of mine is CPU-bound. Is it because of execjs? I've heard that module is quite slow. One likely contributor: the loops above reopen and recompile func1.js for every single image, and as far as I can tell execjs executes each call in an external JS runtime (Node, JScript, and so on), so every image pays that full startup cost.
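If that is the bottleneck, a compile-once variant is easy to try. A sketch, under the same assumption as above that func1.js sits next to the script and defines jandan_load_img:

import execjs

## compile the decoder once at import time instead of once per image
with open("./func1.js", encoding='UTF-8') as f:
    CTX = execjs.compile(f.read())

def decode_img_url(img_hash):
    ## each call still goes out to the external JS runtime, but the
    ## per-image file read and recompilation are gone
    return CTX.call("jandan_load_img", img_hash)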