Yesterday I watched this lesson by 小甲鱼 and tried it myself, only to find that the approach no longer works at all. Inspecting the page source of jandan.net/ooxx/ shows that the real image addresses are not in the source; instead, the browser runs JS that rewrites the page, and the real addresses only show up in the DevTools (F12) view.
After a lot of digging, tracing the request packets, and manually working out the logic of the JS being invoked, I separated out the JS that actually does the work and run it from Python through the execjs module.
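For anyone unfamiliar with execjs, here is a minimal sketch of the pattern the scripts below rely on: compile a string of JS, then call a function defined in it. The add function is just a toy stand-in; the real decoder lives in func1.js and is named jandan_load_img.

import execjs

## compile a JS snippet, then invoke a function defined inside it
ctx = execjs.compile("function add(a, b) { return a + b; }")
print(ctx.call("add", 1, 2))  ## -> 3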
The working code is as follows:
from bs4 import BeautifulSoup
import urllib.request
import execjs
import time
import multiprocessing as mp

headers = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
url = 'http://jandan.net/ooxx'
## current_page used to be a module-level global here, but child processes
## that read this kind of global defined in the parent behave strangely
## (the globals a child sees are the ones captured when the worker was
## forked/spawned, not the parent's live values XD), so task() now
## computes it locally

def main():
    t1 = time.time()
    pool = mp.Pool(4)
    for _offset in range(10):
        pool.apply_async(func=task, args=(_offset,))
    pool.close()
    pool.join()
    temp = time.time() - t1
    line = "Jandan process pool, time for 10 pages: " + str(temp)
    with open("jandan_pools.txt", 'w') as txt:
        txt.write(line)

def task(_offset):
    global headers, url  ## never modified, so the child/parent global-variable problem does not apply
    opener = urllib.request.build_opener()
    ## set the User-Agent properly (assigning a bare UA string to a made-up
    ## add_handler attribute would silently do nothing)
    opener.addheaders = [('User-Agent', headers)]
    html = opener.open(url).read()
    soup = BeautifulSoup(html, 'html5lib')
    current_page = int(soup.find(class_='current-comment-page').text[1:-1])

    my_page = current_page - _offset
    html = opener.open(''.join([url, '/page-', str(my_page)])).read()
    print("Open")
    soup = BeautifulSoup(html, 'html5lib')
    ol = soup.find(name='ol', class_='commentlist')
    li = ol.find_all(name='li')
    count = 0
    for each in li:
        if each.has_attr('id') and not each.has_attr('class'):
            img = each.find(name='img')
            img_hash = img.next_sibling.text
            ## run the extracted JS to turn the hash into the real image URL
            with open("./func1.js", encoding='UTF-8') as f:
                js_code = f.read()
            ctx = execjs.compile(js_code)
            img_url = ctx.call("jandan_load_img", img_hash)

            with open(''.join([r"C:/Users/ezio7/Desktop/img/", str(_offset), '_', str(count), img_url[-4:]]), 'wb') as f:
                print("T" + str(_offset))
                f.write(opener.open(''.join(['http:', img_url])).read())
            count += 1
    return 1


if __name__ == "__main__":
    main()
This is the multiprocess (process-pool) version; crawling 10 pages takes 80-90 seconds.
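As an aside, here is a minimal toy sketch (my own example, not part of the crawler) of the global-variable pitfall mentioned in the comment near the top of the script. On Windows, multiprocessing starts workers with "spawn": each child re-imports the module, so it only sees the value a global had at import time.

import multiprocessing as mp

counter = 0  ## module-level global

def show(_):
    ## a spawned worker re-imports this module, so it sees counter == 0
    return counter

if __name__ == "__main__":
    counter = 99  ## rebinds the global in the parent process only
    with mp.Pool(2) as pool:
        print(pool.map(show, range(2)))  ## [0, 0] under spawn; fork on Linux would give [99, 99]

For comparison, the single-process version: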
from bs4 import BeautifulSoup
import urllib.request
import execjs
import time

headers = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
url = 'http://jandan.net/ooxx'

def main():
    t1 = time.time()
    func()
    temp = time.time() - t1
    line = "Jandan single process, time for 10 pages: " + str(temp)
    with open("jandan_time.txt", 'w') as txt:
        txt.write(line)

def func():
    global headers, url
    count = 0
    page = 0
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', headers)]  ## same User-Agent fix as above
    html = opener.open(url).read()
    soup = BeautifulSoup(html, 'html5lib')
    current_page = int(soup.find(class_='current-comment-page').text[1:-1])

    while page < 10:
        ol = soup.find(name='ol', class_='commentlist')
        li = ol.find_all(name='li')
        for each in li:
            if each.has_attr('id') and not each.has_attr('class'):
                img = each.find(name='img')
                img_hash = img.next_sibling.text
                ## run the extracted JS to decode the image hash
                with open("./func1.js", encoding='UTF-8') as f:
                    js_code = f.read()
                ctx = execjs.compile(js_code)
                img_url = ctx.call("jandan_load_img", img_hash)
                with open(''.join([r"C:/Users/ezio7/Desktop/img", '/_', str(count), img_url[-4:]]), 'wb') as f:
                    f.write(opener.open(''.join(['http:', img_url])).read())
                count += 1
        current_page -= 1
        page += 1
        print(current_page, '\t', page)
        html = opener.open(''.join([url, '/page-', str(current_page)])).read()
        soup = BeautifulSoup(html, 'html5lib')


if __name__ == "__main__":
    main()
This is the single-process version; it takes 130+ seconds.
The JS I analyzed and modified is in the attachment; to replace certain external functions the original JS depended on, I wrote in functions with the same behavior.
This is pure brute-force crawling and extremely likely to get blocked or fall over; it exists only to measure the speed difference between the process-pool and plain approaches (setting up proxies and delays felt like too much hassle XD, though the sketch below shows how simple a basic delay can be).
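If you do want a delay, a minimal throttle could look like the following. polite_open is my own hypothetical helper, not part of the attached code.

import random
import time

## hypothetical helper: sleep a random 1-3 s before each request to be
## gentler on the server (not in the attached code)
def polite_open(opener, target_url):
    time.sleep(random.uniform(1.0, 3.0))
    return opener.open(target_url).read()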
Put the .py file and the .js file in the same directory, and change the download path to suit your own machine.
煎蛋妹子OOXX图片爬取测试代码.zip
(8.1 KB, 26 downloads)
Treasure every site that hands out goodies like this one, and please go easy with brute-force crawlers.
One more thing: I don't know why other people's crawlers are I/O-bound, where multithreading works nicely, while this one of mine is CPU-bound. Is it because of execjs? I've heard that module is quite slow. One likely contributor: the loops above reopen and recompile func1.js for every single image, and as far as I can tell execjs executes each call in an external JS runtime (Node, JScript, and so on), so every image pays that full startup cost.
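If that is the bottleneck, a compile-once variant is easy to try. A sketch, under the same assumption as above that func1.js sits next to the script and defines jandan_load_img:

import execjs

## compile the decoder once at import time instead of once per image
with open("./func1.js", encoding='UTF-8') as f:
    CTX = execjs.compile(f.read())

def decode_img_url(img_hash):
    ## each call still goes out to the external JS runtime, but the
    ## per-image file read and recompilation are gone
    return CTX.call("jandan_load_img", img_hash)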