|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 mongoole 于 2018-5-16 14:41 编辑
1.只选取了1080p的页面进行爬虫来的。
2.还请大家看看,有什么需要改进、改正的地方请指出来,大家互相一起提升哈。
- #coding:utf-8
- import requests
- from lxml import etree
- import re
- import time
class Dygang:
    """Scrape movie torrent links from one 1080p listing page of dygang.net.

    Pipeline (driven by ``url_start``): download the listing page, extract
    (detail-page URL, movie name) pairs, visit each detail page for its
    torrent link, and append the results to ``dygang.txt``.
    """

    def __init__(self, url):
        # url: address of one 1080p listing page (index.htm / index_<n>.htm).
        self.url = url

    def get_response(self):
        """Download the listing page and cache its text on ``self.html``.

        The site serves GBK-encoded HTML, so the encoding is forced before
        reading ``response.text``.
        """
        # timeout so a stalled connection cannot hang the whole crawl
        response = requests.get(self.url, timeout=30)
        response.encoding = 'gbk'
        self.html = response.text
        return self.html

    def get_url(self):
        """Extract (detail-page URL, movie name) pairs from the listing HTML.

        Returns the list of ``(url, name)`` tuples and caches it on
        ``self.movie_url``.
        """
        self.movie_url = re.findall(
            r'<a href="(.+?)" target="_blank" class="classlinkclass">(.+?)</a>',
            self.html)
        return self.movie_url

    def tgt_url(self):
        """Visit each detail page and collect its torrent link into ``url_dict``."""
        for url, name in self.movie_url:
            response = requests.get(url, timeout=30)
            response.encoding = 'gbk'
            self.html = response.text
            selector = etree.HTML(self.html)
            try:
                torrent_url = selector.xpath(
                    '//*[@id="dede_content"]/table/tbody/tr/td/a/@href')[0]
            except IndexError:
                # This detail page has no torrent link: skip it and keep
                # going.  (The original bare ``except: break`` aborted the
                # remainder of the listing page on the first miss.)
                continue
            url_dict[name] = torrent_url

    def wirte_tgt(self):
        """Append the collected ``name,torrent_url`` pairs to dygang.txt.

        (Method name keeps the original 'wirte' spelling for caller
        compatibility.)
        """
        # explicit encoding so the output file is the same on every platform
        with open('dygang.txt', 'a', encoding='utf-8') as file:
            for key, value in url_dict.items():
                file.write('{0},{1}\n'.format(key, value))

    def url_start(self):
        """Run the full pipeline for one listing page: fetch, parse, resolve, save."""
        self.get_response()
        self.get_url()
        # url_dict stays a module-level global for backward compatibility
        # with the original script's design.
        global url_dict
        url_dict = {}
        self.tgt_url()
        self.wirte_tgt()
def main():
    """Crawl listing pages 1..179 of the 1080p section, 5 s apart.

    Page 1 is ``index.htm``; every later page is ``index_<n>.htm``.
    The original loop started at 0, so its first request went to the
    non-existent ``index_0.htm`` — start at 1 instead.
    """
    for i in range(1, 180):
        if i == 1:
            url = 'http://www.dygang.net/1080p/index.htm'
        else:
            url = 'http://www.dygang.net/1080p/index_' + str(i) + '.htm'
        dygang = Dygang(url)
        dygang.url_start()
        time.sleep(5)  # be polite: pause between listing pages
        print('%s <--------------> 已爬取完成' % url)


if __name__ == '__main__':
    main()
复制代码 |
|