import scrapy
from tutoria.items import DmozItem
class DmozSpider(scrapy.Spider):
    """Spider that collects directory listings from two dmoz.org Python pages.

    For every listing found on a start page, ``parse`` builds a ``DmozItem``
    holding the listing's title, link and description.
    """

    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    def parse(self, response):
        """Build one ``DmozItem`` per listing found in *response*.

        Args:
            response: The page response to scrape.

        Returns:
            A list of ``DmozItem`` objects. ``title`` and ``link`` hold the
            lists produced by ``extract()``; ``desc`` holds the first
            description text node with surrounding whitespace stripped, or
            an empty string when the listing has no description text.
        """
        sel = scrapy.selector.Selector(response)
        # NOTE(review): the @class comparisons are exact string matches and
        # the values carry a trailing space ("site-item ", "site-descr ");
        # confirm this against the page markup before normalizing them.
        sites = sel.xpath('//div[@class="site-item "]/div[@class="title-and-desc"]')
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.xpath('a/div/text()').extract()
            item['link'] = site.xpath('a/@href').extract()
            # A listing without description text yields an empty list here;
            # indexing it blindly would raise IndexError and drop every
            # item collected from the page, so fall back to ''.
            desc_texts = site.xpath('div[@class="site-descr "]/text()').extract()
            item['desc'] = desc_texts[0].strip() if desc_texts else ''
            items.append(item)
        return items
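# Note: the website has since been changed, so the code from the book no
# longer works and cannot scrape any real data.
# The selector expressions were adjusted here as practice to reinforce the material.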