python - scrapy递归爬虫问题
怪我咯
怪我咯 2017-04-17 17:30:55
[Python讨论组]

spider如下:
`# -*- coding: utf-8 -*-
import os
import sys

# Python 2 idiom: reload(sys) alone does nothing useful — its only purpose
# is to restore sys.setdefaultencoding, which must then actually be called.
# NOTE(review): this code is Python 2 only (reload is a builtin there).
reload(sys)
sys.setdefaultencoding('utf-8')

from scrapy.spiders import Spider
# BUG FIX: Request is defined in scrapy.http, not scrapy.spiders;
# the original `from scrapy.spiders import Request` raises ImportError.
from scrapy.http import Request
from scrapy.selector import Selector

from network.items import WandoujiawangyeItem

# Root directory under which one folder per (sub)category is created.
base = "D:/python_workspace/datasets/"

class WandoujiawangyeSpider(Spider):
    """Three-level recursive spider for wandoujia.com.

    parse()        -> top-level categories (parent-cate) and sub-categories
                      (child-cate); creates one directory per (sub)category.
    second_parse() -> follows each sub-category page, collecting detail links.
    detail_parse() -> scrapes the title and description of one detail page.

    NOTE(review): in the original paste the three methods were indented at
    module level; they must live inside the class body or Scrapy never
    dispatches to them.
    """

    name = "wandoujiawangye"
    download_delay = 1  # be polite: 1 second between requests
    allowed_domains = ["wandoujia.com"]
    start_urls = [
        "http://www.wandoujia.com/category/app",
        "http://www.wandoujia.com/category/game",
    ]

    def parse(self, response):
        """Collect category/sub-category pairs and schedule sub-category pages."""
        sel = Selector(response)
        big_urls = sel.xpath('//li[@class="parent-cate"]/a/@href').extract()
        big_titles = sel.xpath('//li[@class="parent-cate"]/a/text()').extract()
        second_urls = sel.xpath('//li[@class="child-cate"]/a/@href').extract()
        second_titles = sel.xpath('//li[@class="child-cate"]/a/text()').extract()

        items = []
        # BUG FIX: the original `range(0, len(big_titles) - 1)` silently
        # dropped the last top-level category.
        for i in range(len(big_titles)):
            file_name = base + big_titles[i]
            # Create the per-category directory lazily.
            if not os.path.exists(file_name):
                os.makedirs(file_name)
            for j in range(len(second_titles)):
                # Keep only sub-categories that live under this parent URL.
                if second_urls[j].startswith(big_urls[i]):
                    item = WandoujiawangyeItem()
                    item['parent_url'] = big_urls[i]
                    item['parent_title'] = big_titles[i]
                    second_file_name = file_name + '/' + second_titles[j]
                    # BUG FIX: os.path.extists -> os.path.exists
                    # (the typo raised AttributeError at runtime).
                    if not os.path.exists(second_file_name):
                        os.makedirs(second_file_name)
                    item['second_url'] = second_urls[j]
                    item['second_title'] = second_titles[j]
                    item['path'] = second_file_name
                    items.append(item)

        for item in items:
            # BUG FIX: the original passed `callbck=` — not a Request keyword.
            # Scrapy then fell back to the default callback (self.parse), so
            # second_parse was never invoked: this is why only big_titles
            # ever came out.
            yield Request(url=item['second_url'], meta={'item_1': item},
                          callback=self.second_parse, dont_filter=True)

    def second_parse(self, response):
        """Follow a sub-category page and schedule its detail pages."""
        sel = Selector(response)
        item_1 = response.meta['item_1']
        items = []
        all_urls = sel.xpath('//a/@href').extract()
        for url in all_urls:
            # Detail pages end in .shtml and live under the parent category.
            if url.endswith('.shtml') and url.startswith(item_1['parent_url']):
                item = WandoujiawangyeItem()
                item['parent_title'] = item_1['parent_title']
                item['parent_url'] = item_1['parent_url']
                item['second_url'] = item_1['second_url']
                item['second_title'] = item_1['second_title']
                item['path'] = item_1['path']
                item['link_url'] = url
                items.append(item)

        for item in items:
            yield Request(url=item['link_url'], meta={'item_2': item},
                          callback=self.detail_parse, dont_filter=True)

    def detail_parse(self, response):
        """Extract title and description text from a single detail page."""
        sel = Selector(response)
        # BUG FIX: response.mata -> response.meta (AttributeError).
        item = response.meta['item_2']
        # BUG FIX: .extract() is required here; the original iterated raw
        # Selector objects and `content += content_one` raised TypeError
        # (cannot concatenate str and Selector).
        head = sel.xpath('//span[@class="title"]/text()').extract()
        content_list = sel.xpath('//p[@class="desc-info"]/p/text()').extract()
        content = "".join(content_list)
        item['head'] = head
        item['content'] = content
        yield item

为什么我只爬得出big_titles出来?好像不能和第二级建立连接,second_titles等都出不来,哪里有问题吗?在线等,求救!

`

怪我咯
怪我咯

走同样的路,发现不同的人生

全部回复(0)
热门教程
更多>
最新下载
更多>
网站特效
网站源码
网站素材
前端模板
关于我们 免责申明 意见反馈 讲师合作 广告合作 最新更新
php中文网:公益在线php培训,帮助PHP学习者快速成长!
关注服务号 技术交流群
PHP中文网订阅号
每天精选资源文章推送
PHP中文网APP
随时随地碎片化学习
PHP中文网抖音号
发现有趣的

Copyright 2014-2025 https://www.php.cn/ All Rights Reserved | php.cn | 湘ICP备2023035733号