# Untitled
# (paste-site header: raw / download / clone; format: TEXT; views: 28; size: 3548 b)
import scrapy
import logging
from scrapy.spiders import CrawlSpider
from scrapy.http import Request
from scrapy.loader import ItemLoader
from learning_scrapy.items import Product, ProductLoader

logger = logging.getLogger(__name__)


class MySpider(CrawlSpider):
    """Scrape one Rozetka product page and the tab pages linked from it.

    Flow: ``start_requests`` fetches the product page carrying an empty
    ``Product`` in ``meta['product']``; ``parse_links`` reads the tab bar
    and follows each enabled tab with the handler registered for its
    position in ``links_handlers``; the handler fills the item and
    returns it.
    """

    name = 'test'
    # Every request of this spider targets rozetka.com.ua.  The previous
    # value ('example.com') makes Scrapy's offsite filtering treat all of
    # them as off-site.
    allowed_domains = ['rozetka.com.ua']

    # Positions (within the tab bar) of the tabs that are actually crawled.
    # Only the first tab ("about") is enabled for now.
    enabled_tabs = frozenset({0})

    def start_requests(self):
        """Yield the initial request for the product page.

        An empty ``Product`` travels in ``meta['product']`` so that the
        tab callbacks have an item to populate.
        """
        about_url = "https://rozetka.com.ua/apple_mqd32ua_a/p17929266/"
        yield Request(
            about_url,
            callback=self.parse_links,
            meta={'product': Product()},
        )

    def parse_links(self, response):
        """Follow every enabled product tab that has a registered handler.

        Yields one request per tab whose position is in ``enabled_tabs``
        and present in ``links_handlers``.  The product item is forwarded
        in ``meta['product']`` because the handlers read it from there.
        """
        product = response.meta.get('product')
        if product is None:
            # Called without the meta set by start_requests: start fresh
            # instead of failing later inside the tab handler.
            product = Product()

        hrefs = response.xpath(
            '//li[@class="product-tabs__item"]/a/@href').getall()
        logger.debug('Found %d product tab links on %s',
                     len(hrefs), response.url)

        for index, href in enumerate(hrefs):
            if index not in self.enabled_tabs:
                continue
            handler = self.links_handlers.get(index)
            if handler is None:
                logger.warning('No handler registered for tab %d (%s)',
                               index, href)
                continue
            # links_handlers stores plain functions (it is built inside the
            # class body), so bind the function to this spider before using
            # it as a callback.  Binding here, once per iteration, also
            # avoids a closure over the loop variable.
            callback = handler.__get__(self, type(self))
            # NOTE(review): every followed tab receives the same item
            # object; revisit if more than one tab is enabled at a time.
            yield response.follow(
                href,  # response.follow resolves relative hrefs
                callback=callback,
                meta={'product': product},
                # The first tab presumably points back at the page that was
                # just fetched; without dont_filter the duplicate filter
                # would drop the request.
                dont_filter=True,
            )

    def parse_about(self, response):
        """Fill ``name`` and ``about`` of the product from the main tab.

        Reads the item from ``response.meta['product']`` and returns it
        after loading the two fields.
        """
        logger.debug('parse_about meta: %s', response.meta)
        product = ItemLoader(item=response.meta['product'], response=response)
        name = response.xpath(
            '//div[@class="product__heading"]'
            '/h1[@class="product__title"]/text()').get()
        about = response.xpath('//p[@class="product-about_brief"]/text()').get()
        product.add_value('name', name)
        product.add_value('about', about)
        return product.load_item()

    def parse_specifications(self, response):
        """Fill ``specification`` of the product from the characteristics tab.

        The characteristic names and values are handed to ``ProductLoader``
        as the ``spec_keys`` / ``spec_values`` keyword arguments.
        """
        spec_keys = response.xpath(
            '//dl[@class="product-characteristics__list"]'
            '/dt/span/text()').getall()
        spec_values = response.xpath(
            '//dl[@class="product-characteristics__list"]'
            '/dd/ul/li//text()').getall()
        product = ProductLoader(item=response.meta['product'],
                                spec_keys=spec_keys, spec_values=spec_values,
                                response=response)
        # Presumably ProductLoader builds the real mapping from spec_keys /
        # spec_values and the empty dict is only a seed -- TODO confirm
        # against ProductLoader.
        product.add_value('specification', {})
        return product.load_item()

    # Tab position -> handler.  The values are plain functions because the
    # dict is created inside the class body; parse_links binds them.
    # TODO: add handlers for the remaining tabs (2: comments, 3: photo,
    # 5: accessories).
    links_handlers = {
        0: parse_about,
        1: parse_specifications,
    }
# (paste-site footer: close fullscreen)
# Login or Register to edit or fork this paste. It's free.