# -*- encoding: utf-8 -*-

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.loader.processor import TakeFirst
from scrapy.contrib.loader import XPathItemLoader
from scrapy.selector import HtmlXPathSelector
from ..items import DatamosruItem

class DatamosruLoader(XPathItemLoader):
    """XPath item loader that applies ``TakeFirst`` to every field by default.

    No per-field processors are declared here, so ``TakeFirst`` is the output
    processor for any field loaded through this class.
    NOTE(review): per Scrapy's processor docs, ``TakeFirst`` should collapse
    the collected values of a field to a single value -- confirm against the
    Scrapy version in use.
    """
    default_output_processor = TakeFirst()

class DatamosruSpider(CrawlSpider):
    """Crawl spider for dataset pages on data.mos.ru.

    Crawling starts at ``start_urls``. Links whose URL matches the
    ``/datasets/<digit>`` pattern are passed to :meth:`parse_item`, which
    builds one ``DatamosruItem`` from each such response.
    """

    name = "datamosru"
    allowed_domains = ["data.mos.ru"]
    start_urls = ["http://data.mos.ru/datasets/param/"]

    rules = (
        # Raw string: ``\d`` must reach the regex engine as-is. In a plain
        # string literal it is an unrecognised escape sequence, which newer
        # Python versions warn about.
        Rule(SgmlLinkExtractor(allow=(r'/datasets/\d',)), callback='parse_item'),
    )

    # Item fields read from the <dd> children of <dl>, listed in position
    # order: the first name is taken from dd[1], the second from dd[2], ...
    _DD_FIELDS = (
        'description',
        'category',
        'date',
        'update',
        'format',
        'geodata',
    )

    def parse_item(self, response):
        """Build a ``DatamosruItem`` from one matched response.

        Each field in ``_DD_FIELDS`` is filled from the text of the ``<dd>``
        at the matching 1-based position inside ``<dl>``. ``url`` is the
        response URL and ``code`` is derived from that URL.

        :param response: the response handed over by the crawl rule.
        :returns: the item produced by ``DatamosruLoader.load_item()``.
        :raises IndexError: if the URL has fewer than five "/"-separated
            chunks.
        """
        selector = HtmlXPathSelector(response)
        loader = DatamosruLoader(DatamosruItem(), selector)

        # enumerate(..., 1) because XPath positions are 1-based.
        for position, field in enumerate(self._DD_FIELDS, 1):
            loader.add_xpath(field, "//dl/dd[%d]/text()" % position)

        loader.add_value('url', response.url)

        # Fifth "/"-separated chunk of the URL, cut at the first underscore.
        # For "http://host/datasets/<x>_<y>" this is "<x>".
        # NOTE(review): presumably the dataset identifier; this relies on
        # "/datasets/" being the first path component -- confirm.
        url_chunk = response.url.split("/")[4]
        loader.add_value('code', url_chunk.split("_")[0])

        return loader.load_item()