Парсинг сайтов с помощью фреймворка Scrapy
Добавлено: 03 янв 2013, 14:55
Парсинг сайтов с помощью фреймворка Scrapy
Написал статью, просьба покритиковать.
Написал статью, просьба покритиковать.
Геоинформационные системы (ГИС) и Дистанционное зондирование Земли
https://gis-lab.info/forum/
Код: Выделить всё
E:\python>c:\python27\scripts\scrapy.exe startproject name
Traceback (most recent call last):
File "C:\Python27\lib\runpy.py", line 162, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "C:\Python27\lib\runpy.py", line 72, in _run_code
exec code in run_globals
File "c:\python27\scripts\scrapy.exe\__main__.py", line 9, in <module>
File "C:\Python27\lib\site-packages\scrapy\cmdline.py", line 122, in execute
cmds = _get_commands_dict(settings, inproject)
File "C:\Python27\lib\site-packages\scrapy\cmdline.py", line 46, in _get_commands_dict
cmds = _get_commands_from_module('scrapy.commands', inproject)
File "C:\Python27\lib\site-packages\scrapy\cmdline.py", line 29, in _get_commands_from_module
for cmd in _iter_command_classes(module):
File "C:\Python27\lib\site-packages\scrapy\cmdline.py", line 20, in _iter_command_classes
for module in walk_modules(module_name):
File "C:\Python27\lib\site-packages\scrapy\utils\misc.py", line 71, in walk_modules
submod = import_module(fullpath)
File "C:\Python27\lib\importlib\__init__.py", line 37, in import_module
__import__(name)
File "C:\Python27\lib\site-packages\scrapy\commands\version.py", line 6, in <module>
import OpenSSL
File "C:\Python27\lib\site-packages\OpenSSL\__init__.py", line 8, in <module>
from OpenSSL import rand, crypto, SSL
File "C:\Python27\lib\site-packages\OpenSSL\rand.py", line 11, in <module>
from OpenSSL._util import (
File "C:\Python27\lib\site-packages\OpenSSL\_util.py", line 6, in <module>
from cryptography.hazmat.bindings.openssl.binding import Binding
ImportError: No module named cryptography.hazmat.bindings.openssl.binding
Код: Выделить всё
<ul class="list-unstyled">
<li><b>Продолжительность:</b> 97 мин.</li><br>
<li><b>Жанр:</b> <a href="/film/al/" itemprop="genre">Мелодрама</a>.</li>
<li><b>Страна:</b> <a href="/film/fo/">Россия</a>.</li>
<li><b>Режиссер:</b> <a href="/star/68166/">Дмитрий Константинов</a>.</li>
<li><b>Сценарий:</b> <a href="/star/68166/">Дмитрий Константинов</a>.</li>
<li><b>Продюсеры:</b> <a href="/star/22142/">Рубен Дишдишян</a>.</li>
<li><b>Оператор:</b> <a href="/star/137587/">Андрей Гуркин</a>.</li>
<li><b>Художник:</b> <a href="/star/3339230/">Игорь Тихонов</a>
</ul>
Код: Выделить всё
sel.xpath('//div[@class="col-xs-10 col-md-8"]/ul[@class="list-unstyled"]/li[1]/text()').extract()
Код: Выделить всё
sel.xpath('//div[@class="col-xs-10 col-md-8"]/ul[@class="list-unstyled"]/li/b[text() = "%s"]' % u"Продолжительность:" ).extract()
Код: Выделить всё
sel.xpath('//div[@class="col-xs-10 col-md-8"]/ul[@class="list-unstyled"]/li/b[text() = "%s"]/following-sibling::text()[1]' % u"Продолжительность:" ).extract()
Код: Выделить всё
# Build a lookup table from cities.txt: each CSV line looks like
# "<replacement>,<key>,..." and we map field 1 -> field 0.
# Fixed from the original: the Python-2-only `ur''` prefix is a syntax
# error on Python 3, the file handle was never closed, and the
# `split(',')[1:2][0]` slicing obscured a plain index access.
cities = {}
with open('cities.txt', 'r') as f:
    for line in f:
        parts = line.split(',')
        # NOTE(review): like the original, this raises IndexError on a
        # line without a comma -- presumably the file is well-formed.
        cities[parts[1]] = parts[0]
class RabotaPipeline(object):
    """Scrapy item pipeline that normalizes the 'strana' (country) field.

    Replaces the scraped location string with its canonical value from
    the module-level ``cities`` mapping built from cities.txt.
    """

    def process_item(self, item, spider):
        """Return *item* with item['strana'] mapped through ``cities``.

        Fixed from the original: the ``global cities`` statement was
        unnecessary (read-only access finds the module global anyway),
        and the dead ``else: pass`` branch plus duplicated ``return item``
        collapsed into a single return.
        """
        if item['strana']:
            # KeyError here means the scraped name is missing from
            # cities.txt -- presumably the file covers all inputs; verify.
            item['strana'] = cities[item['strana']]
        return item
Код: Выделить всё
l.add_xpath('strana', '//div[@id="job-details"]/*/span[@itemprop="jobLocation"]/text()')
Код: Выделить всё
/usr/lib/python3.4/importlib/_bootstrap.py:321: ScrapyDeprecationWarning: Module `scrapy.contrib.spiders` is deprecated, use `scrapy.spiders` instead
return f(*args, **kwds)
/usr/lib/python3.4/importlib/_bootstrap.py:321: ScrapyDeprecationWarning: Module `scrapy.contrib.linkextractors` is deprecated, use `scrapy.linkextractors` instead
return f(*args, **kwds)
/usr/lib/python3.4/importlib/_bootstrap.py:321: ScrapyDeprecationWarning: Module `scrapy.contrib.linkextractors.sgml` is deprecated, use `scrapy.linkextractors.sgml` instead
return f(*args, **kwds)
Traceback (most recent call last):
File "/usr/local/bin/scrapy", line 11, in <module>
sys.exit(execute())
File "/usr/local/lib/python3.4/dist-packages/scrapy/cmdline.py", line 141, in execute
cmd.crawler_process = CrawlerProcess(settings)
File "/usr/local/lib/python3.4/dist-packages/scrapy/crawler.py", line 238, in __init__
super(CrawlerProcess, self).__init__(settings)
File "/usr/local/lib/python3.4/dist-packages/scrapy/crawler.py", line 129, in __init__
self.spider_loader = _get_spider_loader(settings)
File "/usr/local/lib/python3.4/dist-packages/scrapy/crawler.py", line 325, in _get_spider_loader
return loader_cls.from_settings(settings.frozencopy())
File "/usr/local/lib/python3.4/dist-packages/scrapy/spiderloader.py", line 33, in from_settings
return cls(settings)
File "/usr/local/lib/python3.4/dist-packages/scrapy/spiderloader.py", line 20, in __init__
self._load_all_spiders()
File "/usr/local/lib/python3.4/dist-packages/scrapy/spiderloader.py", line 28, in _load_all_spiders
for module in walk_modules(name):
File "/usr/local/lib/python3.4/dist-packages/scrapy/utils/misc.py", line 71, in walk_modules
submod = import_module(fullpath)
File "/usr/lib/python3.4/importlib/__init__.py", line 109, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 2254, in _gcd_import
File "<frozen importlib._bootstrap>", line 2237, in _find_and_load
File "<frozen importlib._bootstrap>", line 2226, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 1200, in _load_unlocked
File "<frozen importlib._bootstrap>", line 1129, in _exec
File "<frozen importlib._bootstrap>", line 1471, in exec_module
File "<frozen importlib._bootstrap>", line 321, in _call_with_frames_removed
File "/root/orphanage/orphanage/spiders/detskiedomiki.py", line 2, in <module>
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
File "/usr/local/lib/python3.4/dist-packages/scrapy/contrib/linkextractors/sgml.py", line 7, in <module>
from scrapy.linkextractors.sgml import *
File "/usr/local/lib/python3.4/dist-packages/scrapy/linkextractors/sgml.py", line 6, in <module>
from sgmllib3 import SGMLParser
ImportError: No module named 'sgmllib3'
Код: Выделить всё
from scrapy.linkextractors import LinkExtractor