Using the following code, I can get Scrapy to crawl the pages of a site, parse each page, and return the result of each page parse as an item for processing in a pipeline.
My issue is that I cannot work out how to process the start_url page: the start_url response never gets passed to the parse_item function.
What am I missing?
class genericspider(CrawlSpider):
    """Generic crawl spider configured at construction time.

    The start URL and a ``source`` value are passed to the constructor.
    The single crawl rule follows every extracted link whose extension is
    not in ``ignored_extensions`` and hands each of those responses to
    ``parse_item``.

    Note: the rule callback applies to extracted links only; as written,
    the response for the start URL itself is never passed to ``parse_item``.
    """

    name = "generic"
    allowed_domains = []
    start_urls = []

    # Extensions of links that the link extractor must not return.
    ignored_extensions = [
        # images
        'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp',
        'tif', 'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg',
        # audio
        'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',
        # office suites
        'xls', 'xlsx', 'ppt', 'pptx', 'doc', 'docx', 'odt', 'ods', 'odg',
        'odp',
        # other
        'css', 'exe', 'bin', 'rss', 'zip', 'rar',
    ]

    rules = [
        Rule(
            LinkExtractor(deny_extensions=ignored_extensions),
            follow=True,
            callback='parse_item',
        )
    ]

    def __init__(self, start_url, source, *args, **kwargs):
        """Initialise the spider for one site.

        Args:
            start_url: URL the crawl starts from; its hostname becomes the
                only allowed domain.
            source: Value stored on the spider as ``self.source``.
            *args: Forwarded to the parent constructor.
            **kwargs: Forwarded to the parent constructor and to
                ``bootstrap.init``.
        """
        super(genericspider, self).__init__(*args, **kwargs)

        # set common settings
        bootstrap.init(self, kwargs)

        self.source = source
        self.start_urls = [start_url]
        # Restrict the crawl to the host of the start URL.
        self.allowed_domains = [urlparse.urlparse(start_url).hostname]

    def parse_item(self, response):
        """Process the response and return the item."""
        # process response , return item ....
        ...
You'll want to define parse_start_url; the following should do it:
class genericspider(CrawlSpider):
    """Generic crawl spider that also parses its start URL.

    Only the parts relevant to the fix are shown; elided code is marked
    with ``...``.
    """

    name = "generic"
    allowed_domains = []
    start_urls = []
    ...

    def parse_item(self, response):
        """Process the response and return the item."""
        # process response , return item ....
        ...

    # CrawlSpider calls parse_start_url for the responses of start_urls;
    # aliasing it to parse_item sends the start page through the same
    # handler as every crawled page.
    parse_start_url = parse_item
Comments
Post a Comment