mirror of
https://github.com/NohamR/phpBB-forum-scraper.git
synced 2026-02-22 02:25:43 +00:00
Added clarifiers and fixed the return
This commit is contained in:
@@ -7,10 +7,12 @@ from scrapy.http import Request
|
||||
class PhpbbSpider(scrapy.Spider):
|
||||
|
||||
name = 'phpBB'
|
||||
# Domain only, no URLs
|
||||
allowed_domains = ['']
|
||||
start_urls = ['']
|
||||
username = ''
|
||||
password = ''
|
||||
# False if you don't need to log in, True if you do.
|
||||
form_login = False
|
||||
|
||||
def parse(self, response):
|
||||
@@ -22,7 +24,8 @@ class PhpbbSpider(scrapy.Spider):
|
||||
formdata=formdata,
|
||||
callback=self.after_login,
|
||||
dont_click=True)]
|
||||
return form_request
|
||||
yield form_request
|
||||
return
|
||||
else:
|
||||
# REQUEST SUB-FORUM TITLE LINKS
|
||||
links = response.xpath('//a[@class="forumtitle"]/@href').extract()
|
||||
@@ -88,4 +91,4 @@ class PhpbbSpider(scrapy.Spider):
|
||||
# CLICK THROUGH NEXT PAGE
|
||||
next_link = response.xpath('//li[@class="next"]//a[@rel="next"]/@href').extract_first()
|
||||
if next_link:
|
||||
yield scrapy.Request(response.urljoin(next_link), callback=self.parse_posts)
|
||||
yield scrapy.Request(response.urljoin(next_link), callback=self.parse_posts)
|
||||
|
||||
Reference in New Issue
Block a user