diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..4f7ea0f --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,54 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..c4e835a --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/phpBB-forum-scraper.iml b/.idea/phpBB-forum-scraper.iml new file mode 100644 index 0000000..d0876a7 --- /dev/null +++ b/.idea/phpBB-forum-scraper.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/README.md b/README.md index 0b173e9..67d1fdf 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Python-based web scraper for phpBB forums. Project can be used as a template for building your own custom Scrapy spiders or for one-off crawls on designated forums. Please keep in mind that aggressive crawls -can contribute significant strain on web servers, so please throttle your request rates. +can produce significant strain on web servers, so please throttle your request rates. ## Requirements: diff --git a/phpBB_scraper/phpBB_scraper/settings.py b/phpBB_scraper/phpBB_scraper/settings.py index 2f9a98b..ba0c4ee 100644 --- a/phpBB_scraper/phpBB_scraper/settings.py +++ b/phpBB_scraper/phpBB_scraper/settings.py @@ -26,7 +26,7 @@ ROBOTSTXT_OBEY = True # Configure a delay for requests for the same website (default: 0) # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs -DOWNLOAD_DELAY = 1.0 +DOWNLOAD_DELAY = 3.0 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 diff --git a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py index d398af5..f76b711 100644 --- a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py +++ b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py @@ -1,35 +1,50 @@ # -*- coding: utf-8 -*- import re +import json import scrapy from bs4 import BeautifulSoup from scrapy.http import Request +# TODO: Please provide values for the following variables +# Domains only, no urls +ALLOWED_DOMAINS = [''] +# Starting urls +START_URLS = [''] +# Is login required? True or False. +FORM_LOGIN = False +# Login username +USERNAME = '' +# Login password +PASSWORD = '' +# Login url +LOGIN_URL = '' + class PhpbbSpider(scrapy.Spider): name = 'phpBB' - # Domain only, no urls - allowed_domains = [''] - start_urls = [''] - username = '' - password = '' - # False if you don't need to login, True if you do. - form_login = False - + allowed_domains = ALLOWED_DOMAINS + start_urls = START_URLS + form_login = FORM_LOGIN + if form_login is True: + username = USERNAME + password = PASSWORD + login_url = LOGIN_URL + start_urls.insert(0, login_url) + def parse(self, response): # LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN if self.form_login: + formxpath = '//*[contains(@action, "login")]' formdata = {'username': self.username, 'password': self.password} - form_request = [ - scrapy.FormRequest.from_response( + form_request = scrapy.FormRequest.from_response( response, formdata=formdata, + formxpath=formxpath, callback=self.after_login, - dont_click=True - ) - ] + dont_click=False + ) yield form_request - return else: # REQUEST SUB-FORUM TITLE LINKS links = response.xpath('//a[@class="forumtitle"]/@href').extract() @@ -79,7 +94,7 @@ class PhpbbSpider(scrapy.Spider): # COLLECT FORUM POST DATA usernames = response.xpath('//p[@class="author"]//a[@class="username"]//text()').extract() post_counts = response.xpath('//dd[@class="profile-posts"]//a/text()').extract() - post_times = response.xpath('//p[@class="author"]/text()').extract() + post_times = response.xpath('//div[@class="postbody"]//time/@datetime').extract() post_texts = response.xpath('//div[@class="postbody"]//div[@class="content"]').extract() post_quotes = [self.clean_quote(s) for s in post_texts] post_texts = [self.clean_text(s) for s in post_texts]