diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..4f7ea0f
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,54 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+  </profile>
+</component>
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..c4e835a
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/phpBB-forum-scraper.iml" filepath="$PROJECT_DIR$/.idea/phpBB-forum-scraper.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/phpBB-forum-scraper.iml b/.idea/phpBB-forum-scraper.iml
new file mode 100644
index 0000000..d0876a7
--- /dev/null
+++ b/.idea/phpBB-forum-scraper.iml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/README.md b/README.md
index 0b173e9..67d1fdf 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 Python-based web scraper for phpBB forums. Project can be used as a template for building your own
 custom Scrapy spiders or for one-off crawls on designated forums. Please keep in mind that aggressive crawls
-can contribute significant strain on web servers, so please throttle your request rates.
+can put significant strain on web servers, so please throttle your request rates.
 
 ## Requirements:
 
diff --git a/phpBB_scraper/phpBB_scraper/settings.py b/phpBB_scraper/phpBB_scraper/settings.py
index 2f9a98b..ba0c4ee 100644
--- a/phpBB_scraper/phpBB_scraper/settings.py
+++ b/phpBB_scraper/phpBB_scraper/settings.py
@@ -26,7 +26,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 1.0
+DOWNLOAD_DELAY = 3.0
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
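
Note: the DOWNLOAD_DELAY bump above pairs naturally with Scrapy's AutoThrottle extension, which the comment block already points to. A minimal sketch of the complementary settings.py entries, with illustrative values rather than anything this patch prescribes:

    # Adaptive throttling (values are illustrative; tune per target site)
    AUTOTHROTTLE_ENABLED = True            # enable the AutoThrottle extension
    AUTOTHROTTLE_START_DELAY = 3.0         # initial delay, matching DOWNLOAD_DELAY
    AUTOTHROTTLE_MAX_DELAY = 60.0          # ceiling for the adaptive delay
    AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # aim for one request in flight per server
    CONCURRENT_REQUESTS_PER_DOMAIN = 1     # hard cap on parallel requests per domain
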
diff --git a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
index d398af5..f76b711 100644
--- a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
+++ b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
@@ -1,35 +1,50 @@
 # -*- coding: utf-8 -*-
 import re
+import json
 
 import scrapy
 from bs4 import BeautifulSoup
 from scrapy.http import Request
 
+# TODO: Please provide values for the following variables
+# Domains only, no urls
+ALLOWED_DOMAINS = ['']
+# Starting urls
+START_URLS = ['']
+# Is login required? True or False.
+FORM_LOGIN = False
+# Login username
+USERNAME = ''
+# Login password
+PASSWORD = ''
+# Login url
+LOGIN_URL = ''
+
 
 class PhpbbSpider(scrapy.Spider):
     name = 'phpBB'
-    # Domain only, no urls
-    allowed_domains = ['']
-    start_urls = ['']
-    username = ''
-    password = ''
-    # False if you don't need to login, True if you do.
-    form_login = False
-
+    allowed_domains = ALLOWED_DOMAINS
+    start_urls = START_URLS
+    form_login = FORM_LOGIN
+    if form_login:
+        username = USERNAME
+        password = PASSWORD
+        login_url = LOGIN_URL
+        start_urls.insert(0, login_url)
+
     def parse(self, response):
         # LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
         if self.form_login:
+            formxpath = '//*[contains(@action, "login")]'
             formdata = {'username': self.username, 'password': self.password}
-            form_request = [
-                scrapy.FormRequest.from_response(
+            form_request = scrapy.FormRequest.from_response(
                 response,
                 formdata=formdata,
+                formxpath=formxpath,
                 callback=self.after_login,
-                dont_click=True
-                )
-            ]
+                dont_click=False
+            )
             yield form_request
-            return
         else:
             # REQUEST SUB-FORUM TITLE LINKS
             links = response.xpath('//a[@class="forumtitle"]/@href').extract()
@@ -79,7 +94,7 @@ class PhpbbSpider(scrapy.Spider):
         # COLLECT FORUM POST DATA
         usernames = response.xpath('//p[@class="author"]//a[@class="username"]//text()').extract()
         post_counts = response.xpath('//dd[@class="profile-posts"]//a/text()').extract()
-        post_times = response.xpath('//p[@class="author"]/text()').extract()
+        post_times = response.xpath('//div[@class="postbody"]//time/@datetime').extract()
         post_texts = response.xpath('//div[@class="postbody"]//div[@class="content"]').extract()
         post_quotes = [self.clean_quote(s) for s in post_texts]
         post_texts = [self.clean_text(s) for s in post_texts]
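
For context on the login refactor in parse() above: scrapy.FormRequest.from_response locates a form in the fetched page (here via formxpath) and submits it. A self-contained sketch of the same pattern, using a hypothetical board URL and placeholder credentials:

    # Sketch of the login flow used above; domain, URL, and credentials are placeholders.
    import scrapy

    class LoginExampleSpider(scrapy.Spider):
        name = 'login_example'
        allowed_domains = ['forum.example.com']
        start_urls = ['https://forum.example.com/ucp.php?mode=login']

        def parse(self, response):
            # Find the form whose action contains "login" and post the credentials.
            yield scrapy.FormRequest.from_response(
                response,
                formxpath='//*[contains(@action, "login")]',
                formdata={'username': 'user', 'password': 'pass'},
                callback=self.after_login,
            )

        def after_login(self, response):
            # The session cookie set during login carries over to later requests.
            self.logger.info('Logged in; landed on %s', response.url)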
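
The revised post_times XPath pulls machine-readable @datetime attributes from the <time> elements in each postbody instead of scraping free-form author-line text. Assuming the board's template emits ISO 8601 timestamps there (style-dependent), a sketch of normalizing the extracted values:

    # Sketch: convert extracted @datetime strings to datetime objects.
    # Assumes ISO 8601 values such as '2018-05-04T12:30:00+00:00'.
    from datetime import datetime

    def parse_post_time(raw):
        # Truncate the UTC offset and parse the date/time portion.
        return datetime.strptime(raw[:19], '%Y-%m-%dT%H:%M:%S')

    print(parse_post_time('2018-05-04T12:30:00+00:00'))  # -> 2018-05-04 12:30:00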