mirror of https://github.com/NohamR/phpBB-forum-scraper.git
synced 2026-02-22 02:25:43 +00:00
Merge pull request #6 from Dascienz/dev
Debugged FormRequest for Authentication
.idea/inspectionProfiles/Project_Default.xml (generated, new file, 54 lines)
@@ -0,0 +1,54 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="HtmlUnknownTag" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="myValues">
+        <value>
+          <list size="7">
+            <item index="0" class="java.lang.String" itemvalue="nobr" />
+            <item index="1" class="java.lang.String" itemvalue="noembed" />
+            <item index="2" class="java.lang.String" itemvalue="comment" />
+            <item index="3" class="java.lang.String" itemvalue="noscript" />
+            <item index="4" class="java.lang.String" itemvalue="embed" />
+            <item index="5" class="java.lang.String" itemvalue="script" />
+            <item index="6" class="java.lang.String" itemvalue="body" />
+          </list>
+        </value>
+      </option>
+      <option name="myCustomValuesEnabled" value="true" />
+    </inspection_tool>
+    <inspection_tool class="PyBroadExceptionInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
+    <inspection_tool class="PyMethodOverridingInspection" enabled="false" level="WARNING" enabled_by_default="false" />
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="11">
+            <item index="0" class="java.lang.String" itemvalue="pyyaml" />
+            <item index="1" class="java.lang.String" itemvalue="scipy" />
+            <item index="2" class="java.lang.String" itemvalue="tornado" />
+            <item index="3" class="java.lang.String" itemvalue="pymongo" />
+            <item index="4" class="java.lang.String" itemvalue="scikit-learn" />
+            <item index="5" class="java.lang.String" itemvalue="apscheduler" />
+            <item index="6" class="java.lang.String" itemvalue="numpy" />
+            <item index="7" class="java.lang.String" itemvalue="redis" />
+            <item index="8" class="java.lang.String" itemvalue="bcrypt" />
+            <item index="9" class="java.lang.String" itemvalue="pandas" />
+            <item index="10" class="java.lang.String" itemvalue="bson" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="E722" />
+        </list>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
+      <option name="processCode" value="true" />
+      <option name="processLiterals" value="true" />
+      <option name="processComments" value="true" />
+    </inspection_tool>
+  </profile>
+</component>
.idea/inspectionProfiles/profiles_settings.xml (generated, new file, 6 lines)
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
.idea/modules.xml (generated, new file, 8 lines)
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/phpBB-forum-scraper.iml" filepath="$PROJECT_DIR$/.idea/phpBB-forum-scraper.iml" />
+    </modules>
+  </component>
+</project>
.idea/phpBB-forum-scraper.iml (generated, new file, 8 lines)
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
.idea/vcs.xml (generated, new file, 6 lines)
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
@@ -2,7 +2,7 @@
 
 Python-based web scraper for phpBB forums. Project can be used as a template for building your own
 custom Scrapy spiders or for one-off crawls on designated forums. Please keep in mind that aggressive crawls
-can contribute significant strain on web servers, so please throttle your request rates.
+can produce significant strain on web servers, so please throttle your request rates.
 
 
 ## Requirements:
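The README's request to throttle aggressive crawls can also be honored per spider rather than in the shared settings file. Below is a minimal sketch using Scrapy's custom_settings class attribute; the spider name and the values are illustrative assumptions, not part of this commit:

import scrapy

# Illustrative sketch: per-spider throttling via Scrapy's custom_settings.
# The class name and values below are assumptions, not taken from this repository.
class PoliteSpider(scrapy.Spider):
    name = 'phpBB_polite'
    custom_settings = {
        'DOWNLOAD_DELAY': 3.0,                # seconds between requests to the same domain
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,  # one request in flight per forum
        'ROBOTSTXT_OBEY': True,
    }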
@@ -26,7 +26,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 1.0
+DOWNLOAD_DELAY = 3.0
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
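The comment "See also autothrottle settings and docs" refers to Scrapy's AutoThrottle extension, which this commit leaves unconfigured. A minimal sketch of enabling it in settings.py follows; the option names are standard Scrapy settings, but the values are assumptions chosen to match the new DOWNLOAD_DELAY:

# Illustrative sketch -- not part of this commit. AutoThrottle adjusts the delay
# dynamically from observed response latencies instead of using a fixed value.
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 3.0         # initial delay, matching DOWNLOAD_DELAY above
AUTOTHROTTLE_MAX_DELAY = 60.0          # upper bound when the server responds slowly
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # aim for one request in flight per remote server
AUTOTHROTTLE_DEBUG = False             # set to True to log every throttling decision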
@@ -1,35 +1,50 @@
 # -*- coding: utf-8 -*-
 import re
+import json
 import scrapy
 from bs4 import BeautifulSoup
 from scrapy.http import Request
 
+# TODO: Please provide values for the following variables
+# Domains only, no urls
+ALLOWED_DOMAINS = ['']
+# Starting urls
+START_URLS = ['']
+# Is login required? True or False.
+FORM_LOGIN = False
+# Login username
+USERNAME = ''
+# Login password
+PASSWORD = ''
+# Login url
+LOGIN_URL = ''
+
 
 class PhpbbSpider(scrapy.Spider):
 
     name = 'phpBB'
-    # Domain only, no urls
-    allowed_domains = ['']
-    start_urls = ['']
-    username = ''
-    password = ''
-    # False if you don't need to login, True if you do.
-    form_login = False
+    allowed_domains = ALLOWED_DOMAINS
+    start_urls = START_URLS
+    form_login = FORM_LOGIN
+    if form_login is True:
+        username = USERNAME
+        password = PASSWORD
+        login_url = LOGIN_URL
+        start_urls.insert(0, login_url)
 
     def parse(self, response):
         # LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
         if self.form_login:
+            formxpath = '//*[contains(@action, "login")]'
             formdata = {'username': self.username, 'password': self.password}
-            form_request = [
-                scrapy.FormRequest.from_response(
-                    response,
-                    formdata=formdata,
-                    callback=self.after_login,
-                    dont_click=True
-                )
-            ]
+            form_request = scrapy.FormRequest.from_response(
+                response,
+                formdata=formdata,
+                formxpath=formxpath,
+                callback=self.after_login,
+                dont_click=False
+            )
             yield form_request
-            return
         else:
             # REQUEST SUB-FORUM TITLE LINKS
             links = response.xpath('//a[@class="forumtitle"]/@href').extract()
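The fix above yields a single FormRequest instead of a list-wrapped one (Scrapy callbacks must yield Request or item objects, not lists), points from_response at the login form explicitly via formxpath so it cannot pick up another form such as the search box, and switches to dont_click=False so the form is submitted through its button, presumably so phpBB receives the submit field it expects. The after_login callback it targets sits outside this diff; the following is a hypothetical sketch of what such a method on PhpbbSpider might check before resuming the crawl, with the error XPath and messages being assumptions:

    # Hypothetical sketch -- after_login exists in the spider but is outside this diff;
    # the failure check and follow-up below are assumptions, not the repository's code.
    def after_login(self, response):
        if response.xpath('//div[@class="error"]/text()').extract():
            self.logger.error('Login failed; check USERNAME and PASSWORD.')
            return
        # On success, continue like an anonymous crawl: follow sub-forum title links.
        for href in response.xpath('//a[@class="forumtitle"]/@href').extract():
            yield response.follow(href, callback=self.parse)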
@@ -79,7 +94,7 @@ class PhpbbSpider(scrapy.Spider):
         # COLLECT FORUM POST DATA
         usernames = response.xpath('//p[@class="author"]//a[@class="username"]//text()').extract()
         post_counts = response.xpath('//dd[@class="profile-posts"]//a/text()').extract()
-        post_times = response.xpath('//p[@class="author"]/text()').extract()
+        post_times = response.xpath('//div[@class="postbody"]//time/@datetime').extract()
         post_texts = response.xpath('//div[@class="postbody"]//div[@class="content"]').extract()
         post_quotes = [self.clean_quote(s) for s in post_texts]
         post_texts = [self.clean_text(s) for s in post_texts]
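The hunk above ends before the lines that combine these parallel lists. A sketch of the common pattern for zipping such XPath results into scraped items is shown below; the field names are assumptions and may not match the repository's actual output, and note that post_times now holds the datetime attribute of the post's time element rather than free-text author lines:

        # Illustrative sketch -- the code that consumes these lists sits outside this diff,
        # and the field names here are assumptions rather than the repository's schema.
        for user, count, when, quote, text in zip(usernames, post_counts, post_times, post_quotes, post_texts):
            yield {
                'username': user.strip(),
                'post_count': count.strip(),
                'post_time': when,   # datetime attribute extracted above
                'quote': quote,
                'text': text,
            }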