mirror of
https://github.com/NohamR/phpBB-forum-scraper.git
synced 2026-02-22 02:25:43 +00:00
Debugged FormRequest for Authentication
This commit is contained in:
54
.idea/inspectionProfiles/Project_Default.xml
generated
Normal file
54
.idea/inspectionProfiles/Project_Default.xml
generated
Normal file
@@ -0,0 +1,54 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="HtmlUnknownTag" enabled="true" level="WARNING" enabled_by_default="true">
|
||||
<option name="myValues">
|
||||
<value>
|
||||
<list size="7">
|
||||
<item index="0" class="java.lang.String" itemvalue="nobr" />
|
||||
<item index="1" class="java.lang.String" itemvalue="noembed" />
|
||||
<item index="2" class="java.lang.String" itemvalue="comment" />
|
||||
<item index="3" class="java.lang.String" itemvalue="noscript" />
|
||||
<item index="4" class="java.lang.String" itemvalue="embed" />
|
||||
<item index="5" class="java.lang.String" itemvalue="script" />
|
||||
<item index="6" class="java.lang.String" itemvalue="body" />
|
||||
</list>
|
||||
</value>
|
||||
</option>
|
||||
<option name="myCustomValuesEnabled" value="true" />
|
||||
</inspection_tool>
|
||||
<inspection_tool class="PyBroadExceptionInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
|
||||
<inspection_tool class="PyMethodOverridingInspection" enabled="false" level="WARNING" enabled_by_default="false" />
|
||||
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
||||
<option name="ignoredPackages">
|
||||
<value>
|
||||
<list size="11">
|
||||
<item index="0" class="java.lang.String" itemvalue="pyyaml" />
|
||||
<item index="1" class="java.lang.String" itemvalue="scipy" />
|
||||
<item index="2" class="java.lang.String" itemvalue="tornado" />
|
||||
<item index="3" class="java.lang.String" itemvalue="pymongo" />
|
||||
<item index="4" class="java.lang.String" itemvalue="scikit-learn" />
|
||||
<item index="5" class="java.lang.String" itemvalue="apscheduler" />
|
||||
<item index="6" class="java.lang.String" itemvalue="numpy" />
|
||||
<item index="7" class="java.lang.String" itemvalue="redis" />
|
||||
<item index="8" class="java.lang.String" itemvalue="bcrypt" />
|
||||
<item index="9" class="java.lang.String" itemvalue="pandas" />
|
||||
<item index="10" class="java.lang.String" itemvalue="bson" />
|
||||
</list>
|
||||
</value>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
<inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
|
||||
<option name="ignoredErrors">
|
||||
<list>
|
||||
<option value="E722" />
|
||||
</list>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
<inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
|
||||
<option name="processCode" value="true" />
|
||||
<option name="processLiterals" value="true" />
|
||||
<option name="processComments" value="true" />
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
||||
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
||||
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
@@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/phpBB-forum-scraper.iml" filepath="$PROJECT_DIR$/.idea/phpBB-forum-scraper.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
||||
8
.idea/phpBB-forum-scraper.iml
generated
Normal file
8
.idea/phpBB-forum-scraper.iml
generated
Normal file
@@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
||||
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
Python-based web scraper for phpBB forums. This project can be used as a template for building your own
|
||||
custom Scrapy spiders or for one-off crawls on designated forums. Please keep in mind that aggressive crawls
|
||||
can contribute significant strain on web servers, so please throttle your request rates.
|
||||
can produce significant strain on web servers, so please throttle your request rates.
|
||||
|
||||
|
||||
## Requirements:
|
||||
|
||||
@@ -26,7 +26,7 @@ ROBOTSTXT_OBEY = True
|
||||
# Configure a delay for requests for the same website (default: 0)
|
||||
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
|
||||
# See also autothrottle settings and docs
|
||||
DOWNLOAD_DELAY = 1.0
|
||||
DOWNLOAD_DELAY = 3.0
|
||||
# The download delay setting will honor only one of:
|
||||
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||
#CONCURRENT_REQUESTS_PER_IP = 16
|
||||
|
||||
@@ -1,35 +1,50 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import json
|
||||
import scrapy
|
||||
from bs4 import BeautifulSoup
|
||||
from scrapy.http import Request
|
||||
|
||||
# TODO: Please provide values for the following variables
|
||||
# Domains only, no urls
|
||||
ALLOWED_DOMAINS = ['']
|
||||
# Starting urls
|
||||
START_URLS = ['']
|
||||
# Is login required? True or False.
|
||||
FORM_LOGIN = False
|
||||
# Login username
|
||||
USERNAME = ''
|
||||
# Login password
|
||||
PASSWORD = ''
|
||||
# Login url
|
||||
LOGIN_URL = ''
|
||||
|
||||
|
||||
class PhpbbSpider(scrapy.Spider):
|
||||
|
||||
name = 'phpBB'
|
||||
# Domain only, no urls
|
||||
allowed_domains = ['']
|
||||
start_urls = ['']
|
||||
username = ''
|
||||
password = ''
|
||||
# False if you don't need to login, True if you do.
|
||||
form_login = False
|
||||
allowed_domains = ALLOWED_DOMAINS
|
||||
start_urls = START_URLS
|
||||
form_login = FORM_LOGIN
|
||||
if form_login is True:
|
||||
username = USERNAME
|
||||
password = PASSWORD
|
||||
login_url = LOGIN_URL
|
||||
start_urls.insert(0, login_url)
|
||||
|
||||
def parse(self, response):
|
||||
# LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
|
||||
if self.form_login:
|
||||
formxpath = '//*[contains(@action, "login")]'
|
||||
formdata = {'username': self.username, 'password': self.password}
|
||||
form_request = [
|
||||
scrapy.FormRequest.from_response(
|
||||
form_request = scrapy.FormRequest.from_response(
|
||||
response,
|
||||
formdata=formdata,
|
||||
formxpath=formxpath,
|
||||
callback=self.after_login,
|
||||
dont_click=True
|
||||
)
|
||||
]
|
||||
dont_click=False
|
||||
)
|
||||
yield form_request
|
||||
return
|
||||
else:
|
||||
# REQUEST SUB-FORUM TITLE LINKS
|
||||
links = response.xpath('//a[@class="forumtitle"]/@href').extract()
|
||||
@@ -79,7 +94,7 @@ class PhpbbSpider(scrapy.Spider):
|
||||
# COLLECT FORUM POST DATA
|
||||
usernames = response.xpath('//p[@class="author"]//a[@class="username"]//text()').extract()
|
||||
post_counts = response.xpath('//dd[@class="profile-posts"]//a/text()').extract()
|
||||
post_times = response.xpath('//p[@class="author"]/text()').extract()
|
||||
post_times = response.xpath('//div[@class="postbody"]//time/@datetime').extract()
|
||||
post_texts = response.xpath('//div[@class="postbody"]//div[@class="content"]').extract()
|
||||
post_quotes = [self.clean_quote(s) for s in post_texts]
|
||||
post_texts = [self.clean_text(s) for s in post_texts]
|
||||
|
||||
Reference in New Issue
Block a user