Debugged FormRequest for Authentication

This commit is contained in:
David Ascienzo
2020-04-21 18:49:20 -04:00
parent 29dd97420f
commit 29cb84e271
8 changed files with 114 additions and 17 deletions

View File

@@ -0,0 +1,54 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="HtmlUnknownTag" enabled="true" level="WARNING" enabled_by_default="true">
<option name="myValues">
<value>
<list size="7">
<item index="0" class="java.lang.String" itemvalue="nobr" />
<item index="1" class="java.lang.String" itemvalue="noembed" />
<item index="2" class="java.lang.String" itemvalue="comment" />
<item index="3" class="java.lang.String" itemvalue="noscript" />
<item index="4" class="java.lang.String" itemvalue="embed" />
<item index="5" class="java.lang.String" itemvalue="script" />
<item index="6" class="java.lang.String" itemvalue="body" />
</list>
</value>
</option>
<option name="myCustomValuesEnabled" value="true" />
</inspection_tool>
<inspection_tool class="PyBroadExceptionInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
<inspection_tool class="PyMethodOverridingInspection" enabled="false" level="WARNING" enabled_by_default="false" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="11">
<item index="0" class="java.lang.String" itemvalue="pyyaml" />
<item index="1" class="java.lang.String" itemvalue="scipy" />
<item index="2" class="java.lang.String" itemvalue="tornado" />
<item index="3" class="java.lang.String" itemvalue="pymongo" />
<item index="4" class="java.lang.String" itemvalue="scikit-learn" />
<item index="5" class="java.lang.String" itemvalue="apscheduler" />
<item index="6" class="java.lang.String" itemvalue="numpy" />
<item index="7" class="java.lang.String" itemvalue="redis" />
<item index="8" class="java.lang.String" itemvalue="bcrypt" />
<item index="9" class="java.lang.String" itemvalue="pandas" />
<item index="10" class="java.lang.String" itemvalue="bson" />
</list>
</value>
</option>
</inspection_tool>
<inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="E722" />
</list>
</option>
</inspection_tool>
<inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
<option name="processCode" value="true" />
<option name="processLiterals" value="true" />
<option name="processComments" value="true" />
</inspection_tool>
</profile>
</component>

View File

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

8
.idea/modules.xml generated Normal file
View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/phpBB-forum-scraper.iml" filepath="$PROJECT_DIR$/.idea/phpBB-forum-scraper.iml" />
</modules>
</component>
</project>

8
.idea/phpBB-forum-scraper.iml generated Normal file
View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

6
.idea/vcs.xml generated Normal file
View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

View File

@@ -2,7 +2,7 @@
Python-based web scraper for phpBB forums. Project can be used as a template for building your own Python-based web scraper for phpBB forums. Project can be used as a template for building your own
custom Scrapy spiders or for one-off crawls on designated forums. Please keep in mind that aggressive crawls custom Scrapy spiders or for one-off crawls on designated forums. Please keep in mind that aggressive crawls
can contribute significant strain on web servers, so please throttle your request rates. can produce significant strain on web servers, so please throttle your request rates.
## Requirements: ## Requirements:

View File

@@ -26,7 +26,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
DOWNLOAD_DELAY = 1.0 DOWNLOAD_DELAY = 3.0
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16

View File

@@ -1,35 +1,50 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import re import re
import json
import scrapy import scrapy
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from scrapy.http import Request from scrapy.http import Request
# TODO: Please provide values for the following variables
# Domains only, no urls
ALLOWED_DOMAINS = ['']
# Starting urls
START_URLS = ['']
# Is login required? True or False.
FORM_LOGIN = False
# Login username
USERNAME = ''
# Login password
PASSWORD = ''
# Login url
LOGIN_URL = ''
class PhpbbSpider(scrapy.Spider): class PhpbbSpider(scrapy.Spider):
name = 'phpBB' name = 'phpBB'
# Domain only, no urls allowed_domains = ALLOWED_DOMAINS
allowed_domains = [''] start_urls = START_URLS
start_urls = [''] form_login = FORM_LOGIN
username = '' if form_login is True:
password = '' username = USERNAME
# False if you don't need to login, True if you do. password = PASSWORD
form_login = False login_url = LOGIN_URL
start_urls.insert(0, login_url)
def parse(self, response): def parse(self, response):
# LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN # LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
if self.form_login: if self.form_login:
formxpath = '//*[contains(@action, "login")]'
formdata = {'username': self.username, 'password': self.password} formdata = {'username': self.username, 'password': self.password}
form_request = [ form_request = scrapy.FormRequest.from_response(
scrapy.FormRequest.from_response(
response, response,
formdata=formdata, formdata=formdata,
formxpath=formxpath,
callback=self.after_login, callback=self.after_login,
dont_click=True dont_click=False
) )
]
yield form_request yield form_request
return
else: else:
# REQUEST SUB-FORUM TITLE LINKS # REQUEST SUB-FORUM TITLE LINKS
links = response.xpath('//a[@class="forumtitle"]/@href').extract() links = response.xpath('//a[@class="forumtitle"]/@href').extract()
@@ -79,7 +94,7 @@ class PhpbbSpider(scrapy.Spider):
# COLLECT FORUM POST DATA # COLLECT FORUM POST DATA
usernames = response.xpath('//p[@class="author"]//a[@class="username"]//text()').extract() usernames = response.xpath('//p[@class="author"]//a[@class="username"]//text()').extract()
post_counts = response.xpath('//dd[@class="profile-posts"]//a/text()').extract() post_counts = response.xpath('//dd[@class="profile-posts"]//a/text()').extract()
post_times = response.xpath('//p[@class="author"]/text()').extract() post_times = response.xpath('//div[@class="postbody"]//time/@datetime').extract()
post_texts = response.xpath('//div[@class="postbody"]//div[@class="content"]').extract() post_texts = response.xpath('//div[@class="postbody"]//div[@class="content"]').extract()
post_quotes = [self.clean_quote(s) for s in post_texts] post_quotes = [self.clean_quote(s) for s in post_texts]
post_texts = [self.clean_text(s) for s in post_texts] post_texts = [self.clean_text(s) for s in post_texts]