Debugged FormRequest for Authentication

This commit is contained in:
David Ascienzo
2020-04-21 18:49:20 -04:00
parent 29dd97420f
commit 29cb84e271
8 changed files with 114 additions and 17 deletions

View File

@@ -0,0 +1,54 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="HtmlUnknownTag" enabled="true" level="WARNING" enabled_by_default="true">
<option name="myValues">
<value>
<list size="7">
<item index="0" class="java.lang.String" itemvalue="nobr" />
<item index="1" class="java.lang.String" itemvalue="noembed" />
<item index="2" class="java.lang.String" itemvalue="comment" />
<item index="3" class="java.lang.String" itemvalue="noscript" />
<item index="4" class="java.lang.String" itemvalue="embed" />
<item index="5" class="java.lang.String" itemvalue="script" />
<item index="6" class="java.lang.String" itemvalue="body" />
</list>
</value>
</option>
<option name="myCustomValuesEnabled" value="true" />
</inspection_tool>
<inspection_tool class="PyBroadExceptionInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
<inspection_tool class="PyMethodOverridingInspection" enabled="false" level="WARNING" enabled_by_default="false" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="11">
<item index="0" class="java.lang.String" itemvalue="pyyaml" />
<item index="1" class="java.lang.String" itemvalue="scipy" />
<item index="2" class="java.lang.String" itemvalue="tornado" />
<item index="3" class="java.lang.String" itemvalue="pymongo" />
<item index="4" class="java.lang.String" itemvalue="scikit-learn" />
<item index="5" class="java.lang.String" itemvalue="apscheduler" />
<item index="6" class="java.lang.String" itemvalue="numpy" />
<item index="7" class="java.lang.String" itemvalue="redis" />
<item index="8" class="java.lang.String" itemvalue="bcrypt" />
<item index="9" class="java.lang.String" itemvalue="pandas" />
<item index="10" class="java.lang.String" itemvalue="bson" />
</list>
</value>
</option>
</inspection_tool>
<inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="E722" />
</list>
</option>
</inspection_tool>
<inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
<option name="processCode" value="true" />
<option name="processLiterals" value="true" />
<option name="processComments" value="true" />
</inspection_tool>
</profile>
</component>

View File

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

8
.idea/modules.xml generated Normal file
View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/phpBB-forum-scraper.iml" filepath="$PROJECT_DIR$/.idea/phpBB-forum-scraper.iml" />
</modules>
</component>
</project>

8
.idea/phpBB-forum-scraper.iml generated Normal file
View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

6
.idea/vcs.xml generated Normal file
View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

View File

@@ -2,7 +2,7 @@
Python-based web scraper for phpBB forums. Project can be used as a template for building your own
custom Scrapy spiders or for one-off crawls on designated forums. Please keep in mind that aggressive crawls
can contribute significant strain on web servers, so please throttle your request rates.
can produce significant strain on web servers, so please throttle your request rates.
## Requirements:

View File

@@ -26,7 +26,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1.0
DOWNLOAD_DELAY = 3.0
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

View File

@@ -1,35 +1,50 @@
# -*- coding: utf-8 -*-
import re
import json
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
# TODO: Please provide values for the following variables before running the spider.
# Domains only, no URLs (e.g. a bare host name rather than a full http://... address).
# Used as the spider's allowed_domains.
ALLOWED_DOMAINS = ['']
# Starting URLs for the crawl. Used as the spider's start_urls.
# NOTE: the spider class binds this same list object and, when FORM_LOGIN is True,
# inserts LOGIN_URL at index 0, so this list is modified in place.
START_URLS = ['']
# Is login required? True or False.
# When True, the spider requests LOGIN_URL first and submits the login form
# found in that response before continuing.
FORM_LOGIN = False
# Login username (submitted as the 'username' form field; only read when FORM_LOGIN is True)
USERNAME = ''
# Login password (submitted as the 'password' form field; only read when FORM_LOGIN is True)
PASSWORD = ''
# Login URL (only read when FORM_LOGIN is True). The response is searched for an
# element whose "action" attribute contains "login", so this presumably needs to be
# the page that renders the phpBB login form -- confirm against the target board.
LOGIN_URL = ''
class PhpbbSpider(scrapy.Spider):
name = 'phpBB'
# Domain only, no urls
allowed_domains = ['']
start_urls = ['']
username = ''
password = ''
# False if you don't need to login, True if you do.
form_login = False
allowed_domains = ALLOWED_DOMAINS
start_urls = START_URLS
form_login = FORM_LOGIN
if form_login is True:
username = USERNAME
password = PASSWORD
login_url = LOGIN_URL
start_urls.insert(0, login_url)
def parse(self, response):
# LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
if self.form_login:
formxpath = '//*[contains(@action, "login")]'
formdata = {'username': self.username, 'password': self.password}
form_request = [
scrapy.FormRequest.from_response(
form_request = scrapy.FormRequest.from_response(
response,
formdata=formdata,
formxpath=formxpath,
callback=self.after_login,
dont_click=True
)
]
dont_click=False
)
yield form_request
return
else:
# REQUEST SUB-FORUM TITLE LINKS
links = response.xpath('//a[@class="forumtitle"]/@href').extract()
@@ -79,7 +94,7 @@ class PhpbbSpider(scrapy.Spider):
# COLLECT FORUM POST DATA
usernames = response.xpath('//p[@class="author"]//a[@class="username"]//text()').extract()
post_counts = response.xpath('//dd[@class="profile-posts"]//a/text()').extract()
post_times = response.xpath('//p[@class="author"]/text()').extract()
post_times = response.xpath('//div[@class="postbody"]//time/@datetime').extract()
post_texts = response.xpath('//div[@class="postbody"]//div[@class="content"]').extract()
post_quotes = [self.clean_quote(s) for s in post_texts]
post_texts = [self.clean_text(s) for s in post_texts]