Debugged FormRequest for Authentication

This commit is contained in:
David Ascienzo
2020-04-21 18:49:20 -04:00
parent 29dd97420f
commit 29cb84e271
8 changed files with 114 additions and 17 deletions

View File

@@ -26,7 +26,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1.0
DOWNLOAD_DELAY = 3.0
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

View File

@@ -1,35 +1,50 @@
# -*- coding: utf-8 -*-
import re
import json
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
# TODO: Fill in every setting below before running the spider.

# Domains the spider is allowed to visit (bare domains only, no URLs).
ALLOWED_DOMAINS = [""]
# URLs the crawl starts from.
START_URLS = [""]
# Set to True if the board requires a login, False otherwise.
FORM_LOGIN = False
# Credentials sent as the 'username' / 'password' login form fields.
USERNAME = ""
PASSWORD = ""
# URL of the login page; only used when FORM_LOGIN is True.
LOGIN_URL = ""
class PhpbbSpider(scrapy.Spider):
name = 'phpBB'
# Domain only, no urls
allowed_domains = ['']
start_urls = ['']
username = ''
password = ''
# False if you don't need to login, True if you do.
form_login = False
allowed_domains = ALLOWED_DOMAINS
start_urls = START_URLS
form_login = FORM_LOGIN
if form_login is True:
username = USERNAME
password = PASSWORD
login_url = LOGIN_URL
start_urls.insert(0, login_url)
def parse(self, response):
# LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
if self.form_login:
formxpath = '//*[contains(@action, "login")]'
formdata = {'username': self.username, 'password': self.password}
form_request = [
scrapy.FormRequest.from_response(
form_request = scrapy.FormRequest.from_response(
response,
formdata=formdata,
formxpath=formxpath,
callback=self.after_login,
dont_click=True
)
]
dont_click=False
)
yield form_request
return
else:
# REQUEST SUB-FORUM TITLE LINKS
links = response.xpath('//a[@class="forumtitle"]/@href').extract()
@@ -79,7 +94,7 @@ class PhpbbSpider(scrapy.Spider):
# COLLECT FORUM POST DATA
usernames = response.xpath('//p[@class="author"]//a[@class="username"]//text()').extract()
post_counts = response.xpath('//dd[@class="profile-posts"]//a/text()').extract()
post_times = response.xpath('//p[@class="author"]/text()').extract()
post_times = response.xpath('//div[@class="postbody"]//time/@datetime').extract()
post_texts = response.xpath('//div[@class="postbody"]//div[@class="content"]').extract()
post_quotes = [self.clean_quote(s) for s in post_texts]
post_texts = [self.clean_text(s) for s in post_texts]