mirror of
https://github.com/NohamR/phpBB-forum-scraper.git
synced 2026-02-22 02:25:43 +00:00
Debugged FormRequest for Authentication
This commit is contained in:
@@ -26,7 +26,7 @@ ROBOTSTXT_OBEY = True
|
||||
# Configure a delay for requests for the same website (default: 0)
|
||||
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
|
||||
# See also autothrottle settings and docs
|
||||
DOWNLOAD_DELAY = 1.0
|
||||
DOWNLOAD_DELAY = 3.0
|
||||
# The download delay setting will honor only one of:
|
||||
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||
#CONCURRENT_REQUESTS_PER_IP = 16
|
||||
|
||||
@@ -1,35 +1,50 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import json
|
||||
import scrapy
|
||||
from bs4 import BeautifulSoup
|
||||
from scrapy.http import Request
|
||||
|
||||
# TODO: Please provide values for the following variables
|
||||
# Domain names only, not full URLs
|
||||
ALLOWED_DOMAINS = ['']
|
||||
# Starting URLs
|
||||
START_URLS = ['']
|
||||
# Is login required? True or False.
|
||||
FORM_LOGIN = False
|
||||
# Login username
|
||||
USERNAME = ''
|
||||
# Login password
|
||||
PASSWORD = ''
|
||||
# Login URL
|
||||
LOGIN_URL = ''
|
||||
|
||||
|
||||
class PhpbbSpider(scrapy.Spider):
|
||||
|
||||
name = 'phpBB'
|
||||
# Domain only, no urls
|
||||
allowed_domains = ['']
|
||||
start_urls = ['']
|
||||
username = ''
|
||||
password = ''
|
||||
# False if you don't need to login, True if you do.
|
||||
form_login = False
|
||||
|
||||
allowed_domains = ALLOWED_DOMAINS
|
||||
start_urls = START_URLS
|
||||
form_login = FORM_LOGIN
|
||||
if form_login is True:
|
||||
username = USERNAME
|
||||
password = PASSWORD
|
||||
login_url = LOGIN_URL
|
||||
start_urls.insert(0, login_url)
|
||||
|
||||
def parse(self, response):
|
||||
# LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
|
||||
if self.form_login:
|
||||
formxpath = '//*[contains(@action, "login")]'
|
||||
formdata = {'username': self.username, 'password': self.password}
|
||||
form_request = [
|
||||
scrapy.FormRequest.from_response(
|
||||
form_request = scrapy.FormRequest.from_response(
|
||||
response,
|
||||
formdata=formdata,
|
||||
formxpath=formxpath,
|
||||
callback=self.after_login,
|
||||
dont_click=True
|
||||
)
|
||||
]
|
||||
dont_click=False
|
||||
)
|
||||
yield form_request
|
||||
return
|
||||
else:
|
||||
# REQUEST SUB-FORUM TITLE LINKS
|
||||
links = response.xpath('//a[@class="forumtitle"]/@href').extract()
|
||||
@@ -79,7 +94,7 @@ class PhpbbSpider(scrapy.Spider):
|
||||
# COLLECT FORUM POST DATA
|
||||
usernames = response.xpath('//p[@class="author"]//a[@class="username"]//text()').extract()
|
||||
post_counts = response.xpath('//dd[@class="profile-posts"]//a/text()').extract()
|
||||
post_times = response.xpath('//p[@class="author"]/text()').extract()
|
||||
post_times = response.xpath('//div[@class="postbody"]//time/@datetime').extract()
|
||||
post_texts = response.xpath('//div[@class="postbody"]//div[@class="content"]').extract()
|
||||
post_quotes = [self.clean_quote(s) for s in post_texts]
|
||||
post_texts = [self.clean_text(s) for s in post_texts]
|
||||
|
||||
Reference in New Issue
Block a user