Debugged FormRequest for Authentication

2026-02-22 02:25:43 +00:00 · 2020-04-21 18:49:20 -04:00
parent 29dd97420f
commit 29cb84e271
8 changed files with 114 additions and 17 deletions
--- a/phpBB_scraper/phpBB_scraper/settings.py
+++ b/phpBB_scraper/phpBB_scraper/settings.py
@@ -26,7 +26,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 1.0
+DOWNLOAD_DELAY = 3.0
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
--- a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
+++ b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
@@ -1,35 +1,50 @@
 # -*- coding: utf-8 -*-
 import re
+import json
 import scrapy
 from bs4 import BeautifulSoup
 from scrapy.http import Request

+# TODO: Please provide values for the following variables
+# Bare domain names only (e.g. 'example.com'), not full URLs
+ALLOWED_DOMAINS = ['']
+# Starting URLs
+START_URLS = ['']
+# Is login required? True or False.
+FORM_LOGIN = False
+# Login username
+USERNAME = ''
+# Login password
+PASSWORD = ''
+# Login URL; inserted at the front of the start URLs when FORM_LOGIN is True
+LOGIN_URL = ''
+

 class PhpbbSpider(scrapy.Spider):
    
    name = 'phpBB'
-    # Domain only, no urls
-    allowed_domains = ['']
-    start_urls = ['']
-    username = ''
-    password = ''
-    # False if you don't need to login, True if you do.
-    form_login = False
-    
+    allowed_domains = ALLOWED_DOMAINS
+    start_urls = START_URLS
+    form_login = FORM_LOGIN
+    if form_login is True:
+        username = USERNAME
+        password = PASSWORD
+        login_url = LOGIN_URL
+        start_urls.insert(0, login_url)
+
    def parse(self, response):
        # LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
        if self.form_login:
+            formxpath = '//*[contains(@action, "login")]'
            formdata = {'username': self.username, 'password': self.password}
-            form_request = [
-                scrapy.FormRequest.from_response(
+            form_request = scrapy.FormRequest.from_response(
                    response,
                    formdata=formdata,
+                    formxpath=formxpath,
                    callback=self.after_login,
-                    dont_click=True
-                )
-            ]
+                    dont_click=False
+            )
            yield form_request
-            return
        else:
            # REQUEST SUB-FORUM TITLE LINKS
            links = response.xpath('//a[@class="forumtitle"]/@href').extract()
@@ -79,7 +94,7 @@ class PhpbbSpider(scrapy.Spider):
        # COLLECT FORUM POST DATA
        usernames = response.xpath('//p[@class="author"]//a[@class="username"]//text()').extract()
        post_counts = response.xpath('//dd[@class="profile-posts"]//a/text()').extract()
-        post_times = response.xpath('//p[@class="author"]/text()').extract()
+        post_times = response.xpath('//div[@class="postbody"]//time/@datetime').extract()
        post_texts = response.xpath('//div[@class="postbody"]//div[@class="content"]').extract()
        post_quotes = [self.clean_quote(s) for s in post_texts]
        post_texts = [self.clean_text(s) for s in post_texts]