Debugged FormRequest for Authentication

2026-02-21 18:15:43 +00:00 · 2020-04-21 18:49:20 -04:00
parent 29dd97420f
commit 29cb84e271
8 changed files with 114 additions and 17 deletions
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,54 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="HtmlUnknownTag" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="myValues">
+        <value>
+          <list size="7">
+            <item index="0" class="java.lang.String" itemvalue="nobr" />
+            <item index="1" class="java.lang.String" itemvalue="noembed" />
+            <item index="2" class="java.lang.String" itemvalue="comment" />
+            <item index="3" class="java.lang.String" itemvalue="noscript" />
+            <item index="4" class="java.lang.String" itemvalue="embed" />
+            <item index="5" class="java.lang.String" itemvalue="script" />
+            <item index="6" class="java.lang.String" itemvalue="body" />
+          </list>
+        </value>
+      </option>
+      <option name="myCustomValuesEnabled" value="true" />
+    </inspection_tool>
+    <inspection_tool class="PyBroadExceptionInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
+    <inspection_tool class="PyMethodOverridingInspection" enabled="false" level="WARNING" enabled_by_default="false" />
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="11">
+            <item index="0" class="java.lang.String" itemvalue="pyyaml" />
+            <item index="1" class="java.lang.String" itemvalue="scipy" />
+            <item index="2" class="java.lang.String" itemvalue="tornado" />
+            <item index="3" class="java.lang.String" itemvalue="pymongo" />
+            <item index="4" class="java.lang.String" itemvalue="scikit-learn" />
+            <item index="5" class="java.lang.String" itemvalue="apscheduler" />
+            <item index="6" class="java.lang.String" itemvalue="numpy" />
+            <item index="7" class="java.lang.String" itemvalue="redis" />
+            <item index="8" class="java.lang.String" itemvalue="bcrypt" />
+            <item index="9" class="java.lang.String" itemvalue="pandas" />
+            <item index="10" class="java.lang.String" itemvalue="bson" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="E722" />
+        </list>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
+      <option name="processCode" value="true" />
+      <option name="processLiterals" value="true" />
+      <option name="processComments" value="true" />
+    </inspection_tool>
+  </profile>
+</component>
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/phpBB-forum-scraper.iml" filepath="$PROJECT_DIR$/.idea/phpBB-forum-scraper.iml" />
+    </modules>
+  </component>
+</project>
--- a/.idea/phpBB-forum-scraper.iml
+++ b/.idea/phpBB-forum-scraper.iml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@

 Python-based web scraper for phpBB forums. Project can be used as a template for building your own
 custom Scrapy spiders or for one-off crawls on designated forums. Please keep in mind that aggressive crawls
-can contribute significant strain on web servers, so please throttle your request rates.
+can place significant strain on web servers, so please throttle your request rates.


 ## Requirements: 
--- a/phpBB_scraper/phpBB_scraper/settings.py
+++ b/phpBB_scraper/phpBB_scraper/settings.py
@@ -26,7 +26,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 1.0
+DOWNLOAD_DELAY = 3.0
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
--- a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
+++ b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
@@ -1,35 +1,50 @@
 # -*- coding: utf-8 -*-
 import re
+import json
 import scrapy
 from bs4 import BeautifulSoup
 from scrapy.http import Request

+# TODO: Please provide values for the following variables
+# Domains only (e.g. 'example.com'), not full URLs
+ALLOWED_DOMAINS = ['']
+# URLs where the crawl starts
+START_URLS = ['']
+# Is login required? True or False.
+FORM_LOGIN = False
+# Login username
+USERNAME = ''
+# Login password
+PASSWORD = ''
+# Login URL (inserted at the front of the start URLs when FORM_LOGIN is True)
+LOGIN_URL = ''
+

 class PhpbbSpider(scrapy.Spider):
    
    name = 'phpBB'
-    # Domain only, no urls
-    allowed_domains = ['']
-    start_urls = ['']
-    username = ''
-    password = ''
-    # False if you don't need to login, True if you do.
-    form_login = False
-    
+    allowed_domains = ALLOWED_DOMAINS
+    start_urls = START_URLS
+    form_login = FORM_LOGIN
+    if form_login is True:
+        username = USERNAME
+        password = PASSWORD
+        login_url = LOGIN_URL
+        start_urls.insert(0, login_url)
+
    def parse(self, response):
        # LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
        if self.form_login:
+            formxpath = '//*[contains(@action, "login")]'
            formdata = {'username': self.username, 'password': self.password}
-            form_request = [
-                scrapy.FormRequest.from_response(
+            form_request = scrapy.FormRequest.from_response(
                    response,
                    formdata=formdata,
+                    formxpath=formxpath,
                    callback=self.after_login,
-                    dont_click=True
-                )
-            ]
+                    dont_click=False
+            )
            yield form_request
-            return
        else:
            # REQUEST SUB-FORUM TITLE LINKS
            links = response.xpath('//a[@class="forumtitle"]/@href').extract()
@@ -79,7 +94,7 @@ class PhpbbSpider(scrapy.Spider):
        # COLLECT FORUM POST DATA
        usernames = response.xpath('//p[@class="author"]//a[@class="username"]//text()').extract()
        post_counts = response.xpath('//dd[@class="profile-posts"]//a/text()').extract()
-        post_times = response.xpath('//p[@class="author"]/text()').extract()
+        post_times = response.xpath('//div[@class="postbody"]//time/@datetime').extract()
        post_texts = response.xpath('//div[@class="postbody"]//div[@class="content"]').extract()
        post_quotes = [self.clean_quote(s) for s in post_texts]
        post_texts = [self.clean_text(s) for s in post_texts]