Mirror of https://github.com/NohamR/phpBB-forum-scraper.git, synced 2026-02-22 02:25:43 +00:00
Add macserialjunkie spider and SQLite pipeline
Fork and refactor the project to scrape macserialjunkie.com:

- Add a new phpBB spider: credentials loaded via python-dotenv, form login enabled, multiple start_urls, robust ID/time/text extraction, and pagination.
- Add an SQLitePipeline that saves posts to posts.db with a tqdm progress bar.
- Update settings: use the SQLite pipeline, increase concurrency, reduce the download delay, disable robots.txt, set JOBDIR for resumable crawls, and silence logs.
- Add .env.example and .python-version; update README and requirements (add tqdm); tidy .gitignore; add pyproject.toml.
- Reorganize the package layout (rename/move phpBB_scraper modules), remove the legacy pipeline and old spider implementations, and add a dependency lock file (uv.lock).
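The SQLitePipeline itself does not appear in this diff. As a rough sketch only, here is what a pipeline matching the description above might look like, given the fields the spider yields below; the table name, schema, and module path are assumptions, not the committed code:

# Hypothetical sketch of the SQLitePipeline described in the commit message.
# Field names match the dict yielded by parse_posts(); the table name and
# column types are assumptions.
import sqlite3

from tqdm import tqdm


class SQLitePipeline:
    def open_spider(self, spider):
        self.conn = sqlite3.connect("posts.db")
        self.conn.execute(
            """CREATE TABLE IF NOT EXISTS posts (
                   TopicID TEXT, PostID TEXT, PosterID TEXT, Username TEXT,
                   PostCount TEXT, PostTime TEXT, PostText TEXT, QuoteText TEXT
               )"""
        )
        # No total is known up front, so tqdm runs as a plain counter.
        self.pbar = tqdm(desc="posts saved", unit="post")

    def process_item(self, item, spider):
        self.conn.execute(
            "INSERT INTO posts VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            (
                item.get("TopicID"), item.get("PostID"), item.get("PosterID"),
                item.get("Username"), item.get("PostCount"), item.get("PostTime"),
                item.get("PostText"), item.get("QuoteText"),
            ),
        )
        # Committing per item keeps the DB consistent if the crawl is
        # interrupted; batching commits would be faster.
        self.conn.commit()
        self.pbar.update(1)
        return item

    def close_spider(self, spider):
        self.pbar.close()
        self.conn.close()

The settings changes the message describes would look roughly like this in settings.py (the pipeline's module path and the exact values are likewise assumptions):

ITEM_PIPELINES = {"phpBB_scraper.pipelines.SQLitePipeline": 300}
CONCURRENT_REQUESTS = 32   # increased concurrency
DOWNLOAD_DELAY = 0.25      # reduced delay
ROBOTSTXT_OBEY = False     # robots.txt disabled
JOBDIR = "crawls/phpBB"    # persist state so the crawl can resume
LOG_LEVEL = "ERROR"        # silence routine logs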
phpBB_scraper/spiders/__init__.py (new file, +4 lines)
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
phpBB_scraper/spiders/phpBB.py (new file, +192 lines)
@@ -0,0 +1,192 @@
import re

import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
import os
from dotenv import load_dotenv

load_dotenv()
# TODO: Please provide values for the following variables
# Domains only, no urls
ALLOWED_DOMAINS = ["macserialjunkie.com"]
# Starting urls
START_URLS = [
    "https://macserialjunkie.com/forum/index.php",
    ## Add missing sub-forums
    "https://macserialjunkie.com/forum/viewforum.php?f=53",  # msj.keygens
    "https://macserialjunkie.com/forum/viewforum.php?f=52",  # msj.stw.graphics
    "https://macserialjunkie.com/forum/viewforum.php?f=27",  # msj.stw.videotutorials
    "https://macserialjunkie.com/forum/viewforum.php?f=28",  # msj.stw.webdev
    "https://macserialjunkie.com/forum/viewforum.php?f=25",  # cracking.workshop
    "https://macserialjunkie.com/forum/viewforum.php?f=34",  # msj.games.cracks
    "https://macserialjunkie.com/forum/viewforum.php?f=35",  # msj.games.serials
    "https://macserialjunkie.com/forum/viewforum.php?f=63",  # msj.games.ports
    "https://macserialjunkie.com/forum/viewforum.php?f=56",  # msj.audio.cracks
    "https://macserialjunkie.com/forum/viewforum.php?f=57",  # msj.audio.serials
    "https://macserialjunkie.com/forum/viewforum.php?f=59",  # msj.iOS.games
]
# Is login required? True or False.
FORM_LOGIN = True
# Login username
USERNAME = os.getenv("USERNAME") or "username"
# Login password
PASSWORD = os.getenv("PASSWORD") or "password"
# Login url
LOGIN_URL = "https://macserialjunkie.com/forum/ucp.php"

class PhpbbSpider(scrapy.Spider):
    name = "phpBB"
    allowed_domains = ALLOWED_DOMAINS
    start_urls = START_URLS
    form_login = FORM_LOGIN
    if form_login is True:
        username = USERNAME
        password = PASSWORD
        login_url = LOGIN_URL
        start_urls.insert(0, login_url)

    username_xpath = (
        '//p[contains(@class, "author")]//a[contains(@class, "username")]//text()'
    )
    post_count_xpath = '//dd[@class="profile-posts" or not(@class)]//a/text()'
    post_time_xpath = '//div[@class="postbody"]//time/@datetime|//div[@class="postbody"]//p[@class="author"]/text()[2]'
    post_text_xpath = '//div[@class="postbody"]//div[@class="content"]'

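    # parse() is Scrapy's default callback for every start URL. With
    # form_login enabled it looks for a login form on the page and submits
    # the configured credentials.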
    def parse(self, response):
        if self.form_login:
            formxpath = '//*[contains(@action, "login")]'
            formdata = {"username": self.username, "password": self.password}
            form_request = scrapy.FormRequest.from_response(
                response,
                formdata=formdata,
                formxpath=formxpath,
                callback=self.after_login,
                dont_click=False,
            )
            yield form_request
        else:
            links = response.xpath('//a[@class="forumtitle"]/@href').extract()
            for link in links:
                yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)

    def after_login(self, response):
        if b"authentication failed" in response.body:
            self.logger.error("Login failed.")
            return
        else:
            links = response.xpath('//a[@class="forumtitle"]/@href').extract()
            for link in links:
                yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)

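    # Collect every topic on a forum page, then follow the "next" link to
    # paginate through the rest of the forum.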
    def parse_topics(self, response):
        links = response.xpath('//a[@class="topictitle"]/@href').extract()
        for link in links:
            yield scrapy.Request(response.urljoin(link), callback=self.parse_posts)

        next_link = response.xpath(
            '//li[contains(@class, "next")]//a[@rel="next"]/@href'
        ).extract_first()
        if next_link:
            # print("next_link: ", next_link)
            yield scrapy.Request(
                response.urljoin(next_link), callback=self.parse_topics
            )

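    # Flatten each <blockquote> in a post into a "<quote-N>=..." marker string
    # so quoted text can be stored separately from the post body.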
    def clean_quote(self, string):
        soup = BeautifulSoup(string, "lxml")
        block_quotes = soup.find_all("blockquote")
        for i, quote in enumerate(block_quotes):
            block_quotes[i] = "<quote-%s>=%s" % (str(i + 1), quote.get_text())
        return "".join(block_quotes).strip()

    def clean_text(self, string):
        tags = ["blockquote"]
        soup = BeautifulSoup(string, "lxml")
        for tag in tags:
            for i, item in enumerate(soup.find_all(tag)):
                item.replace_with("<reply-%s>=" % str(i + 1))
        return re.sub(r" +", r" ", soup.get_text()).strip()

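    # Extract one item per post. The topic ID comes from a hidden form input,
    # with the "t" URL parameter as a fallback.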
    def parse_posts(self, response):
        # Try the hidden input field first
        topic_id = response.xpath(
            '//input[@type="hidden" and @name="t"]/@value'
        ).extract_first()
        # Fallback to URL regex if hidden input isn't found
        if not topic_id:
            topic_id_match = re.search(r"[?&]t=(\d+)", response.url)
            if topic_id_match:
                topic_id = topic_id_match.group(1)

        # Select whole post containers so IDs, dates, and text stay synchronized
        posts = response.xpath(
            '//div[contains(@class, "post") and contains(@class, "has-profile")]'
        )

        for post in posts:
            # The div usually has id="p123456", we want 123456
            div_id = post.xpath("./@id").extract_first()
            post_id = div_id.replace("p", "") if div_id else None

            # Modern phpBB themes usually have a hidden span with data attributes
            poster_id = post.xpath(
                './/span[contains(@class, "postdetails")]/@data-poster-id'
            ).extract_first()

            # Fallback: Extract from the profile link (e.g., ...&u=5465)
            if not poster_id:
                profile_link = post.xpath(
                    './/dt[contains(@class, "has-profile-rank")]/a[contains(@href, "mode=viewprofile")]/@href'
                ).extract_first()
                if profile_link:
                    u_match = re.search(r"[?&]u=(\d+)", profile_link)
                    if u_match:
                        poster_id = u_match.group(1)

            # Priority 1: The 'datetime' attribute (ISO format)
            post_time = post.xpath(
                './/p[@class="author"]//time/@datetime'
            ).extract_first()

            # Priority 2: The visible text inside the time tag
            if not post_time:
                post_time = post.xpath(
                    './/p[@class="author"]//time/text()'
                ).extract_first()

            username = post.xpath(
                './/dt[contains(@class, "has-profile-rank")]//a[contains(@class, "username")]/text()'
            ).extract_first()
            post_count = post.xpath(
                './/dd[@class="profile-posts"]//a/text()'
            ).extract_first()

            content_html = post.xpath('.//div[@class="content"]').extract_first() or ""
            post_text = self.clean_text(content_html)
            quote_text = self.clean_quote(content_html)

            yield {
                "TopicID": topic_id,
                "PostID": post_id,
                "PosterID": poster_id,
                "Username": username,
                "PostCount": post_count,
                "PostTime": post_time,
                "PostText": post_text,
                "QuoteText": quote_text,
            }

        # Updated to use contains(@class, "next") because class is "arrow next"
        next_link = response.xpath(
            '//li[contains(@class, "next")]/a[@rel="next"]/@href'
        ).extract_first()

        # Fallback: just look for the rel="next" attribute directly
        if not next_link:
            next_link = response.xpath('//a[@rel="next"]/@href').extract_first()

        if next_link:
            # print("next_link: ", next_link)
            yield scrapy.Request(response.urljoin(next_link), callback=self.parse_posts)