Add macserialjunkie spider and SQLite pipeline

Fork and refactor the project for scraping macserialjunkie.com: add a new phpBB spider (python-dotenv for credentials, form login enabled, multiple start_urls, robust ID/time/text extraction, and pagination) and an SQLitePipeline that saves posts to posts.db with a tqdm progress bar. Update settings to use the SQLite pipeline, increase concurrency, reduce the download delay, disable robots.txt compliance, set JOBDIR so crawls can resume, and silence logs; add .env.example and .python-version, update the README and requirements (adding tqdm), tidy .gitignore, and add pyproject.toml. Also reorganize the package layout (rename/move the phpBB_scraper modules), remove the legacy pipeline and old spider implementations, and add a dependency lock file (uv.lock).
√(noham)²
2026-01-31 13:30:41 +01:00
parent d0178052c9
commit 5615658452
17 changed files with 1152 additions and 163 deletions
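
The SQLitePipeline mentioned above is not part of this file view. A minimal sketch of what it might look like, assuming a single posts table matching the fields the spider yields and a tqdm bar ticked per item; the class placement, schema, and column names are inferred from the commit message, not confirmed:

import sqlite3
from tqdm import tqdm

class SQLitePipeline:
    def open_spider(self, spider):
        # Open (or create) posts.db and make sure the table exists
        self.conn = sqlite3.connect("posts.db")
        self.conn.execute(
            """CREATE TABLE IF NOT EXISTS posts (
                   TopicID TEXT, PostID TEXT, PosterID TEXT, Username TEXT,
                   PostCount TEXT, PostTime TEXT, PostText TEXT, QuoteText TEXT
               )"""
        )
        # No total is known up front, so tqdm just counts saved items
        self.pbar = tqdm(desc="posts saved", unit="post")

    def process_item(self, item, spider):
        self.conn.execute(
            "INSERT INTO posts VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            (
                item.get("TopicID"), item.get("PostID"), item.get("PosterID"),
                item.get("Username"), item.get("PostCount"), item.get("PostTime"),
                item.get("PostText"), item.get("QuoteText"),
            ),
        )
        self.conn.commit()
        self.pbar.update(1)
        return item

    def close_spider(self, spider):
        self.pbar.close()
        self.conn.close()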


@@ -0,0 +1,192 @@
import os
import re

import scrapy
from bs4 import BeautifulSoup
from dotenv import load_dotenv

load_dotenv()

# Domains only, no urls
ALLOWED_DOMAINS = ["macserialjunkie.com"]

# Starting urls
START_URLS = [
    "https://macserialjunkie.com/forum/index.php",
    # TODO: add any missing sub-forums
    "https://macserialjunkie.com/forum/viewforum.php?f=53",  # msj.keygens
    "https://macserialjunkie.com/forum/viewforum.php?f=52",  # msj.stw.graphics
    "https://macserialjunkie.com/forum/viewforum.php?f=27",  # msj.stw.videotutorials
    "https://macserialjunkie.com/forum/viewforum.php?f=28",  # msj.stw.webdev
    "https://macserialjunkie.com/forum/viewforum.php?f=25",  # cracking.workshop
    "https://macserialjunkie.com/forum/viewforum.php?f=34",  # msj.games.cracks
    "https://macserialjunkie.com/forum/viewforum.php?f=35",  # msj.games.serials
    "https://macserialjunkie.com/forum/viewforum.php?f=63",  # msj.games.ports
    "https://macserialjunkie.com/forum/viewforum.php?f=56",  # msj.audio.cracks
    "https://macserialjunkie.com/forum/viewforum.php?f=57",  # msj.audio.serials
    "https://macserialjunkie.com/forum/viewforum.php?f=59",  # msj.iOS.games
]

# Is login required? True or False.
FORM_LOGIN = True

# Login credentials, read from .env (with placeholder fallbacks)
USERNAME = os.getenv("USERNAME") or "username"
PASSWORD = os.getenv("PASSWORD") or "password"

# Login url
LOGIN_URL = "https://macserialjunkie.com/forum/ucp.php"
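
# The commit also adds an .env.example; given the os.getenv() lookups above,
# it would presumably hold placeholders like (hypothetical values):
#   USERNAME=your_forum_username
#   PASSWORD=your_forum_password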

class PhpbbSpider(scrapy.Spider):
    name = "phpBB"
    allowed_domains = ALLOWED_DOMAINS
    start_urls = START_URLS
    form_login = FORM_LOGIN

    if form_login is True:
        username = USERNAME
        password = PASSWORD
        login_url = LOGIN_URL
        start_urls.insert(0, login_url)

    username_xpath = (
        '//p[contains(@class, "author")]//a[contains(@class, "username")]//text()'
    )
    post_count_xpath = '//dd[@class="profile-posts" or not(@class)]//a/text()'
    post_time_xpath = (
        '//div[@class="postbody"]//time/@datetime'
        '|//div[@class="postbody"]//p[@class="author"]/text()[2]'
    )
    post_text_xpath = '//div[@class="postbody"]//div[@class="content"]'

    def parse(self, response):
        if self.form_login:
            # Submit the phpBB login form using the credentials from .env
            formxpath = '//*[contains(@action, "login")]'
            formdata = {"username": self.username, "password": self.password}
            form_request = scrapy.FormRequest.from_response(
                response,
                formdata=formdata,
                formxpath=formxpath,
                callback=self.after_login,
                dont_click=False,
            )
            yield form_request
        else:
            links = response.xpath('//a[@class="forumtitle"]/@href').extract()
            for link in links:
                yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)

    def after_login(self, response):
        if b"authentication failed" in response.body:
            self.logger.error("Login failed.")
            return
        links = response.xpath('//a[@class="forumtitle"]/@href').extract()
        for link in links:
            yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)

    def parse_topics(self, response):
        links = response.xpath('//a[@class="topictitle"]/@href').extract()
        for link in links:
            yield scrapy.Request(response.urljoin(link), callback=self.parse_posts)

        # Follow forum pagination
        next_link = response.xpath(
            '//li[contains(@class, "next")]//a[@rel="next"]/@href'
        ).extract_first()
        if next_link:
            yield scrapy.Request(response.urljoin(next_link), callback=self.parse_topics)

    def clean_quote(self, string):
        # Collect quoted blocks as "<quote-N>=text" markers
        soup = BeautifulSoup(string, "lxml")
        block_quotes = soup.find_all("blockquote")
        for i, quote in enumerate(block_quotes):
            block_quotes[i] = "<quote-%s>=%s" % (str(i + 1), quote.get_text())
        return "".join(block_quotes).strip()

    def clean_text(self, string):
        # Replace quoted blocks with "<reply-N>=" placeholders, then strip markup
        tags = ["blockquote"]
        soup = BeautifulSoup(string, "lxml")
        for tag in tags:
            for i, item in enumerate(soup.find_all(tag)):
                item.replace_with("<reply-%s>=" % str(i + 1))
        return re.sub(r" +", r" ", soup.get_text()).strip()

    def parse_posts(self, response):
        # Try the hidden input field first
        topic_id = response.xpath(
            '//input[@type="hidden" and @name="t"]/@value'
        ).extract_first()
        # Fall back to a URL regex if the hidden input isn't found
        if not topic_id:
            topic_id_match = re.search(r"[?&]t=(\d+)", response.url)
            if topic_id_match:
                topic_id = topic_id_match.group(1)

        # Select whole post containers so IDs, dates, and text stay synchronized
        posts = response.xpath(
            '//div[contains(@class, "post") and contains(@class, "has-profile")]'
        )
        for post in posts:
            # The div usually has id="p123456"; we want 123456
            div_id = post.xpath("./@id").extract_first()
            post_id = div_id.replace("p", "") if div_id else None

            # Modern phpBB themes usually have a hidden span with data attributes
            poster_id = post.xpath(
                './/span[contains(@class, "postdetails")]/@data-poster-id'
            ).extract_first()
            # Fallback: extract from the profile link (e.g. ...&u=5465)
            if not poster_id:
                profile_link = post.xpath(
                    './/dt[contains(@class, "has-profile-rank")]/a[contains(@href, "mode=viewprofile")]/@href'
                ).extract_first()
                if profile_link:
                    u_match = re.search(r"[?&]u=(\d+)", profile_link)
                    if u_match:
                        poster_id = u_match.group(1)

            # Priority 1: the 'datetime' attribute (ISO format)
            post_time = post.xpath(
                './/p[@class="author"]//time/@datetime'
            ).extract_first()
            # Priority 2: the visible text inside the <time> tag
            if not post_time:
                post_time = post.xpath(
                    './/p[@class="author"]//time/text()'
                ).extract_first()

            username = post.xpath(
                './/dt[contains(@class, "has-profile-rank")]//a[contains(@class, "username")]/text()'
            ).extract_first()
            post_count = post.xpath(
                './/dd[@class="profile-posts"]//a/text()'
            ).extract_first()
            content_html = post.xpath('.//div[@class="content"]').extract_first() or ""
            post_text = self.clean_text(content_html)
            quote_text = self.clean_quote(content_html)

            yield {
                "TopicID": topic_id,
                "PostID": post_id,
                "PosterID": poster_id,
                "Username": username,
                "PostCount": post_count,
                "PostTime": post_time,
                "PostText": post_text,
                "QuoteText": quote_text,
            }

        # The pagination class is "arrow next", so match with contains(@class, "next")
        next_link = response.xpath(
            '//li[contains(@class, "next")]/a[@rel="next"]/@href'
        ).extract_first()
        # Fallback: look for the rel="next" attribute directly
        if not next_link:
            next_link = response.xpath('//a[@rel="next"]/@href').extract_first()
        if next_link:
            yield scrapy.Request(response.urljoin(next_link), callback=self.parse_posts)
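
The settings changes described in the commit message are also outside this file view. A rough sketch, assuming standard Scrapy setting names; the pipeline path and every value here are guesses rather than the project's actual configuration:

# Hypothetical settings.py excerpt matching the commit message
ITEM_PIPELINES = {"phpBB_scraper.pipelines.SQLitePipeline": 300}  # assumed module path
CONCURRENT_REQUESTS = 32       # increased concurrency
DOWNLOAD_DELAY = 0.25          # reduced download delay
ROBOTSTXT_OBEY = False         # robots.txt disabled
JOBDIR = "crawls/phpBB"        # persist crawl state so runs can resume
LOG_LEVEL = "ERROR"            # silence routine logging
# Run with: scrapy crawl phpBB   (resumes via JOBDIR if interrupted)

With a configuration along these lines, running the spider would write posts straight to posts.db while the pipeline's tqdm bar reports progress.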