Add macserialjunkie spider and SQLite pipeline

Fork and refactor project for scraping macserialjunkie.com: add a new phpBB spider (uses python-dotenv for credentials, form login enabled, multiple start_urls, robust ID/time/text extraction and pagination) and an SQLitePipeline that saves posts to posts.db with a tqdm progress bar. Update settings to use the SQLite pipeline, increase concurrency, reduce download delay, disable robots.txt, set JOBDIR for resume and silence logs; add .env.example and .python-version, update README and requirements (add tqdm), tidy .gitignore, and add pyproject.toml. Also reorganize package layout (rename/move phpBB_scraper modules), remove legacy pipeline and old spider implementations, and add a dependency lock file (uv.lock).
√(noham)²
2026-01-31 13:30:41 +01:00
parent d0178052c9
commit 5615658452
17 changed files with 1152 additions and 163 deletions
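
For context, the commit wires credentials through python-dotenv. A minimal sketch (not shipped in this commit) of loading and sanity-checking the same USERNAME/PASSWORD keys the spider reads via os.getenv(); .env.example presumably documents these keys:

import os
from dotenv import load_dotenv

# Read .env from the working directory into the process environment.
load_dotenv()

# Fail fast if either credential is missing before starting a long crawl.
missing = [key for key in ("USERNAME", "PASSWORD") if not os.getenv(key)]
if missing:
    raise SystemExit("Missing keys in .env: " + ", ".join(missing))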


@@ -53,4 +53,4 @@ class PhpbbScraperSpiderMiddleware(object):
            yield r

    def spider_opened(self, spider):
-        spider.logger.info('Spider opened: %s' % spider.name)
+        spider.logger.info("Spider opened: %s" % spider.name)


@@ -1,11 +0,0 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class PhpbbScraperPipeline(object):
def process_item(self, item, spider):
return item


@@ -1,116 +0,0 @@
# -*- coding: utf-8 -*-
import re
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
# TODO: Please provide values for the following variables
# Domains only, no urls
ALLOWED_DOMAINS = ['']
# Starting urls
START_URLS = ['']
# Is login required? True or False.
FORM_LOGIN = False
# Login username
USERNAME = ''
# Login password
PASSWORD = ''
# Login url
LOGIN_URL = ''
class PhpbbSpider(scrapy.Spider):
name = 'phpBB'
allowed_domains = ALLOWED_DOMAINS
start_urls = START_URLS
form_login = FORM_LOGIN
if form_login is True:
username = USERNAME
password = PASSWORD
login_url = LOGIN_URL
start_urls.insert(0, login_url)
username_xpath = '//p[contains(@class, "author")]//a[contains(@class, "username")]//text()'
post_count_xpath = '//dd[@class="profile-posts" or not(@class)]//a/text()'
post_time_xpath = '//div[@class="postbody"]//time/@datetime|//div[@class="postbody"]//p[@class="author"]/text()[2]'
post_text_xpath = '//div[@class="postbody"]//div[@class="content"]'
def parse(self, response):
# LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
if self.form_login:
formxpath = '//*[contains(@action, "login")]'
formdata = {'username': self.username, 'password': self.password}
form_request = scrapy.FormRequest.from_response(
response,
formdata=formdata,
formxpath=formxpath,
callback=self.after_login,
dont_click=False
)
yield form_request
else:
# REQUEST SUB-FORUM TITLE LINKS
links = response.xpath('//a[@class="forumtitle"]/@href').extract()
for link in links:
yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)
def after_login(self, response):
# CHECK LOGIN SUCCESS BEFORE MAKING REQUESTS
if b'authentication failed' in response.body:
self.logger.error('Login failed.')
return
else:
# REQUEST SUB-FORUM TITLE LINKS
links = response.xpath('//a[@class="forumtitle"]/@href').extract()
for link in links:
yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)
def parse_topics(self, response):
# REQUEST TOPIC TITLE LINKS
links = response.xpath('//a[@class="topictitle"]/@href').extract()
for link in links:
yield scrapy.Request(response.urljoin(link), callback=self.parse_posts)
# IF NEXT PAGE EXISTS, FOLLOW
next_link = response.xpath('//li[@class="next"]//a[@rel="next"]/@href').extract_first()
if next_link:
yield scrapy.Request(response.urljoin(next_link), callback=self.parse_topics)
def clean_quote(self, string):
# CLEAN HTML TAGS FROM POST TEXT, MARK QUOTES
soup = BeautifulSoup(string, 'lxml')
block_quotes = soup.find_all('blockquote')
for i, quote in enumerate(block_quotes):
block_quotes[i] = '<quote-%s>=%s' % (str(i + 1), quote.get_text())
return ''.join(block_quotes).strip()
def clean_text(self, string):
# CLEAN HTML TAGS FROM POST TEXT, MARK REPLIES TO QUOTES
tags = ['blockquote']
soup = BeautifulSoup(string, 'lxml')
for tag in tags:
for i, item in enumerate(soup.find_all(tag)):
item.replaceWith('<reply-%s>=' % str(i + 1))
return re.sub(r' +', r' ', soup.get_text()).strip()
def parse_posts(self, response):
# COLLECT FORUM POST DATA
usernames = response.xpath(self.username_xpath).extract()
n = len(usernames)
if n > 0:
post_counts = response.xpath(self.post_count_xpath).extract() or (n * [''])
post_times = response.xpath(self.post_time_xpath).extract() or (n * [''])
post_texts = response.xpath(self.post_text_xpath).extract() or (n * [''])
post_quotes = [self.clean_quote(s) for s in post_texts]
post_texts = [self.clean_text(s) for s in post_texts]
# YIELD POST DATA
for i in range(n):
yield {'Username': str(usernames[i]).strip(), 'PostCount': str(post_counts[i]).strip(),
'PostTime': str(post_times[i]).strip(), 'PostText': post_texts[i], 'QuoteText': post_quotes[i]}
# CLICK THROUGH NEXT PAGE
next_link = response.xpath('//li[@class="next"]//a[@rel="next"]/@href').extract_first()
if next_link:
yield scrapy.Request(response.urljoin(next_link), callback=self.parse_posts)

phpBB_scraper/pipelines.py (new file)

@@ -0,0 +1,115 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import sqlite3
from datetime import datetime
from tqdm import tqdm
class PhpbbScraperPipeline(object):
def process_item(self, item, spider):
return item
class SQLitePipeline(object):
def __init__(self):
self.connection = None
self.cursor = None
self.pbar = None
self.item_count = 0
self.spider = None
def open_spider(self, spider):
"""Initialize database connection when spider opens"""
self.spider = spider
# Create database file in the same directory as posts.csv was
self.connection = sqlite3.connect("posts.db")
self.cursor = self.connection.cursor()
# Create table if it doesn't exist
self.cursor.execute(
"""
CREATE TABLE IF NOT EXISTS posts (
id INTEGER PRIMARY KEY AUTOINCREMENT,
topic_id TEXT,
post_id TEXT,
poster_id TEXT,
username TEXT,
post_count TEXT,
post_time TEXT,
post_text TEXT,
quote_text TEXT,
scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
)
# Create indexes for better query performance
self.cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_topic_id ON posts(topic_id)
"""
)
self.cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_post_id ON posts(post_id)
"""
)
self.cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_poster_id ON posts(poster_id)
"""
)
self.connection.commit()
# Initialize progress bar
self.pbar = tqdm(desc="Scraping posts", unit=" posts", dynamic_ncols=True)
def close_spider(self, spider):
"""Close database connection when spider closes"""
if self.pbar is not None:
self.pbar.close()
if self.connection:
self.connection.close()
def process_item(self, item, spider):
"""Insert scraped item into database"""
self.cursor.execute(
"""
INSERT INTO posts (topic_id, post_id, poster_id, username, post_count,
post_time, post_text, quote_text)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""",
(
item.get("TopicID"),
item.get("PostID"),
item.get("PosterID"),
item.get("Username"),
item.get("PostCount"),
item.get("PostTime"),
item.get("PostText"),
item.get("QuoteText"),
),
)
self.connection.commit()
# Update progress bar
self.item_count += 1
self.pbar.update(1)
# Get queue stats from spider's crawler
stats = self.spider.crawler.stats.get_stats()
pending = stats.get('scheduler/enqueued', 0) - stats.get('scheduler/dequeued', 0)
self.pbar.set_postfix({
'total': self.item_count,
'queue': pending
})
return item
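
The table and indexes created above make the scraped data easy to inspect offline. A minimal sketch (not part of the commit) of querying the resulting posts.db with the standard sqlite3 module, using the column names from the CREATE TABLE statement:

import sqlite3

# Top posters by number of scraped posts, read straight from posts.db.
connection = sqlite3.connect("posts.db")
cursor = connection.cursor()
cursor.execute(
    """
    SELECT username, COUNT(*) AS post_total
    FROM posts
    GROUP BY username
    ORDER BY post_total DESC
    LIMIT 10
    """
)
for username, post_total in cursor.fetchall():
    print(username, post_total)
connection.close()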


@@ -1,11 +0,0 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = phpBB_scraper.settings
[deploy]
#url = http://localhost:6800/
project = phpBB_scraper


@@ -9,81 +9,87 @@
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-BOT_NAME = 'phpBB_scraper'
-SPIDER_MODULES = ['phpBB_scraper.spiders']
-NEWSPIDER_MODULE = 'phpBB_scraper.spiders'
+BOT_NAME = "phpBB_scraper"
+SPIDER_MODULES = ["phpBB_scraper.spiders"]
+NEWSPIDER_MODULE = "phpBB_scraper.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
-USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 OPR/45.0.2552.888'
+USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 OPR/45.0.2552.888"

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
+CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-DOWNLOAD_DELAY = 3.0
+DOWNLOAD_DELAY = 0.1
# The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
-#}
+# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
+# SPIDER_MIDDLEWARES = {
#    'phpBB_scraper.middlewares.PhpbbScraperSpiderMiddleware': 543,
-#}
+# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
+# DOWNLOADER_MIDDLEWARES = {
#    'phpBB_scraper.middlewares.MyCustomDownloaderMiddleware': 543,
-#}
+# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
+# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
-#}
+# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'phpBB_scraper.pipelines.PhpbbScraperPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    "phpBB_scraper.pipelines.SQLitePipeline": 300,
+}
+
+# Disable default Scrapy logging for cleaner output with tqdm
+LOG_LEVEL = "ERROR"
+
+# Enable job directory for pause/resume functionality
+JOBDIR = "crawls/resume"

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+# AUTOTHROTTLE_ENABLED = True
# The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
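
With ITEM_PIPELINES, LOG_LEVEL, and JOBDIR set as above, the crawl can be launched with the usual scrapy crawl phpBB command or programmatically. A minimal sketch of the programmatic form; because JOBDIR points at crawls/resume, interrupting the run and starting it again resumes from the saved scheduler state:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Picks up phpBB_scraper/settings.py, including the SQLite pipeline and JOBDIR.
process = CrawlerProcess(get_project_settings())
process.crawl("phpBB")  # spider name defined in the spider module below
process.start()         # blocks until the crawl finishes or is interrupted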


@@ -0,0 +1,192 @@
import re
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
import os
from dotenv import load_dotenv
load_dotenv()
# TODO: Please provide values for the following variables
# Domains only, no urls
ALLOWED_DOMAINS = ["macserialjunkie.com"]
# Starting urls
START_URLS = [
"https://macserialjunkie.com/forum/index.php",
## Add missing sub-forums
"https://macserialjunkie.com/forum/viewforum.php?f=53", # msj.keygens
"https://macserialjunkie.com/forum/viewforum.php?f=52", # msj.stw.graphics
"https://macserialjunkie.com/forum/viewforum.php?f=27", # msj.stw.videotutorials
"https://macserialjunkie.com/forum/viewforum.php?f=28", # msj.stw.webdev
"https://macserialjunkie.com/forum/viewforum.php?f=25", # cracking.workshop
"https://macserialjunkie.com/forum/viewforum.php?f=34", # msj.games.cracks
"https://macserialjunkie.com/forum/viewforum.php?f=35", # msj.games.serials
"https://macserialjunkie.com/forum/viewforum.php?f=63", # msj.games.ports
"https://macserialjunkie.com/forum/viewforum.php?f=56", # msj.audio.cracks
"https://macserialjunkie.com/forum/viewforum.php?f=57", # msj.audio.serials
"https://macserialjunkie.com/forum/viewforum.php?f=59", # msj.iOS.games
]
# Is login required? True or False.
FORM_LOGIN = True
# Login username
USERNAME = os.getenv("USERNAME") or "username"
# Login password
PASSWORD = os.getenv("PASSWORD") or "password"
# Login url
LOGIN_URL = "https://macserialjunkie.com/forum/ucp.php"
class PhpbbSpider(scrapy.Spider):
name = "phpBB"
allowed_domains = ALLOWED_DOMAINS
start_urls = START_URLS
form_login = FORM_LOGIN
if form_login is True:
username = USERNAME
password = PASSWORD
login_url = LOGIN_URL
start_urls.insert(0, login_url)
username_xpath = (
'//p[contains(@class, "author")]//a[contains(@class, "username")]//text()'
)
post_count_xpath = '//dd[@class="profile-posts" or not(@class)]//a/text()'
post_time_xpath = '//div[@class="postbody"]//time/@datetime|//div[@class="postbody"]//p[@class="author"]/text()[2]'
post_text_xpath = '//div[@class="postbody"]//div[@class="content"]'
def parse(self, response):
if self.form_login:
formxpath = '//*[contains(@action, "login")]'
formdata = {"username": self.username, "password": self.password}
form_request = scrapy.FormRequest.from_response(
response,
formdata=formdata,
formxpath=formxpath,
callback=self.after_login,
dont_click=False,
)
yield form_request
else:
links = response.xpath('//a[@class="forumtitle"]/@href').extract()
for link in links:
yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)
def after_login(self, response):
if b"authentication failed" in response.body:
self.logger.error("Login failed.")
return
else:
links = response.xpath('//a[@class="forumtitle"]/@href').extract()
for link in links:
yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)
def parse_topics(self, response):
links = response.xpath('//a[@class="topictitle"]/@href').extract()
for link in links:
yield scrapy.Request(response.urljoin(link), callback=self.parse_posts)
next_link = response.xpath(
'//li[contains(@class, "next")]//a[@rel="next"]/@href'
).extract_first()
if next_link:
# print("next_link: ", next_link)
yield scrapy.Request(
response.urljoin(next_link), callback=self.parse_topics
)
def clean_quote(self, string):
soup = BeautifulSoup(string, "lxml")
block_quotes = soup.find_all("blockquote")
for i, quote in enumerate(block_quotes):
block_quotes[i] = "<quote-%s>=%s" % (str(i + 1), quote.get_text())
return "".join(block_quotes).strip()
def clean_text(self, string):
tags = ["blockquote"]
soup = BeautifulSoup(string, "lxml")
for tag in tags:
for i, item in enumerate(soup.find_all(tag)):
item.replaceWith("<reply-%s>=" % str(i + 1))
return re.sub(r" +", r" ", soup.get_text()).strip()
def parse_posts(self, response):
# Try the hidden input field first
topic_id = response.xpath(
'//input[@type="hidden" and @name="t"]/@value'
).extract_first()
# Fallback to URL regex if hidden input isn't found
if not topic_id:
topic_id_match = re.search(r"[?&]t=(\d+)", response.url)
if topic_id_match:
topic_id = topic_id_match.group(1)
# This ensures IDs, Dates, and Text stay synchronized
posts = response.xpath(
'//div[contains(@class, "post") and contains(@class, "has-profile")]'
)
for post in posts:
# The div usually has id="p123456", we want 123456
div_id = post.xpath("./@id").extract_first()
post_id = div_id.replace("p", "") if div_id else None
# Modern phpBB themes usually have a hidden span with data attributes
poster_id = post.xpath(
'.//span[contains(@class, "postdetails")]/@data-poster-id'
).extract_first()
# Fallback: Extract from the profile link (e.g., ...&u=5465)
if not poster_id:
profile_link = post.xpath(
'.//dt[contains(@class, "has-profile-rank")]/a[contains(@href, "mode=viewprofile")]/@href'
).extract_first()
if profile_link:
u_match = re.search(r"[?&]u=(\d+)", profile_link)
if u_match:
poster_id = u_match.group(1)
# Priority 1: The 'datetime' attribute (ISO format)
post_time = post.xpath(
'.//p[@class="author"]//time/@datetime'
).extract_first()
# Priority 2: The visible text inside the time tag
if not post_time:
post_time = post.xpath(
'.//p[@class="author"]//time/text()'
).extract_first()
username = post.xpath(
'.//dt[contains(@class, "has-profile-rank")]//a[contains(@class, "username")]/text()'
).extract_first()
post_count = post.xpath(
'.//dd[@class="profile-posts"]//a/text()'
).extract_first()
content_html = post.xpath('.//div[@class="content"]').extract_first() or ""
post_text = self.clean_text(content_html)
quote_text = self.clean_quote(content_html)
yield {
"TopicID": topic_id,
"PostID": post_id,
"PosterID": poster_id,
"Username": username,
"PostCount": post_count,
"PostTime": post_time,
"PostText": post_text,
"QuoteText": quote_text,
}
# Updated to use contains(@class, "next") because class is "arrow next"
next_link = response.xpath(
'//li[contains(@class, "next")]/a[@rel="next"]/@href'
).extract_first()
# Fallback: just look for the rel="next" attribute directly
if not next_link:
next_link = response.xpath('//a[@rel="next"]/@href').extract_first()
if next_link:
# print("next_link: ", next_link)
yield scrapy.Request(response.urljoin(next_link), callback=self.parse_posts)
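
The clean_quote/clean_text pair above marks each blockquote as a <quote-N>= entry and swaps it for a <reply-N>= placeholder in the running text. A small standalone illustration (helper logic copied out of the spider, not imported from it) on a simplified phpBB content div:

import re
from bs4 import BeautifulSoup

html = '<div class="content"><blockquote>original post</blockquote>thanks, works for me</div>'

# Quote extraction: each blockquote becomes "<quote-N>=<quoted text>".
soup = BeautifulSoup(html, "lxml")
quotes = ["<quote-%s>=%s" % (i + 1, q.get_text()) for i, q in enumerate(soup.find_all("blockquote"))]
print("".join(quotes).strip())  # <quote-1>=original post

# Reply text: blockquotes are replaced by "<reply-N>=" markers before get_text().
soup = BeautifulSoup(html, "lxml")
for i, quote in enumerate(soup.find_all("blockquote")):
    quote.replace_with("<reply-%s>=" % (i + 1))
print(re.sub(r" +", " ", soup.get_text()).strip())  # <reply-1>=thanks, works for me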