phpBB-forum-scraper/phpBB_scraper/pipelines.py
noham 5615658452 Add macserialjunkie spider and SQLite pipeline
Fork and refactor the project to scrape macserialjunkie.com. Add a new phpBB spider (python-dotenv credentials, form login enabled, multiple start_urls, robust ID/time/text extraction and pagination) and an SQLitePipeline that saves posts to posts.db with a tqdm progress bar. Update settings to use the SQLite pipeline, increase concurrency, reduce the download delay, disable robots.txt, set JOBDIR for resumable crawls, and silence logs (sketched below). Add .env.example and .python-version, update the README and requirements (add tqdm), tidy .gitignore, and add pyproject.toml. Also reorganize the package layout (rename/move the phpBB_scraper modules), remove the legacy pipeline and old spider implementations, and add a dependency lock file (uv.lock).
2026-01-31 13:30:41 +01:00
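
For context, a minimal sketch of the settings.py changes this commit describes; the SQLitePipeline path matches the file below, but the numeric values and the JOBDIR path are assumptions, not taken from the actual repo:

ITEM_PIPELINES = {
    "phpBB_scraper.pipelines.SQLitePipeline": 300,
}
CONCURRENT_REQUESTS = 32   # increased concurrency (assumed value)
DOWNLOAD_DELAY = 0.25      # reduced download delay (assumed value)
ROBOTSTXT_OBEY = False     # robots.txt disabled
JOBDIR = "crawls/msj"      # persist crawl state for resume (assumed path)
LOG_LEVEL = "ERROR"        # silence logs so the tqdm bar stays readable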

116 lines · 3.3 KiB · Python

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import sqlite3

from tqdm import tqdm


class PhpbbScraperPipeline(object):
    def process_item(self, item, spider):
        return item


class SQLitePipeline(object):
    def __init__(self):
        self.connection = None
        self.cursor = None
        self.pbar = None
        self.item_count = 0
        self.spider = None

    def open_spider(self, spider):
        """Initialize the database connection when the spider opens."""
        self.spider = spider
        # Create the SQLite database file in the working directory
        # (replaces the project's old posts.csv output)
        self.connection = sqlite3.connect("posts.db")
        self.cursor = self.connection.cursor()
        # Create the table if it doesn't exist
        self.cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS posts (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                topic_id TEXT,
                post_id TEXT,
                poster_id TEXT,
                username TEXT,
                post_count TEXT,
                post_time TEXT,
                post_text TEXT,
                quote_text TEXT,
                scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
            """
        )
        # Create indexes on the ID columns for faster lookups
        self.cursor.execute(
            "CREATE INDEX IF NOT EXISTS idx_topic_id ON posts(topic_id)"
        )
        self.cursor.execute(
            "CREATE INDEX IF NOT EXISTS idx_post_id ON posts(post_id)"
        )
        self.cursor.execute(
            "CREATE INDEX IF NOT EXISTS idx_poster_id ON posts(poster_id)"
        )
        self.connection.commit()
        # Initialize the progress bar
        self.pbar = tqdm(desc="Scraping posts", unit=" posts", dynamic_ncols=True)

    def close_spider(self, spider):
        """Close the database connection when the spider closes."""
        if self.pbar is not None:
            self.pbar.close()
        if self.connection:
            self.connection.close()

    def process_item(self, item, spider):
        """Insert a scraped item into the database."""
        self.cursor.execute(
            """
            INSERT INTO posts (topic_id, post_id, poster_id, username, post_count,
                               post_time, post_text, quote_text)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                item.get("TopicID"),
                item.get("PostID"),
                item.get("PosterID"),
                item.get("Username"),
                item.get("PostCount"),
                item.get("PostTime"),
                item.get("PostText"),
                item.get("QuoteText"),
            ),
        )
        # Commit per item so progress survives a crash or interrupted crawl
        self.connection.commit()
        # Update the progress bar, showing the scheduler's pending-request count
        self.item_count += 1
        self.pbar.update(1)
        stats = spider.crawler.stats
        pending = stats.get_value("scheduler/enqueued", 0) - stats.get_value(
            "scheduler/dequeued", 0
        )
        self.pbar.set_postfix({"total": self.item_count, "queue": pending})
        return item
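
Once a crawl has run, a quick way to sanity-check the database is a short standalone script; this is a hypothetical example, not part of the pipeline, assuming posts.db sits in the directory the crawl was started from:

import sqlite3

connection = sqlite3.connect("posts.db")
cursor = connection.cursor()

# Total number of scraped posts
cursor.execute("SELECT COUNT(*) FROM posts")
print("posts:", cursor.fetchone()[0])

# The ten most recently scraped rows
cursor.execute(
    "SELECT topic_id, post_id, username, post_time FROM posts "
    "ORDER BY scraped_at DESC LIMIT 10"
)
for row in cursor.fetchall():
    print(row)

connection.close()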