Mirror of https://github.com/NohamR/phpBB-forum-scraper.git
Fork and refactor the project for scraping macserialjunkie.com:

- Add a new phpBB spider: python-dotenv for credentials, form login enabled, multiple start_urls, robust ID/time/text extraction, and pagination.
- Add an SQLitePipeline that saves posts to posts.db with a tqdm progress bar.
- Update settings: use the SQLite pipeline, increase concurrency, reduce the download delay, disable robots.txt, set JOBDIR for resume, and silence logs.
- Add .env.example, .python-version, and pyproject.toml; update the README and requirements (add tqdm); tidy .gitignore.
- Reorganize the package layout (rename/move the phpBB_scraper modules), remove the legacy pipeline and old spider implementations, and add a dependency lock file (uv.lock).
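The spider's dotenv-based login might look roughly like the sketch below. The .env key names, spider name, start URL, and form field names are illustrative assumptions, not values taken from the repository; the real key names live in its .env.example.

import os

import scrapy
from dotenv import load_dotenv

load_dotenv()  # read credentials from a local .env file


class PhpbbSpider(scrapy.Spider):  # name and URL are placeholders
    name = "phpbb"
    start_urls = ["https://forum.example.com/ucp.php?mode=login"]

    def parse(self, response):
        # Submit the board's login form; "username"/"password" are the usual
        # phpBB field names, assumed here rather than taken from the repo.
        return scrapy.FormRequest.from_response(
            response,
            formdata={
                "username": os.getenv("FORUM_USERNAME"),  # assumed .env key
                "password": os.getenv("FORUM_PASSWORD"),  # assumed .env key
            },
            callback=self.after_login,
        )

    def after_login(self, response):
        # The real spider continues crawling topics and paginating from here.
        pass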
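The settings changes the commit describes might look roughly like this; the setting names are standard Scrapy options, but the values and the pipeline's module path are assumptions:

# settings.py (sketch)
ITEM_PIPELINES = {
    "phpBB_scraper.pipelines.SQLitePipeline": 300,  # module path assumed
}
CONCURRENT_REQUESTS = 32   # increased from Scrapy's default of 16
DOWNLOAD_DELAY = 0.25      # reduced delay between requests
ROBOTSTXT_OBEY = False     # robots.txt disabled
JOBDIR = "crawls/msj"      # persist scheduler state so interrupted crawls resume
LOG_LEVEL = "ERROR"        # silence routine log output; tqdm owns the console

The pipeline module these settings enable is the file below.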
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import sqlite3

from tqdm import tqdm

class PhpbbScraperPipeline(object):
    def process_item(self, item, spider):
        return item
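# NOTE: PhpbbScraperPipeline above is the unmodified pass-through pipeline from
# the Scrapy project template; SQLitePipeline below is the one the settings enable.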
class SQLitePipeline(object):
    def __init__(self):
        self.connection = None
        self.cursor = None
        self.pbar = None
        self.item_count = 0
        self.spider = None

    def open_spider(self, spider):
        """Initialize the database connection when the spider opens."""
        self.spider = spider
        # Create the database in the working directory; it replaces the
        # posts.csv output the old pipeline wrote.
        self.connection = sqlite3.connect("posts.db")
        self.cursor = self.connection.cursor()

        # Create the table if it doesn't exist
        self.cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS posts (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                topic_id TEXT,
                post_id TEXT,
                poster_id TEXT,
                username TEXT,
                post_count TEXT,
                post_time TEXT,
                post_text TEXT,
                quote_text TEXT,
                scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
            """
        )

        # Create indexes for better query performance
        self.cursor.execute("CREATE INDEX IF NOT EXISTS idx_topic_id ON posts(topic_id)")
        self.cursor.execute("CREATE INDEX IF NOT EXISTS idx_post_id ON posts(post_id)")
        self.cursor.execute("CREATE INDEX IF NOT EXISTS idx_poster_id ON posts(poster_id)")

        self.connection.commit()

        # Initialize the progress bar
        self.pbar = tqdm(desc="Scraping posts", unit=" posts", dynamic_ncols=True)
    def close_spider(self, spider):
        """Close the database connection when the spider closes."""
        if self.pbar is not None:
            self.pbar.close()
        if self.connection:
            self.connection.close()
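    # NOTE: process_item commits after every insert, which keeps each post
    # durable if the crawl is interrupted (pairing well with JOBDIR resume);
    # batching commits would cut disk I/O if throughput ever matters more.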
    def process_item(self, item, spider):
        """Insert a scraped item into the database."""
        self.cursor.execute(
            """
            INSERT INTO posts (topic_id, post_id, poster_id, username, post_count,
                               post_time, post_text, quote_text)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                item.get("TopicID"),
                item.get("PostID"),
                item.get("PosterID"),
                item.get("Username"),
                item.get("PostCount"),
                item.get("PostTime"),
                item.get("PostText"),
                item.get("QuoteText"),
            ),
        )

        self.connection.commit()

        # Update the progress bar
        self.item_count += 1
        self.pbar.update(1)
        # Get queue stats from the spider's crawler
        stats = self.spider.crawler.stats.get_stats()
        pending = stats.get("scheduler/enqueued", 0) - stats.get("scheduler/dequeued", 0)
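        # ("scheduler/enqueued" and "scheduler/dequeued" are core stats kept by
        # Scrapy's scheduler, so their difference approximates the number of
        # requests still waiting in the queue.)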
        self.pbar.set_postfix({
            "total": self.item_count,
            "queue": pending,
        })

        return item
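After a crawl, posts.db can be sanity-checked with nothing but the standard library; a minimal example (the queries are illustrative):

import sqlite3

connection = sqlite3.connect("posts.db")
cursor = connection.cursor()

# How many posts have been captured so far?
(total,) = cursor.execute("SELECT COUNT(*) FROM posts").fetchone()
print(f"{total} posts in posts.db")

# The most recently scraped rows, via the scraped_at default column.
for topic_id, post_id, username in cursor.execute(
    "SELECT topic_id, post_id, username FROM posts"
    " ORDER BY scraped_at DESC LIMIT 5"
):
    print(topic_id, post_id, username)

connection.close()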