Mirror of https://github.com/NohamR/phpBB-forum-scraper.git
Add macserialjunkie spider and SQLite pipeline
Fork and refactor the project to scrape macserialjunkie.com:

- Add a new phpBB spider: credentials loaded via python-dotenv, form login enabled, multiple start_urls, more robust ID/time/text extraction, and pagination.
- Add an SQLitePipeline that saves posts to posts.db and reports progress with a tqdm progress bar.
- Update settings: use the SQLite pipeline, increase concurrency, reduce the download delay, disable robots.txt, set JOBDIR for resume, and silence logs.
- Add .env.example and .python-version, update the README and requirements (add tqdm), tidy .gitignore, and add pyproject.toml.
- Reorganize the package layout (rename/move phpBB_scraper modules), remove the legacy pipeline and old spider implementations, and add a dependency lock file (uv.lock).
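For quick verification after a crawl (scrapy crawl phpBB from the project root), here is a minimal sketch of inspecting the resulting database; it assumes the default posts.db path and the posts schema created by the SQLitePipeline in this commit.

import sqlite3

# Open the database written by SQLitePipeline (posts.db in the project root).
conn = sqlite3.connect("posts.db")
cur = conn.cursor()

# Total number of scraped posts.
cur.execute("SELECT COUNT(*) FROM posts")
print("posts:", cur.fetchone()[0])

# Ten largest topics by post count.
cur.execute(
    "SELECT topic_id, COUNT(*) AS n FROM posts "
    "GROUP BY topic_id ORDER BY n DESC LIMIT 10"
)
for topic_id, n in cur.fetchall():
    print(topic_id, n)

conn.close()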
@@ -53,4 +53,4 @@ class PhpbbScraperSpiderMiddleware(object):
             yield r
 
     def spider_opened(self, spider):
-        spider.logger.info('Spider opened: %s' % spider.name)
+        spider.logger.info("Spider opened: %s" % spider.name)
@@ -1,11 +0,0 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class PhpbbScraperPipeline(object):
    def process_item(self, item, spider):
        return item
@@ -1,116 +0,0 @@
# -*- coding: utf-8 -*-
import re
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request

# TODO: Please provide values for the following variables
# Domains only, no urls
ALLOWED_DOMAINS = ['']
# Starting urls
START_URLS = ['']
# Is login required? True or False.
FORM_LOGIN = False
# Login username
USERNAME = ''
# Login password
PASSWORD = ''
# Login url
LOGIN_URL = ''


class PhpbbSpider(scrapy.Spider):

    name = 'phpBB'
    allowed_domains = ALLOWED_DOMAINS
    start_urls = START_URLS
    form_login = FORM_LOGIN
    if form_login is True:
        username = USERNAME
        password = PASSWORD
        login_url = LOGIN_URL
        start_urls.insert(0, login_url)

    username_xpath = '//p[contains(@class, "author")]//a[contains(@class, "username")]//text()'
    post_count_xpath = '//dd[@class="profile-posts" or not(@class)]//a/text()'
    post_time_xpath = '//div[@class="postbody"]//time/@datetime|//div[@class="postbody"]//p[@class="author"]/text()[2]'
    post_text_xpath = '//div[@class="postbody"]//div[@class="content"]'

    def parse(self, response):
        # LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
        if self.form_login:
            formxpath = '//*[contains(@action, "login")]'
            formdata = {'username': self.username, 'password': self.password}
            form_request = scrapy.FormRequest.from_response(
                response,
                formdata=formdata,
                formxpath=formxpath,
                callback=self.after_login,
                dont_click=False
            )
            yield form_request
        else:
            # REQUEST SUB-FORUM TITLE LINKS
            links = response.xpath('//a[@class="forumtitle"]/@href').extract()
            for link in links:
                yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)

    def after_login(self, response):
        # CHECK LOGIN SUCCESS BEFORE MAKING REQUESTS
        if b'authentication failed' in response.body:
            self.logger.error('Login failed.')
            return
        else:
            # REQUEST SUB-FORUM TITLE LINKS
            links = response.xpath('//a[@class="forumtitle"]/@href').extract()
            for link in links:
                yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)

    def parse_topics(self, response):
        # REQUEST TOPIC TITLE LINKS
        links = response.xpath('//a[@class="topictitle"]/@href').extract()
        for link in links:
            yield scrapy.Request(response.urljoin(link), callback=self.parse_posts)

        # IF NEXT PAGE EXISTS, FOLLOW
        next_link = response.xpath('//li[@class="next"]//a[@rel="next"]/@href').extract_first()
        if next_link:
            yield scrapy.Request(response.urljoin(next_link), callback=self.parse_topics)

    def clean_quote(self, string):
        # CLEAN HTML TAGS FROM POST TEXT, MARK QUOTES
        soup = BeautifulSoup(string, 'lxml')
        block_quotes = soup.find_all('blockquote')
        for i, quote in enumerate(block_quotes):
            block_quotes[i] = '<quote-%s>=%s' % (str(i + 1), quote.get_text())
        return ''.join(block_quotes).strip()

    def clean_text(self, string):
        # CLEAN HTML TAGS FROM POST TEXT, MARK REPLIES TO QUOTES
        tags = ['blockquote']
        soup = BeautifulSoup(string, 'lxml')
        for tag in tags:
            for i, item in enumerate(soup.find_all(tag)):
                item.replaceWith('<reply-%s>=' % str(i + 1))
        return re.sub(r' +', r' ', soup.get_text()).strip()

    def parse_posts(self, response):
        # COLLECT FORUM POST DATA
        usernames = response.xpath(self.username_xpath).extract()
        n = len(usernames)
        if n > 0:
            post_counts = response.xpath(self.post_count_xpath).extract() or (n * [''])
            post_times = response.xpath(self.post_time_xpath).extract() or (n * [''])
            post_texts = response.xpath(self.post_text_xpath).extract() or (n * [''])
            post_quotes = [self.clean_quote(s) for s in post_texts]
            post_texts = [self.clean_text(s) for s in post_texts]

            # YIELD POST DATA
            for i in range(n):
                yield {'Username': str(usernames[i]).strip(), 'PostCount': str(post_counts[i]).strip(),
                       'PostTime': str(post_times[i]).strip(), 'PostText': post_texts[i], 'QuoteText': post_quotes[i]}

        # CLICK THROUGH NEXT PAGE
        next_link = response.xpath('//li[@class="next"]//a[@rel="next"]/@href').extract_first()
        if next_link:
            yield scrapy.Request(response.urljoin(next_link), callback=self.parse_posts)
phpBB_scraper/pipelines.py (new file, 115 lines)
@@ -0,0 +1,115 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import sqlite3
from datetime import datetime
from tqdm import tqdm


class PhpbbScraperPipeline(object):
    def process_item(self, item, spider):
        return item


class SQLitePipeline(object):
    def __init__(self):
        self.connection = None
        self.cursor = None
        self.pbar = None
        self.item_count = 0
        self.spider = None

    def open_spider(self, spider):
        """Initialize database connection when spider opens"""
        self.spider = spider
        # Create database file in the same directory as posts.csv was
        self.connection = sqlite3.connect("posts.db")
        self.cursor = self.connection.cursor()

        # Create table if it doesn't exist
        self.cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS posts (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                topic_id TEXT,
                post_id TEXT,
                poster_id TEXT,
                username TEXT,
                post_count TEXT,
                post_time TEXT,
                post_text TEXT,
                quote_text TEXT,
                scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
            """
        )

        # Create indexes for better query performance
        self.cursor.execute(
            """
            CREATE INDEX IF NOT EXISTS idx_topic_id ON posts(topic_id)
            """
        )
        self.cursor.execute(
            """
            CREATE INDEX IF NOT EXISTS idx_post_id ON posts(post_id)
            """
        )
        self.cursor.execute(
            """
            CREATE INDEX IF NOT EXISTS idx_poster_id ON posts(poster_id)
            """
        )

        self.connection.commit()

        # Initialize progress bar
        self.pbar = tqdm(desc="Scraping posts", unit=" posts", dynamic_ncols=True)

    def close_spider(self, spider):
        """Close database connection when spider closes"""
        if self.pbar is not None:
            self.pbar.close()
        if self.connection:
            self.connection.close()

    def process_item(self, item, spider):
        """Insert scraped item into database"""
        self.cursor.execute(
            """
            INSERT INTO posts (topic_id, post_id, poster_id, username, post_count,
                               post_time, post_text, quote_text)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                item.get("TopicID"),
                item.get("PostID"),
                item.get("PosterID"),
                item.get("Username"),
                item.get("PostCount"),
                item.get("PostTime"),
                item.get("PostText"),
                item.get("QuoteText"),
            ),
        )

        self.connection.commit()

        # Update progress bar
        self.item_count += 1
        self.pbar.update(1)

        # Get queue stats from spider's crawler
        stats = self.spider.crawler.stats.get_stats()
        pending = stats.get('scheduler/enqueued', 0) - stats.get('scheduler/dequeued', 0)

        self.pbar.set_postfix({
            'total': self.item_count,
            'queue': pending
        })

        return item
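As a quick check outside a full crawl, here is a minimal sketch that exercises SQLitePipeline directly; the spider and stats objects below are test doubles standing in for what Scrapy normally provides, not part of this commit.

from types import SimpleNamespace

from phpBB_scraper.pipelines import SQLitePipeline


class FakeStats:
    # Mimics the two scheduler counters the pipeline reads for its queue estimate.
    def get_stats(self):
        return {"scheduler/enqueued": 10, "scheduler/dequeued": 7}


spider = SimpleNamespace(name="phpBB", crawler=SimpleNamespace(stats=FakeStats()))

pipeline = SQLitePipeline()
pipeline.open_spider(spider)  # creates posts.db and the posts table/indexes in cwd
pipeline.process_item(
    {
        "TopicID": "1", "PostID": "100", "PosterID": "7", "Username": "alice",
        "PostCount": "42", "PostTime": "2024-01-01T00:00:00", "PostText": "hello",
        "QuoteText": "",
    },
    spider,
)
pipeline.close_spider(spider)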
@@ -1,11 +0,0 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = phpBB_scraper.settings

[deploy]
#url = http://localhost:6800/
project = phpBB_scraper
@@ -9,81 +9,87 @@
 # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 
-BOT_NAME = 'phpBB_scraper'
-SPIDER_MODULES = ['phpBB_scraper.spiders']
-NEWSPIDER_MODULE = 'phpBB_scraper.spiders'
+BOT_NAME = "phpBB_scraper"
+SPIDER_MODULES = ["phpBB_scraper.spiders"]
+NEWSPIDER_MODULE = "phpBB_scraper.spiders"
 
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 OPR/45.0.2552.888'
+USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 OPR/45.0.2552.888"
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
+CONCURRENT_REQUESTS = 32
 
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 3.0
+DOWNLOAD_DELAY = 0.1
 # The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
 
 # Disable cookies (enabled by default)
 COOKIES_ENABLED = True
 
 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+# TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
 #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 #   'Accept-Language': 'en',
-#}
+# }
 
 # Enable or disable spider middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
+# SPIDER_MIDDLEWARES = {
 #    'phpBB_scraper.middlewares.PhpbbScraperSpiderMiddleware': 543,
-#}
+# }
 
 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
+# DOWNLOADER_MIDDLEWARES = {
 #    'phpBB_scraper.middlewares.MyCustomDownloaderMiddleware': 543,
-#}
+# }
 
 # Enable or disable extensions
 # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
+# EXTENSIONS = {
 #    'scrapy.extensions.telnet.TelnetConsole': None,
-#}
+# }
 
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'phpBB_scraper.pipelines.PhpbbScraperPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    "phpBB_scraper.pipelines.SQLitePipeline": 300,
+}
+
+# Disable default Scrapy logging for cleaner output with tqdm
+LOG_LEVEL = "ERROR"
+
+# Enable job directory for pause/resume functionality
+JOBDIR = "crawls/resume"
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+# AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+# AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+# AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
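With JOBDIR set, scrapy crawl phpBB writes its scheduler state to crawls/resume, so an interrupted crawl (a single Ctrl-C for graceful shutdown) picks up where it left off on the next run. A minimal sketch of launching the spider from Python with these project settings, roughly equivalent to the CLI invocation:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Loads phpBB_scraper/settings.py (including ITEM_PIPELINES, LOG_LEVEL, JOBDIR)
# and runs the spider registered under the name "phpBB".
process = CrawlerProcess(get_project_settings())
process.crawl("phpBB")
process.start()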
phpBB_scraper/spiders/phpBB.py (new file, 192 lines)
@@ -0,0 +1,192 @@
import re

import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
import os
from dotenv import load_dotenv

load_dotenv()
# TODO: Please provide values for the following variables
# Domains only, no urls
ALLOWED_DOMAINS = ["macserialjunkie.com"]
# Starting urls
START_URLS = [
    "https://macserialjunkie.com/forum/index.php",
    ## Add missing sub-forums
    "https://macserialjunkie.com/forum/viewforum.php?f=53",  # msj.keygens
    "https://macserialjunkie.com/forum/viewforum.php?f=52",  # msj.stw.graphics
    "https://macserialjunkie.com/forum/viewforum.php?f=27",  # msj.stw.videotutorials
    "https://macserialjunkie.com/forum/viewforum.php?f=28",  # msj.stw.webdev
    "https://macserialjunkie.com/forum/viewforum.php?f=25",  # cracking.workshop
    "https://macserialjunkie.com/forum/viewforum.php?f=34",  # msj.games.cracks
    "https://macserialjunkie.com/forum/viewforum.php?f=35",  # msj.games.serials
    "https://macserialjunkie.com/forum/viewforum.php?f=63",  # msj.games.ports
    "https://macserialjunkie.com/forum/viewforum.php?f=56",  # msj.audio.cracks
    "https://macserialjunkie.com/forum/viewforum.php?f=57",  # msj.audio.serials
    "https://macserialjunkie.com/forum/viewforum.php?f=59",  # msj.iOS.games
]
# Is login required? True or False.
FORM_LOGIN = True
# Login username
USERNAME = os.getenv("USERNAME") or "username"
# Login password
PASSWORD = os.getenv("PASSWORD") or "password"
# Login url
LOGIN_URL = "https://macserialjunkie.com/forum/ucp.php"


class PhpbbSpider(scrapy.Spider):
    name = "phpBB"
    allowed_domains = ALLOWED_DOMAINS
    start_urls = START_URLS
    form_login = FORM_LOGIN
    if form_login is True:
        username = USERNAME
        password = PASSWORD
        login_url = LOGIN_URL
        start_urls.insert(0, login_url)

    username_xpath = (
        '//p[contains(@class, "author")]//a[contains(@class, "username")]//text()'
    )
    post_count_xpath = '//dd[@class="profile-posts" or not(@class)]//a/text()'
    post_time_xpath = '//div[@class="postbody"]//time/@datetime|//div[@class="postbody"]//p[@class="author"]/text()[2]'
    post_text_xpath = '//div[@class="postbody"]//div[@class="content"]'

    def parse(self, response):
        if self.form_login:
            formxpath = '//*[contains(@action, "login")]'
            formdata = {"username": self.username, "password": self.password}
            form_request = scrapy.FormRequest.from_response(
                response,
                formdata=formdata,
                formxpath=formxpath,
                callback=self.after_login,
                dont_click=False,
            )
            yield form_request
        else:
            links = response.xpath('//a[@class="forumtitle"]/@href').extract()
            for link in links:
                yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)

    def after_login(self, response):
        if b"authentication failed" in response.body:
            self.logger.error("Login failed.")
            return
        else:
            links = response.xpath('//a[@class="forumtitle"]/@href').extract()
            for link in links:
                yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)

    def parse_topics(self, response):
        links = response.xpath('//a[@class="topictitle"]/@href').extract()
        for link in links:
            yield scrapy.Request(response.urljoin(link), callback=self.parse_posts)

        next_link = response.xpath(
            '//li[contains(@class, "next")]//a[@rel="next"]/@href'
        ).extract_first()
        if next_link:
            # print("next_link: ", next_link)
            yield scrapy.Request(
                response.urljoin(next_link), callback=self.parse_topics
            )

    def clean_quote(self, string):
        soup = BeautifulSoup(string, "lxml")
        block_quotes = soup.find_all("blockquote")
        for i, quote in enumerate(block_quotes):
            block_quotes[i] = "<quote-%s>=%s" % (str(i + 1), quote.get_text())
        return "".join(block_quotes).strip()

    def clean_text(self, string):
        tags = ["blockquote"]
        soup = BeautifulSoup(string, "lxml")
        for tag in tags:
            for i, item in enumerate(soup.find_all(tag)):
                item.replaceWith("<reply-%s>=" % str(i + 1))
        return re.sub(r" +", r" ", soup.get_text()).strip()

    def parse_posts(self, response):
        # Try the hidden input field first
        topic_id = response.xpath(
            '//input[@type="hidden" and @name="t"]/@value'
        ).extract_first()
        # Fallback to URL regex if hidden input isn't found
        if not topic_id:
            topic_id_match = re.search(r"[?&]t=(\d+)", response.url)
            if topic_id_match:
                topic_id = topic_id_match.group(1)

        # This ensures IDs, Dates, and Text stay synchronized
        posts = response.xpath(
            '//div[contains(@class, "post") and contains(@class, "has-profile")]'
        )

        for post in posts:
            # The div usually has id="p123456", we want 123456
            div_id = post.xpath("./@id").extract_first()
            post_id = div_id.replace("p", "") if div_id else None

            # Modern phpBB themes usually have a hidden span with data attributes
            poster_id = post.xpath(
                './/span[contains(@class, "postdetails")]/@data-poster-id'
            ).extract_first()

            # Fallback: Extract from the profile link (e.g., ...&u=5465)
            if not poster_id:
                profile_link = post.xpath(
                    './/dt[contains(@class, "has-profile-rank")]/a[contains(@href, "mode=viewprofile")]/@href'
                ).extract_first()
                if profile_link:
                    u_match = re.search(r"[?&]u=(\d+)", profile_link)
                    if u_match:
                        poster_id = u_match.group(1)

            # Priority 1: The 'datetime' attribute (ISO format)
            post_time = post.xpath(
                './/p[@class="author"]//time/@datetime'
            ).extract_first()

            # Priority 2: The visible text inside the time tag
            if not post_time:
                post_time = post.xpath(
                    './/p[@class="author"]//time/text()'
                ).extract_first()

            username = post.xpath(
                './/dt[contains(@class, "has-profile-rank")]//a[contains(@class, "username")]/text()'
            ).extract_first()
            post_count = post.xpath(
                './/dd[@class="profile-posts"]//a/text()'
            ).extract_first()

            content_html = post.xpath('.//div[@class="content"]').extract_first() or ""
            post_text = self.clean_text(content_html)
            quote_text = self.clean_quote(content_html)

            yield {
                "TopicID": topic_id,
                "PostID": post_id,
                "PosterID": poster_id,
                "Username": username,
                "PostCount": post_count,
                "PostTime": post_time,
                "PostText": post_text,
                "QuoteText": quote_text,
            }

        # Updated to use contains(@class, "next") because class is "arrow next"
        next_link = response.xpath(
            '//li[contains(@class, "next")]/a[@rel="next"]/@href'
        ).extract_first()

        # Fallback: just look for the rel="next" attribute directly
        if not next_link:
            next_link = response.xpath('//a[@rel="next"]/@href').extract_first()

        if next_link:
            # print("next_link: ", next_link)
            yield scrapy.Request(response.urljoin(next_link), callback=self.parse_posts)
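To make the quote/reply markers concrete, here is a minimal sketch of what clean_text() and clean_quote() produce for a small made-up post body (the HTML below is illustrative only, not taken from the site):

from phpBB_scraper.spiders.phpBB import PhpbbSpider

# Hypothetical post content: one quoted block followed by the reply text.
html = (
    '<div class="content">'
    "<blockquote><div>Original poster text</div></blockquote>"
    "Thanks, that worked for me!"
    "</div>"
)

spider = PhpbbSpider()
print(spider.clean_text(html))   # -> <reply-1>=Thanks, that worked for me!
print(spider.clean_quote(html))  # -> <quote-1>=Original poster text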