Add macserialjunkie spider and SQLite pipeline

Fork and refactor project for scraping macserialjunkie.com: add a new phpBB spider (uses python-dotenv for credentials, form login enabled, multiple start_urls, robust ID/time/text extraction and pagination) and an SQLitePipeline that saves posts to posts.db with a tqdm progress bar. Update settings to use the SQLite pipeline, increase concurrency, reduce download delay, disable robots.txt, set JOBDIR for resume and silence logs; add .env.example and .python-version, update README and requirements (add tqdm), tidy .gitignore, and add pyproject.toml. Also reorganize package layout (rename/move phpBB_scraper modules), remove legacy pipeline and old spider implementations, and add a dependency lock file (uv.lock).
√(noham)²
2026-01-31 13:30:41 +01:00
parent d0178052c9
commit 5615658452
17 changed files with 1152 additions and 163 deletions
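
For context, the commit wires credentials through python-dotenv. A minimal sketch (not shipped in this commit) of loading and sanity-checking the same USERNAME/PASSWORD keys the spider reads via os.getenv(); .env.example presumably documents these keys:

import os
from dotenv import load_dotenv

# Read .env from the working directory into the process environment.
load_dotenv()

# Fail fast if either credential is missing before starting a long crawl.
missing = [key for key in ("USERNAME", "PASSWORD") if not os.getenv(key)]
if missing:
    raise SystemExit("Missing keys in .env: " + ", ".join(missing))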


@@ -53,4 +53,4 @@ class PhpbbScraperSpiderMiddleware(object):
            yield r

    def spider_opened(self, spider):
-        spider.logger.info('Spider opened: %s' % spider.name)
+        spider.logger.info("Spider opened: %s" % spider.name)


@@ -1,11 +0,0 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class PhpbbScraperPipeline(object):
def process_item(self, item, spider):
return item


@@ -1,116 +0,0 @@
# -*- coding: utf-8 -*-
import re
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
# TODO: Please provide values for the following variables
# Domains only, no urls
ALLOWED_DOMAINS = ['']
# Starting urls
START_URLS = ['']
# Is login required? True or False.
FORM_LOGIN = False
# Login username
USERNAME = ''
# Login password
PASSWORD = ''
# Login url
LOGIN_URL = ''
class PhpbbSpider(scrapy.Spider):
name = 'phpBB'
allowed_domains = ALLOWED_DOMAINS
start_urls = START_URLS
form_login = FORM_LOGIN
if form_login is True:
username = USERNAME
password = PASSWORD
login_url = LOGIN_URL
start_urls.insert(0, login_url)
username_xpath = '//p[contains(@class, "author")]//a[contains(@class, "username")]//text()'
post_count_xpath = '//dd[@class="profile-posts" or not(@class)]//a/text()'
post_time_xpath = '//div[@class="postbody"]//time/@datetime|//div[@class="postbody"]//p[@class="author"]/text()[2]'
post_text_xpath = '//div[@class="postbody"]//div[@class="content"]'
def parse(self, response):
# LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
if self.form_login:
formxpath = '//*[contains(@action, "login")]'
formdata = {'username': self.username, 'password': self.password}
form_request = scrapy.FormRequest.from_response(
response,
formdata=formdata,
formxpath=formxpath,
callback=self.after_login,
dont_click=False
)
yield form_request
else:
# REQUEST SUB-FORUM TITLE LINKS
links = response.xpath('//a[@class="forumtitle"]/@href').extract()
for link in links:
yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)
def after_login(self, response):
# CHECK LOGIN SUCCESS BEFORE MAKING REQUESTS
if b'authentication failed' in response.body:
self.logger.error('Login failed.')
return
else:
# REQUEST SUB-FORUM TITLE LINKS
links = response.xpath('//a[@class="forumtitle"]/@href').extract()
for link in links:
yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)
def parse_topics(self, response):
# REQUEST TOPIC TITLE LINKS
links = response.xpath('//a[@class="topictitle"]/@href').extract()
for link in links:
yield scrapy.Request(response.urljoin(link), callback=self.parse_posts)
# IF NEXT PAGE EXISTS, FOLLOW
next_link = response.xpath('//li[@class="next"]//a[@rel="next"]/@href').extract_first()
if next_link:
yield scrapy.Request(response.urljoin(next_link), callback=self.parse_topics)
def clean_quote(self, string):
# CLEAN HTML TAGS FROM POST TEXT, MARK QUOTES
soup = BeautifulSoup(string, 'lxml')
block_quotes = soup.find_all('blockquote')
for i, quote in enumerate(block_quotes):
block_quotes[i] = '<quote-%s>=%s' % (str(i + 1), quote.get_text())
return ''.join(block_quotes).strip()
def clean_text(self, string):
# CLEAN HTML TAGS FROM POST TEXT, MARK REPLIES TO QUOTES
tags = ['blockquote']
soup = BeautifulSoup(string, 'lxml')
for tag in tags:
for i, item in enumerate(soup.find_all(tag)):
item.replaceWith('<reply-%s>=' % str(i + 1))
return re.sub(r' +', r' ', soup.get_text()).strip()
def parse_posts(self, response):
# COLLECT FORUM POST DATA
usernames = response.xpath(self.username_xpath).extract()
n = len(usernames)
if n > 0:
post_counts = response.xpath(self.post_count_xpath).extract() or (n * [''])
post_times = response.xpath(self.post_time_xpath).extract() or (n * [''])
post_texts = response.xpath(self.post_text_xpath).extract() or (n * [''])
post_quotes = [self.clean_quote(s) for s in post_texts]
post_texts = [self.clean_text(s) for s in post_texts]
# YIELD POST DATA
for i in range(n):
yield {'Username': str(usernames[i]).strip(), 'PostCount': str(post_counts[i]).strip(),
'PostTime': str(post_times[i]).strip(), 'PostText': post_texts[i], 'QuoteText': post_quotes[i]}
# CLICK THROUGH NEXT PAGE
next_link = response.xpath('//li[@class="next"]//a[@rel="next"]/@href').extract_first()
if next_link:
yield scrapy.Request(response.urljoin(next_link), callback=self.parse_posts)

phpBB_scraper/pipelines.py (new file)

@@ -0,0 +1,115 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import sqlite3
from datetime import datetime
from tqdm import tqdm
class PhpbbScraperPipeline(object):
def process_item(self, item, spider):
return item
class SQLitePipeline(object):
def __init__(self):
self.connection = None
self.cursor = None
self.pbar = None
self.item_count = 0
self.spider = None
def open_spider(self, spider):
"""Initialize database connection when spider opens"""
self.spider = spider
# Create database file in the same directory as posts.csv was
self.connection = sqlite3.connect("posts.db")
self.cursor = self.connection.cursor()
# Create table if it doesn't exist
self.cursor.execute(
"""
CREATE TABLE IF NOT EXISTS posts (
id INTEGER PRIMARY KEY AUTOINCREMENT,
topic_id TEXT,
post_id TEXT,
poster_id TEXT,
username TEXT,
post_count TEXT,
post_time TEXT,
post_text TEXT,
quote_text TEXT,
scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
)
# Create indexes for better query performance
self.cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_topic_id ON posts(topic_id)
"""
)
self.cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_post_id ON posts(post_id)
"""
)
self.cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_poster_id ON posts(poster_id)
"""
)
self.connection.commit()
# Initialize progress bar
self.pbar = tqdm(desc="Scraping posts", unit=" posts", dynamic_ncols=True)
def close_spider(self, spider):
"""Close database connection when spider closes"""
if self.pbar is not None:
self.pbar.close()
if self.connection:
self.connection.close()
def process_item(self, item, spider):
"""Insert scraped item into database"""
self.cursor.execute(
"""
INSERT INTO posts (topic_id, post_id, poster_id, username, post_count,
post_time, post_text, quote_text)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""",
(
item.get("TopicID"),
item.get("PostID"),
item.get("PosterID"),
item.get("Username"),
item.get("PostCount"),
item.get("PostTime"),
item.get("PostText"),
item.get("QuoteText"),
),
)
self.connection.commit()
# Update progress bar
self.item_count += 1
self.pbar.update(1)
# Get queue stats from spider's crawler
stats = self.spider.crawler.stats.get_stats()
pending = stats.get('scheduler/enqueued', 0) - stats.get('scheduler/dequeued', 0)
self.pbar.set_postfix({
'total': self.item_count,
'queue': pending
})
return item
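
The table and indexes created above make the scraped data easy to inspect offline. A minimal sketch (not part of the commit) of querying the resulting posts.db with the standard sqlite3 module, using the column names from the CREATE TABLE statement:

import sqlite3

# Top posters by number of scraped posts, read straight from posts.db.
connection = sqlite3.connect("posts.db")
cursor = connection.cursor()
cursor.execute(
    """
    SELECT username, COUNT(*) AS post_total
    FROM posts
    GROUP BY username
    ORDER BY post_total DESC
    LIMIT 10
    """
)
for username, post_total in cursor.fetchall():
    print(username, post_total)
connection.close()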


@@ -1,11 +0,0 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = phpBB_scraper.settings
[deploy]
#url = http://localhost:6800/
project = phpBB_scraper


@@ -9,81 +9,87 @@
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-BOT_NAME = 'phpBB_scraper'
-SPIDER_MODULES = ['phpBB_scraper.spiders']
-NEWSPIDER_MODULE = 'phpBB_scraper.spiders'
+BOT_NAME = "phpBB_scraper"
+SPIDER_MODULES = ["phpBB_scraper.spiders"]
+NEWSPIDER_MODULE = "phpBB_scraper.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
-USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 OPR/45.0.2552.888'
+USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 OPR/45.0.2552.888"

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
+CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-DOWNLOAD_DELAY = 3.0
+DOWNLOAD_DELAY = 0.1
# The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
-#}
+# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
+# SPIDER_MIDDLEWARES = {
#    'phpBB_scraper.middlewares.PhpbbScraperSpiderMiddleware': 543,
-#}
+# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
+# DOWNLOADER_MIDDLEWARES = {
#    'phpBB_scraper.middlewares.MyCustomDownloaderMiddleware': 543,
-#}
+# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
+# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
-#}
+# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'phpBB_scraper.pipelines.PhpbbScraperPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    "phpBB_scraper.pipelines.SQLitePipeline": 300,
+}
+
+# Disable default Scrapy logging for cleaner output with tqdm
+LOG_LEVEL = "ERROR"
+
+# Enable job directory for pause/resume functionality
+JOBDIR = "crawls/resume"

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+# AUTOTHROTTLE_ENABLED = True
# The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
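
With ITEM_PIPELINES, LOG_LEVEL, and JOBDIR set as above, the crawl can be launched with the usual scrapy crawl phpBB command or programmatically. A minimal sketch of the programmatic form; because JOBDIR points at crawls/resume, interrupting the run and starting it again resumes from the saved scheduler state:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Picks up phpBB_scraper/settings.py, including the SQLite pipeline and JOBDIR.
process = CrawlerProcess(get_project_settings())
process.crawl("phpBB")  # spider name defined in the spider module below
process.start()         # blocks until the crawl finishes or is interrupted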


@@ -0,0 +1,192 @@
import re
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
import os
from dotenv import load_dotenv
load_dotenv()
# TODO: Please provide values for the following variables
# Domains only, no urls
ALLOWED_DOMAINS = ["macserialjunkie.com"]
# Starting urls
START_URLS = [
"https://macserialjunkie.com/forum/index.php",
## Add missing sub-forums
"https://macserialjunkie.com/forum/viewforum.php?f=53", # msj.keygens
"https://macserialjunkie.com/forum/viewforum.php?f=52", # msj.stw.graphics
"https://macserialjunkie.com/forum/viewforum.php?f=27", # msj.stw.videotutorials
"https://macserialjunkie.com/forum/viewforum.php?f=28", # msj.stw.webdev
"https://macserialjunkie.com/forum/viewforum.php?f=25", # cracking.workshop
"https://macserialjunkie.com/forum/viewforum.php?f=34", # msj.games.cracks
"https://macserialjunkie.com/forum/viewforum.php?f=35", # msj.games.serials
"https://macserialjunkie.com/forum/viewforum.php?f=63", # msj.games.ports
"https://macserialjunkie.com/forum/viewforum.php?f=56", # msj.audio.cracks
"https://macserialjunkie.com/forum/viewforum.php?f=57", # msj.audio.serials
"https://macserialjunkie.com/forum/viewforum.php?f=59", # msj.iOS.games
]
# Is login required? True or False.
FORM_LOGIN = True
# Login username
USERNAME = os.getenv("USERNAME") or "username"
# Login password
PASSWORD = os.getenv("PASSWORD") or "password"
# Login url
LOGIN_URL = "https://macserialjunkie.com/forum/ucp.php"
class PhpbbSpider(scrapy.Spider):
name = "phpBB"
allowed_domains = ALLOWED_DOMAINS
start_urls = START_URLS
form_login = FORM_LOGIN
if form_login is True:
username = USERNAME
password = PASSWORD
login_url = LOGIN_URL
start_urls.insert(0, login_url)
username_xpath = (
'//p[contains(@class, "author")]//a[contains(@class, "username")]//text()'
)
post_count_xpath = '//dd[@class="profile-posts" or not(@class)]//a/text()'
post_time_xpath = '//div[@class="postbody"]//time/@datetime|//div[@class="postbody"]//p[@class="author"]/text()[2]'
post_text_xpath = '//div[@class="postbody"]//div[@class="content"]'
def parse(self, response):
if self.form_login:
formxpath = '//*[contains(@action, "login")]'
formdata = {"username": self.username, "password": self.password}
form_request = scrapy.FormRequest.from_response(
response,
formdata=formdata,
formxpath=formxpath,
callback=self.after_login,
dont_click=False,
)
yield form_request
else:
links = response.xpath('//a[@class="forumtitle"]/@href').extract()
for link in links:
yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)
def after_login(self, response):
if b"authentication failed" in response.body:
self.logger.error("Login failed.")
return
else:
links = response.xpath('//a[@class="forumtitle"]/@href').extract()
for link in links:
yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)
def parse_topics(self, response):
links = response.xpath('//a[@class="topictitle"]/@href').extract()
for link in links:
yield scrapy.Request(response.urljoin(link), callback=self.parse_posts)
next_link = response.xpath(
'//li[contains(@class, "next")]//a[@rel="next"]/@href'
).extract_first()
if next_link:
# print("next_link: ", next_link)
yield scrapy.Request(
response.urljoin(next_link), callback=self.parse_topics
)
def clean_quote(self, string):
soup = BeautifulSoup(string, "lxml")
block_quotes = soup.find_all("blockquote")
for i, quote in enumerate(block_quotes):
block_quotes[i] = "<quote-%s>=%s" % (str(i + 1), quote.get_text())
return "".join(block_quotes).strip()
def clean_text(self, string):
tags = ["blockquote"]
soup = BeautifulSoup(string, "lxml")
for tag in tags:
for i, item in enumerate(soup.find_all(tag)):
item.replaceWith("<reply-%s>=" % str(i + 1))
return re.sub(r" +", r" ", soup.get_text()).strip()
def parse_posts(self, response):
# Try the hidden input field first
topic_id = response.xpath(
'//input[@type="hidden" and @name="t"]/@value'
).extract_first()
# Fallback to URL regex if hidden input isn't found
if not topic_id:
topic_id_match = re.search(r"[?&]t=(\d+)", response.url)
if topic_id_match:
topic_id = topic_id_match.group(1)
# This ensures IDs, Dates, and Text stay synchronized
posts = response.xpath(
'//div[contains(@class, "post") and contains(@class, "has-profile")]'
)
for post in posts:
# The div usually has id="p123456", we want 123456
div_id = post.xpath("./@id").extract_first()
post_id = div_id.replace("p", "") if div_id else None
# Modern phpBB themes usually have a hidden span with data attributes
poster_id = post.xpath(
'.//span[contains(@class, "postdetails")]/@data-poster-id'
).extract_first()
# Fallback: Extract from the profile link (e.g., ...&u=5465)
if not poster_id:
profile_link = post.xpath(
'.//dt[contains(@class, "has-profile-rank")]/a[contains(@href, "mode=viewprofile")]/@href'
).extract_first()
if profile_link:
u_match = re.search(r"[?&]u=(\d+)", profile_link)
if u_match:
poster_id = u_match.group(1)
# Priority 1: The 'datetime' attribute (ISO format)
post_time = post.xpath(
'.//p[@class="author"]//time/@datetime'
).extract_first()
# Priority 2: The visible text inside the time tag
if not post_time:
post_time = post.xpath(
'.//p[@class="author"]//time/text()'
).extract_first()
username = post.xpath(
'.//dt[contains(@class, "has-profile-rank")]//a[contains(@class, "username")]/text()'
).extract_first()
post_count = post.xpath(
'.//dd[@class="profile-posts"]//a/text()'
).extract_first()
content_html = post.xpath('.//div[@class="content"]').extract_first() or ""
post_text = self.clean_text(content_html)
quote_text = self.clean_quote(content_html)
yield {
"TopicID": topic_id,
"PostID": post_id,
"PosterID": poster_id,
"Username": username,
"PostCount": post_count,
"PostTime": post_time,
"PostText": post_text,
"QuoteText": quote_text,
}
# Updated to use contains(@class, "next") because class is "arrow next"
next_link = response.xpath(
'//li[contains(@class, "next")]/a[@rel="next"]/@href'
).extract_first()
# Fallback: just look for the rel="next" attribute directly
if not next_link:
next_link = response.xpath('//a[@rel="next"]/@href').extract_first()
if next_link:
# print("next_link: ", next_link)
yield scrapy.Request(response.urljoin(next_link), callback=self.parse_posts)
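
The clean_quote/clean_text pair above marks each blockquote as a <quote-N>= entry and swaps it for a <reply-N>= placeholder in the running text. A small standalone illustration (helper logic copied out of the spider, not imported from it) on a simplified phpBB content div:

import re
from bs4 import BeautifulSoup

html = '<div class="content"><blockquote>original post</blockquote>thanks, works for me</div>'

# Quote extraction: each blockquote becomes "<quote-N>=<quoted text>".
soup = BeautifulSoup(html, "lxml")
quotes = ["<quote-%s>=%s" % (i + 1, q.get_text()) for i, q in enumerate(soup.find_all("blockquote"))]
print("".join(quotes).strip())  # <quote-1>=original post

# Reply text: blockquotes are replaced by "<reply-N>=" markers before get_text().
soup = BeautifulSoup(html, "lxml")
for i, quote in enumerate(soup.find_all("blockquote")):
    quote.replace_with("<reply-%s>=" % (i + 1))
print(re.sub(r" +", " ", soup.get_text()).strip())  # <reply-1>=thanks, works for me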