add test suites, scrape config

2026-05-24 20:00:45 +00:00 · 2021-12-22 22:28:19 +01:00
parent 0009435769
commit ff3d09c727
9 changed files with 421 additions and 45 deletions
--- a/book/chapter.go
+++ b/book/chapter.go
@@ -1,13 +1,20 @@
 package book

+// chapter is one scraped page plus the chapters scraped beneath it.
 type chapter struct {
-	name    string
-	author  string
-	content string
+	// body is the page body; NewChapterFromURL stores the downloaded HTML here.
+	body        string
+	// name and author are filled from the article title and byline by
+	// NewChapterFromURL.
+	name        string
+	author      string
+	// content is the extracted article HTML; NewChapterFromURL leaves it empty
+	// when the level is not included.
+	content     string
+	// subChapters holds the chapters built from this page's links.
+	subChapters []chapter
+	// config is the ScrapeConfig the chapter was built with.
+	config      *ScrapeConfig
 }

-func NewChapter(name, author, content string) chapter {
-	return chapter{name, author, content}
+// NewChapter assembles a chapter from its already-known parts. No scraping
+// is performed; every argument is stored as given.
+func NewChapter(body, name, author, content string, subChapters []chapter, config *ScrapeConfig) chapter {
+	return chapter{
+		body:        body,
+		name:        name,
+		author:      author,
+		content:     content,
+		subChapters: subChapters,
+		config:      config,
+	}
+}
+
+// Body returns the page body stored in the chapter.
+func (c chapter) Body() string {
+	return c.body
 }

 func (c chapter) Name() string {
@@ -21,3 +28,7 @@ func (c chapter) Author() string {
+// Content returns the content stored in the chapter.
 func (c chapter) Content() string {
 	return c.content
 }
+
+// SubChapters returns the chapters nested under c. The returned slice is the
+// chapter's own slice, not a copy.
+func (c chapter) SubChapters() []chapter {
+	return c.subChapters
+}
--- a/book/format.go
+++ b/book/format.go
@@ -0,0 +1,90 @@
+package book
+
+import (
+	"fmt"
+	"log"
+	"strings"
+	"unicode/utf8"
+
+	md "github.com/JohannesKaufmann/html-to-markdown"
+	"github.com/PuerkitoBio/goquery"
+	epub "github.com/bmaupin/go-epub"
+)
+
+func ToMarkdown(c chapter) string {
+
+	// make title
+	underline := strings.Repeat("=", len(c.Name()))
+	title := fmt.Sprintf("%s\n%s", c.Name(), underline)
+
+	// convert content to markdown
+	content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// merge title and content
+	content = fmt.Sprintf("%s\n\n%s", title, content)
+
+	for _, sc := range c.SubChapters() {
+		// merge subchapters
+		content = fmt.Sprintf("%s\n\n\n%s", content, ToMarkdown(sc))
+	}
+
+	return content
+}
+
+// ToEpub writes chapter c and its sub-chapters to an EPUB file whose title
+// is the chapter name and whose author is the chapter author. An empty
+// filename defaults to "<chapter name>.epub". A write failure terminates the
+// program via log.Fatal; on success the output path is printed to stdout.
+func ToEpub(c chapter, filename string) {
+	if filename == "" {
+		filename = fmt.Sprintf("%s.epub", c.Name())
+	}
+
+	// create the ebook and set its metadata
+	ebook := epub.NewEpub(c.Name())
+	ebook.SetAuthor(c.Author())
+
+	AppendToEpub(ebook, c, false)
+
+	if err := ebook.Write(filename); err != nil {
+		log.Fatal(err)
+	}
+
+	fmt.Printf("Ebook saved to \"%s\"\n", filename)
+}
+
+// AppendToEpub adds chapter c to e as a section headed by the chapter name,
+// then recursively appends every sub-chapter of c.
+//
+// Every <img> found in the chapter content is added to the EPUB through
+// e.AddImage and its src is rewritten to the path AddImage returns. When
+// imagesOnly is true the section consists solely of those <img> tags;
+// otherwise it holds the full chapter content.
+//
+// An <img> without a src is skipped, and an image that AddImage rejects
+// keeps its original src instead of having it replaced by an empty path;
+// both cases are logged. Failing to parse the content or to add the section
+// terminates the program via log.Fatal.
+func AppendToEpub(e *epub.Epub, c chapter, imagesOnly bool) {
+	content := ""
+	if !imagesOnly {
+		content = c.Content()
+	}
+
+	// parse content
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// download images and replace src in img tags of content
+	doc.Find("img").Each(func(i int, s *goquery.Selection) {
+		src, ok := s.Attr("src")
+		if !ok || src == "" {
+			// nothing to download or rewrite
+			log.Printf("image %d of %q has no src, skipping", i, c.Name())
+			return
+		}
+
+		imagePath, err := e.AddImage(src, "")
+		if err != nil {
+			// Keep pointing at the original source: rewriting src to the
+			// empty path returned on failure would break the image.
+			log.Printf("cannot add image %q to epub: %v", src, err)
+			imagePath = src
+		}
+
+		if imagesOnly {
+			imageTag, err := goquery.OuterHtml(s)
+			if err != nil {
+				log.Printf("cannot render image %q: %v", src, err)
+				return
+			}
+			content += strings.Replace(imageTag, src, imagePath, 1)
+		} else {
+			content = strings.Replace(content, src, imagePath, 1)
+		}
+	})
+
+	html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), content)
+	_, err = e.AddSection(html, c.Name(), "", "")
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// NOTE(review): sub-chapters are always appended with their full content;
+	// imagesOnly is not propagated. Confirm this is intended.
+	for _, sc := range c.SubChapters() {
+		AppendToEpub(e, sc, false)
+	}
+}
--- a/book/format_test.go
+++ b/book/format_test.go
@@ -0,0 +1,49 @@
+package book
+
+import (
+	"errors"
+	"os"
+	"testing"
+)
+
+func TestToMarkdown(t *testing.T) {
+
+	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()})
+
+	got := ToMarkdown(c)
+	want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011"
+
+	if got != want {
+		t.Errorf("got %q, wanted %q", got, want)
+	}
+}
+
+func TestToEpub(t *testing.T) {
+
+	filename := "ebook.epub"
+	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()})
+	ToEpub(c, filename)
+
+	if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
+		t.Errorf("%s does not exist: %v", filename, err)
+	} else {
+		if err := os.Remove(filename); err != nil {
+			t.Errorf("cannot remove %v: %v", filename, err)
+		}
+	}
+}
+
+func TestToEpubNoFilename(t *testing.T) {
+
+	filename := "Books.epub"
+	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()})
+	ToEpub(c, "")
+
+	if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
+		t.Errorf("%s does not exist: %v", filename, err)
+	} else {
+		if err := os.Remove(filename); err != nil {
+			t.Errorf("cannot remove %v: %v", filename, err)
+		}
+	}
+}
--- a/book/scraper.go
+++ b/book/scraper.go
@@ -1,9 +1,12 @@
 package book

 import (
+	"bytes"
 	"fmt"
+	"io"
 	"log"
 	"math"
+	"net/http"
 	urllib "net/url"
 	"strings"
 	"sync"
@@ -14,14 +17,33 @@ import (
 	colly "github.com/gocolly/colly/v2"
 )

-func NewBookFromURL(url, selector, name, author string, recursive, include bool, limit, offset, delay, threads int) book {
+// ScrapeConfig holds the scraping options for one level of the chapter tree.
+// NewChapterFromURL applies the first config of its list to the page itself
+// and hands the remaining ones to the sub-chapters.
+type ScrapeConfig struct {
+	// selector is passed to GetLinks when collecting sub-chapter links.
+	selector   string
+	// limit is passed to GetLinks as the link limit; the default is -1.
+	limit      int
+	// include decides whether the extracted content is kept for the chapter.
+	include    bool
+	// imagesOnly reduces the kept content to its <img> tags.
+	imagesOnly bool
+}
+
+// NewScrapeConfig returns a ScrapeConfig with the default settings: empty
+// selector, limit -1, content included and images-only mode off.
+func NewScrapeConfig() *ScrapeConfig {
+	defaults := ScrapeConfig{
+		selector:   "",
+		limit:      -1,
+		include:    true,
+		imagesOnly: false,
+	}
+	return &defaults
+}
+
+func NewBookFromURL(url, selector, name, author string, recursive, include, imagesOnly bool, limit, offset, delay, threads int) book {
+	config1 := NewScrapeConfig()
+	config1.imagesOnly = imagesOnly
+
 	var chapters []chapter
 	var home chapter

 	if recursive {
-		chapters, home = tableOfContent(url, selector, limit, offset, delay, threads, include)
+		config2 := NewScrapeConfig()
+		config2.selector = selector
+		config2.limit = limit
+		config2.include = include
+		config2.imagesOnly = imagesOnly
+		chapters, home = tableOfContent(url, config1.selector, config1.limit, offset, delay, threads, config1.include)
 	} else {
-		chapters = []chapter{NewChapterFromURL(url)}
+		chapters = []chapter{NewChapterFromURL(url, []*ScrapeConfig{config1})}
 		home = chapters[0]
 	}

@@ -41,30 +63,82 @@ func NewBookFromURL(url, selector, name, author string, recursive, include bool,
 	return b
 }

-func NewChapterFromURL(url string) chapter {
-	article, err := readability.FromURL(url, 30*time.Second)
+// NewChapterFromURL downloads url and builds a chapter from it.
+//
+// configs holds one ScrapeConfig per level of the chapter tree: configs[0]
+// applies to this page and the remaining entries to its sub-chapters. When
+// more than one config is given, the links GetLinks returns for this page
+// are scraped recursively as sub-chapters using configs[1:]. An empty
+// configs falls back to a single default config.
+//
+// The chapter always carries the downloaded page body, the article title and
+// the byline. Its content is the readability extract when config.include is
+// set (reduced to the <img> tags when config.imagesOnly is also set) and
+// empty otherwise.
+//
+// The page is fetched with a 30 second timeout. Any URL, fetch, read or
+// parse error terminates the program via log.Fatal.
+func NewChapterFromURL(url string, configs []*ScrapeConfig) chapter {
+	// fall back to the defaults instead of panicking on configs[0]
+	if len(configs) == 0 {
+		configs = []*ScrapeConfig{NewScrapeConfig()}
+	}
+	config := configs[0]
+	content := ""
+
+	base, err := urllib.Parse(url)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	subchapters := []chapter{}
+	if len(configs) > 1 {
+		// add subchapters
+
+		links, _, err := GetLinks(base, config.selector, config.limit, 0, false)
+		if err != nil {
+			log.Fatal(err)
+		}
+
+		for _, link := range links {
+			// and then use it to parse relative URLs
+			u, err := base.Parse(link.href)
+			if err != nil {
+				log.Fatal(err)
+			}
+
+			subchapters = append(subchapters, NewChapterFromURL(u.String(), configs[1:]))
+		}
+	}
+
+	// we want the metadata anyway
+
+	// get page body; keep the 30 second limit the readability.FromURL call
+	// used to enforce, since the default HTTP client never times out
+	client := &http.Client{Timeout: 30 * time.Second}
+	response, err := client.Get(url)
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer response.Body.Close()
+
+	// duplicate response stream
+	readabilityReader := &bytes.Buffer{}
+	bodyReader := io.TeeReader(response.Body, readabilityReader)
+
+	// extract HTML body
+	body, err := io.ReadAll(bodyReader)
+	if err != nil {
+		// a truncated download must not be parsed as if it were complete
+		log.Fatalf("failed to read %s, %v\n", url, err)
+	}
+
+	// extract content
+	article, err := readability.FromReader(readabilityReader, base)
 	if err != nil {
 		log.Fatalf("failed to parse %s, %v\n", url, err)
 	}
 
-	content := strings.ReplaceAll(article.Content, "\n", "")
+	// we don't care about the content if we do not include this level
 
-	// if images {
-	// 	// parse html content
-	// 	doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
-	// 	if err != nil {
-	// 		log.Fatal(err)
-	// 	}
+	if config.include {
+		content = article.Content
 
-	// 	// extract images only
-	// 	content = ""
-	// 	doc.Find("img").Each(func(i int, s *goquery.Selection) {
-	// 		newContent, _ := goquery.OuterHtml(s)
-	// 		content += newContent
-	// 	})
-	// }
+		// extract images
+		if config.imagesOnly {
 
-	return chapter{article.Title, article.Byline, content}
+			// parse HTML
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
+			if err != nil {
+				log.Fatal(err)
+			}
+
+			// append every image to content
+			content = ""
+			doc.Find("img").Each(func(i int, s *goquery.Selection) {
+				imageTag, _ := goquery.OuterHtml(s)
+				content += imageTag
+			})
+
+		}
+	}
+
+	return chapter{string(body), article.Title, article.Byline, content, subchapters, config}
 }

 func tableOfContent(url, selector string, limit, offset, delay, threads int, include bool) ([]chapter, chapter) {
@@ -91,7 +165,7 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc
 				log.Fatal(err)
 			}

-			chapters[index] = NewChapterFromURL(u.String())
+			chapters[index] = NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()})
 			progress.Incr(index)

 			// short sleep for last chapter to let the progress bar update
@@ -125,7 +199,7 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc
 					log.Fatal(err)
 				}

-				chapters[index] = NewChapterFromURL(u.String())
+				chapters[index] = NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()})
 				progress.Incr(index)

 				<-semaphore
@@ -202,7 +276,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool)

 	links = links[offset:end]

-	home := NewChapterFromURL(url.String())
+	home := NewChapterFromURL(url.String(), []*ScrapeConfig{NewScrapeConfig()})

 	if include {
 		l := NewLink(url.String(), home.Name())
--- a/book/scraper_test.go
+++ b/book/scraper_test.go
@@ -0,0 +1,131 @@
+package book
+
+import "testing"
+
+func TestBody(t *testing.T) {
+
+	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()})
+
+	got := c.Body()
+	want := "<!doctype html>\n<html lang=\"en-us\">\n  <head>\n    <title>Books</title>\n    <link rel=\"shortcut icon\" href=\"/favicon.ico\" />\n    <meta charset=\"utf-8\" />\n    <meta name=\"generator\" content=\"Hugo 0.59.1\" />\n    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />\n    <meta name=\"author\" content=\"John Doe\" />\n    <meta name=\"description\" content=\" \" />\n    <link rel=\"stylesheet\" href=\"https://books.lapw.at/css/main.min.88e7083eff65effb7485b6e6f38d10afbec25093a6fac42d734ce9024d3defbd.css\" />\n\n    \n    <meta name=\"twitter:card\" content=\"summary\"/>\n<meta name=\"twitter:title\" content=\"Books\"/>\n<meta name=\"twitter:description\" content=\" \"/>\n\n    <meta property=\"og:title\" content=\"Books\" />\n<meta property=\"og:description\" content=\" \" />\n<meta property=\"og:type\" content=\"website\" />\n<meta property=\"og:url\" content=\"https://books.lapw.at/\" />\n\n\n\n  </head>\n  <body>\n    <header class=\"app-header\">\n      <a href=\"https://books.lapw.at/\"><img class=\"app-header-avatar\" src=\"/book.svg\" alt=\"John Doe\" /></a>\n      <h1>Books</h1>\n      <p> </p>\n      <div class=\"app-header-social\">\n        \n      </div>\n    </header>\n    <main class=\"app-container\">\n      \n  <article>\n    <h1>Books</h1>\n    <ul class=\"posts-list\">\n      \n        <li class=\"posts-list-item\">\n          <a class=\"posts-list-item-title\" href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n          <span class=\"posts-list-item-description\">\n            <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\" class=\"icon icon-clock\">\n  <title>clock</title>\n  <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read 
-\n            1637\n          </span>\n        </li>\n      \n        <li class=\"posts-list-item\">\n          <a class=\"posts-list-item-title\" href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n          <span class=\"posts-list-item-description\">\n            <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\" class=\"icon icon-clock\">\n  <title>clock</title>\n  <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n            2011\n          </span>\n        </li>\n      \n    </ul>\n    \n\n\n\n  </article>\n\n    </main>\n  </body>\n</html>\n"
+
+	if got != want {
+		t.Errorf("got %v, wanted %v", got, want)
+	}
+}
+
+func TestName(t *testing.T) {
+
+	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()})
+
+	got := c.Name()
+	want := "Books"
+
+	if got != want {
+		t.Errorf("got %v, wanted %v", got, want)
+	}
+}
+
+func TestAuthor(t *testing.T) {
+
+	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()})
+
+	got := c.Author()
+	want := "John Doe"
+
+	if got != want {
+		t.Errorf("got %v, wanted %v", got, want)
+	}
+}
+
+func TestContent(t *testing.T) {
+
+	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()})
+
+	got := c.Content()
+	want := "<div id=\"readability-page-1\" class=\"page\">\n    \n    <main>\n      \n  <article>\n    \n    <ul>\n      \n        <li>\n          <a href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n          <span>\n            <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n  <title>clock</title>\n  <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read -\n            1637\n          </span>\n        </li>\n      \n        <li>\n          <a href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n          <span>\n            <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n  <title>clock</title>\n  <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n            2011\n          </span>\n        </li>\n      \n    </ul>\n    \n\n\n\n  </article>\n\n    </main>\n  \n\n</div>"
+
+	if got != want {
+		t.Errorf("got %v, wanted %v", got, want)
+	}
+}
+
+func TestContentImagesOnly(t *testing.T) {
+
+	config := NewScrapeConfig()
+	config.imagesOnly = true
+	c := NewChapterFromURL("https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/", []*ScrapeConfig{config})
+
+	got := c.Content()
+	want := "<img src=\"https://books.lapw.at/images/codebase-deploys.png\" alt=\"One codebase maps to many deploys\"/><img src=\"https://books.lapw.at/images/attached-resources.png\" alt=\"A production deploy attached to four backing services.\"/><img src=\"https://books.lapw.at/images/release.png\" alt=\"Code becomes a build, which is combined with config to create a release.\"/><img src=\"https://books.lapw.at/images/process-types.png\" alt=\"Scale is expressed as running processes, workload diversity is expressed as process types.\"/>"
+
+	if got != want {
+		t.Errorf("got %v, wanted %v", got, want)
+	}
+}
+
+// TestSubChapters scrapes the live books.lapw.at index page with two config
+// levels and expects its two linked posts as sub-chapters.
+func TestSubChapters(t *testing.T) {
+	levels := []*ScrapeConfig{NewScrapeConfig(), NewScrapeConfig()}
+	page := NewChapterFromURL("https://books.lapw.at/", levels)
+
+	if got, want := len(page.SubChapters()), 2; got != want {
+		t.Errorf("got %v, wanted %v", got, want)
+	}
+}
+
+// TestSubChaptersSelector scrapes the live 12factor.net page with a CSS
+// selector for the top level and expects twelve sub-chapters.
+func TestSubChaptersSelector(t *testing.T) {
+	// same values as the defaults, except for the selector
+	top := &ScrapeConfig{
+		selector:   "section.concrete > article > h2 > a",
+		limit:      -1,
+		include:    true,
+		imagesOnly: false,
+	}
+
+	page := NewChapterFromURL("https://12factor.net/", []*ScrapeConfig{top, NewScrapeConfig()})
+
+	if got, want := len(page.SubChapters()), 12; got != want {
+		t.Errorf("got %v, wanted %v", got, want)
+	}
+}
+
+// TestSubChaptersLimit scrapes the live books.lapw.at index page with a
+// top-level limit of 1 and expects exactly one sub-chapter.
+func TestSubChaptersLimit(t *testing.T) {
+	top := NewScrapeConfig()
+	top.limit = 1
+
+	page := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{top, NewScrapeConfig()})
+
+	if got, want := len(page.SubChapters()), 1; got != want {
+		t.Errorf("got %v, wanted %v", got, want)
+	}
+}
+
+// TestSubChaptersLimitOver scrapes the live books.lapw.at index page with a
+// limit of 3, which exceeds the two links the test expects on the page, and
+// expects both of them as sub-chapters.
+func TestSubChaptersLimitOver(t *testing.T) {
+	top := NewScrapeConfig()
+	top.limit = 3
+
+	page := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{top, NewScrapeConfig()})
+
+	if got, want := len(page.SubChapters()), 2; got != want {
+		t.Errorf("got %v, wanted %v", got, want)
+	}
+}
+
+func TestNotInclude(t *testing.T) {
+
+	config := NewScrapeConfig()
+	config.include = false
+	c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config})
+
+	got := c.Content()
+	want := ""
+
+	if got != want {
+		t.Errorf("got %v, wanted %v", got, want)
+	}
+}