test tomobi, update progress style

This commit is contained in:
lapwat
2021-12-27 13:01:45 +01:00
parent ff3d09c727
commit 29008185a8
6 changed files with 316 additions and 80 deletions

View File

@@ -1,9 +1,12 @@
install:
go install
format: format:
gofmt -s -w . gofmt -s -w .
test:
go test github.com/lapwat/papeer/book
install:
go install
clean: clean:
find . -maxdepth 1 -not -name 'README.md' -name '*.md' -delete find . -maxdepth 1 -not -name 'README.md' -name '*.md' -delete
find . -maxdepth 1 -name '*.epub' -delete find . -maxdepth 1 -name '*.epub' -delete

View File

@@ -3,6 +3,8 @@ package book
import ( import (
"fmt" "fmt"
"log" "log"
"os"
"os/exec"
"strings" "strings"
md "github.com/JohannesKaufmann/html-to-markdown" md "github.com/JohannesKaufmann/html-to-markdown"
@@ -10,6 +12,15 @@ import (
epub "github.com/bmaupin/go-epub" epub "github.com/bmaupin/go-epub"
) )
// Filename derives a file name from name: every space is turned into an
// underscore and every forward slash is removed.
func Filename(name string) string {
	// A single replacer pass is equivalent to applying the two substitutions
	// one after the other, because neither replacement text contains a
	// character that the other rule matches.
	sanitizer := strings.NewReplacer(" ", "_", "/", "")
	return sanitizer.Replace(name)
}
func ToMarkdown(c chapter) string { func ToMarkdown(c chapter) string {
// make title // make title
@@ -33,7 +44,7 @@ func ToMarkdown(c chapter) string {
return content return content
} }
func ToEpub(c chapter, filename string) { func ToEpub(c chapter, filename string) string {
if len(filename) == 0 { if len(filename) == 0 {
filename = fmt.Sprintf("%s.epub", c.Name()) filename = fmt.Sprintf("%s.epub", c.Name())
} }
@@ -50,6 +61,8 @@ func ToEpub(c chapter, filename string) {
} }
fmt.Printf("Ebook saved to \"%s\"\n", filename) fmt.Printf("Ebook saved to \"%s\"\n", filename)
return filename
} }
func AppendToEpub(e *epub.Epub, c chapter, imagesOnly bool) { func AppendToEpub(e *epub.Epub, c chapter, imagesOnly bool) {
@@ -88,3 +101,34 @@ func AppendToEpub(e *epub.Epub, c chapter, imagesOnly bool) {
AppendToEpub(e, sc, false) AppendToEpub(e, sc, false)
} }
} }
// ToMobi exports chapter c as a MOBI ebook and returns the name of the file
// it reports as saved.
//
// When filename is empty, the chapter name followed by ".mobi" is used;
// otherwise ".mobi" is appended to filename if it is not already its suffix.
// The chapter is first written to an intermediate EPUB with ToEpub, the
// external "kindlegen" program is run on that EPUB, and the EPUB is removed
// afterwards. The process is terminated through log.Fatal if kindlegen cannot
// be started or if the intermediate EPUB cannot be removed.
func ToMobi(c chapter, filename string) string {
	if len(filename) == 0 {
		filename = fmt.Sprintf("%s.mobi", c.Name())
	} else if !strings.HasSuffix(filename, ".mobi") {
		// add .mobi extension if not specified
		filename = fmt.Sprintf("%s.mobi", filename)
	}

	// Swap only the trailing extension: a ".mobi" occurring earlier in the
	// name must stay untouched, otherwise the file produced from the EPUB
	// would not carry the name reported and returned below.
	filenameEPUB := strings.TrimSuffix(filename, ".mobi") + ".epub"
	ToEpub(c, filenameEPUB)

	// kindlegen has been observed to exit with status 1 even when the
	// conversion succeeds, so a non-zero exit status is deliberately ignored.
	// Any other error means the command did not run at all (for example the
	// binary is not installed), in which case no MOBI file was produced.
	if err := exec.Command("kindlegen", filenameEPUB).Run(); err != nil {
		if _, exited := err.(*exec.ExitError); !exited {
			log.Fatal(err)
		}
	}

	fmt.Printf("Ebook saved to \"%s\"\n", filename)

	// the EPUB was only an intermediate artifact
	if err := os.Remove(filenameEPUB); err != nil {
		log.Fatal(err)
	}

	return filename
}

View File

@@ -6,9 +6,20 @@ import (
"testing" "testing"
) )
// TestFilename checks that Filename replaces spaces with underscores and
// drops forward slashes.
func TestFilename(t *testing.T) {
	const (
		input = "This is a chapter / book"
		want  = "This_is_a_chapter__book"
	)

	if got := Filename(input); got != want {
		t.Errorf("got %q, wanted %q", got, want)
	}
}
func TestToMarkdown(t *testing.T) { func TestToMarkdown(t *testing.T) {
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
got := ToMarkdown(c) got := ToMarkdown(c)
want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011" want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011"
@@ -16,27 +27,13 @@ func TestToMarkdown(t *testing.T) {
if got != want { if got != want {
t.Errorf("got %q, wanted %q", got, want) t.Errorf("got %q, wanted %q", got, want)
} }
} }
func TestToEpub(t *testing.T) { func TestToEpub(t *testing.T) {
filename := "ebook.epub"
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()})
ToEpub(c, filename)
if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
t.Errorf("%s does not exist: %v", filename, err)
} else {
if err := os.Remove(filename); err != nil {
t.Errorf("cannot remove %v: %v", filename, err)
}
}
}
func TestToEpubNoFilename(t *testing.T) {
filename := "Books.epub" filename := "Books.epub"
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
ToEpub(c, "") ToEpub(c, "")
if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) { if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
@@ -46,4 +43,37 @@ func TestToEpubNoFilename(t *testing.T) {
t.Errorf("cannot remove %v: %v", filename, err) t.Errorf("cannot remove %v: %v", filename, err)
} }
} }
}
func TestToEpubFilename(t *testing.T) {
filename := "ebook.epub"
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
ToEpub(c, filename)
if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
t.Errorf("%s does not exist: %v", filename, err)
} else {
if err := os.Remove(filename); err != nil {
t.Errorf("cannot remove %v: %v", filename, err)
}
}
}
func TestToMobi(t *testing.T) {
filename := "ebook.mobi"
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
ToMobi(c, filename)
if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
t.Errorf("%s does not exist: %v", filename, err)
} else {
if err := os.Remove(filename); err != nil {
t.Errorf("cannot remove %v: %v", filename, err)
}
}
} }

View File

@@ -2,6 +2,7 @@ package book
import ( import (
"fmt" "fmt"
"strings"
"github.com/gosuri/uiprogress" "github.com/gosuri/uiprogress"
) )
@@ -11,20 +12,22 @@ type progress struct {
individuals []*uiprogress.Bar individuals []*uiprogress.Bar
} }
func NewProgress(links []link) progress { func NewProgress(links []link, parent string, depth int) progress {
uiprogress.Start() uiprogress.Start()
global := uiprogress.AddBar(len(links)) global := uiprogress.AddBar(len(links))
indentGlobal := strings.Repeat("> ", depth)
global.AppendFunc(func(b *uiprogress.Bar) string { global.AppendFunc(func(b *uiprogress.Bar) string {
return fmt.Sprintf("Chapters %d / %d", b.Current(), len(links)) return fmt.Sprintf("%v%v (%v / %v)", indentGlobal, parent, b.Current(), len(links))
}) })
// hide individual bars if more than 50 chapters // hide individual bars if more than 50 chapters
individuals := []*uiprogress.Bar{} individuals := []*uiprogress.Bar{}
indent := strings.Repeat("- ", depth)
if len(links) <= 50 { if len(links) <= 50 {
for index, link := range links { for index, link := range links {
bar := uiprogress.AddBar(1) bar := uiprogress.AddBar(1)
barText := fmt.Sprintf("%d. %s", index+1, link.text) barText := fmt.Sprintf("%v#%v %v", indent, index+1, link.Text())
bar.AppendFunc(func(b *uiprogress.Bar) string { bar.AppendFunc(func(b *uiprogress.Bar) string {
return barText return barText
}) })
@@ -35,13 +38,22 @@ func NewProgress(links []link) progress {
return progress{global, individuals} return progress{global, individuals}
} }
func (p *progress) IncrGlobal() { func (p *progress) IncrementGlobal() {
p.global.Incr() p.global.Incr()
} }
func (p *progress) Incr(index int) { func (p *progress) Increment(index int) {
p.global.Incr() p.IncrementGlobal()
if len(p.individuals) > index { if len(p.individuals) > index {
p.individuals[index].Incr() p.individuals[index].Incr()
} }
} }
// UpdateName registers name as a label on the individual bar at index by
// passing a decorator that returns it to the bar's AppendFunc.
//
// The call does nothing when index does not refer to an individual bar. This
// includes the case where NewProgress created no individual bars because
// there were more than 50 links.
//
// NOTE(review): AppendFunc presumably adds to the decorators already
// registered by NewProgress rather than replacing them — confirm against
// uiprogress.
func (p *progress) UpdateName(index int, name string) {
	// reject negative indexes too, they would panic on the slice access
	if index < 0 || index >= len(p.individuals) {
		return
	}

	// name is already a string, so it is captured as is
	p.individuals[index].AppendFunc(func(b *uiprogress.Bar) string {
		return name
	})
}

View File

@@ -18,14 +18,55 @@ import (
) )
type ScrapeConfig struct { type ScrapeConfig struct {
depth int
selector string selector string
limit int limit int
offset int
delay int
threads int
include bool include bool
imagesOnly bool imagesOnly bool
} }
func NewScrapeConfig() *ScrapeConfig { func NewScrapeConfig() *ScrapeConfig {
return &ScrapeConfig{"", -1, true, false} return &ScrapeConfig{0, "", -1, 0, -1, -1, true, false}
}
// NewScrapeConfigsAjin returns a fixed three-level list of scrape
// configurations, one per depth (0, 1 and 2), each starting from the
// NewScrapeConfig defaults.
//
// NOTE(review): presumably a preset for one specific site ("Ajin") — the
// selectors are hard-coded; confirm the intended target with the author.
func NewScrapeConfigsAjin() []*ScrapeConfig {
	// depth 0: links matched by ".dt>a", page content not included
	top := NewScrapeConfig()
	top.depth, top.offset = 0, 0
	top.selector = ".dt>a"
	top.limit = 3
	top.delay = 5000
	top.include = false

	// depth 1: links matched by ".nav_apb>a", starting at offset 1,
	// page content not included
	middle := NewScrapeConfig()
	middle.depth, middle.offset = 1, 1
	middle.selector = ".nav_apb>a"
	middle.limit = 3
	middle.delay = 5000
	middle.include = false

	// depth 2: defaults, except that only images are kept
	leaf := NewScrapeConfig()
	leaf.depth = 2
	leaf.imagesOnly = true

	configs := make([]*ScrapeConfig, 0, 3)
	configs = append(configs, top, middle, leaf)
	return configs
}
func NewScrapeConfigsWikipedia() []*ScrapeConfig {
config0 := NewScrapeConfig()
config0.depth = 0
config0.threads = -1
config0.include = true
config1 := NewScrapeConfig()
config1.depth = 1
config1.include = true
return []*ScrapeConfig{config0, config1}
} }
func NewBookFromURL(url, selector, name, author string, recursive, include, imagesOnly bool, limit, offset, delay, threads int) book { func NewBookFromURL(url, selector, name, author string, recursive, include, imagesOnly bool, limit, offset, delay, threads int) book {
@@ -39,11 +80,14 @@ func NewBookFromURL(url, selector, name, author string, recursive, include, imag
config2 := NewScrapeConfig() config2 := NewScrapeConfig()
config2.selector = selector config2.selector = selector
config2.limit = limit config2.limit = limit
config2.offset = offset
config2.delay = delay
config2.threads = threads
config2.include = include config2.include = include
config2.imagesOnly = imagesOnly config2.imagesOnly = imagesOnly
chapters, home = tableOfContent(url, config1.selector, config1.limit, offset, delay, threads, config1.include) chapters, home = tableOfContent(url, config2)
} else { } else {
chapters = []chapter{NewChapterFromURL(url, []*ScrapeConfig{config1})} chapters = []chapter{NewChapterFromURL(url, []*ScrapeConfig{config1}, 0, func(index int, name string) {})}
home = chapters[0] home = chapters[0]
} }
@@ -63,37 +107,14 @@ func NewBookFromURL(url, selector, name, author string, recursive, include, imag
return b return b
} }
func NewChapterFromURL(url string, configs []*ScrapeConfig) chapter { func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updateProgressBarName func(index int, name string)) chapter {
config := configs[0] config := configs[0]
content := ""
base, err := urllib.Parse(url) base, err := urllib.Parse(url)
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
subchapters := []chapter{}
if len(configs) > 1 {
// add subchapters
links, _, err := GetLinks(base, config.selector, config.limit, 0, false)
if err != nil {
log.Fatal(err)
}
for _, link := range links {
// and then use it to parse relative URLs
u, err := base.Parse(link.href)
if err != nil {
log.Fatal(err)
}
subchapters = append(subchapters, NewChapterFromURL(u.String(), configs[1:]))
}
}
// we want the metadata anyway
// get page body // get page body
response, err := http.Get(url) response, err := http.Get(url)
if err != nil { if err != nil {
@@ -108,15 +129,84 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig) chapter {
// extract HTML body // extract HTML body
body, err := io.ReadAll(bodyReader) body, err := io.ReadAll(bodyReader)
// extract content // extract article content and metadata
article, err := readability.FromReader(readabilityReader, base) article, err := readability.FromReader(readabilityReader, base)
if err != nil { if err != nil {
log.Fatalf("failed to parse %s, %v\n", url, err) log.Fatalf("failed to parse %s, %v\n", url, err)
} }
name := article.Title
// we don't care about the content if we do not include this level // notify progress bar
updateProgressBarName(index, name)
subchapters := []chapter{}
if len(configs) > 1 {
// add subchapters
links, _, err := GetLinks(base, config.selector, config.limit, config.offset, false)
if err != nil {
log.Fatal(err)
}
subchapters = make([]chapter, len(links))
progress := NewProgress(links, name, config.depth)
if config.delay >= 0 {
// synchronous mode
for index, link := range links {
// and then use it to parse relative URLs
u, err := base.Parse(link.href)
if err != nil {
log.Fatal(err)
}
sc := NewChapterFromURL(u.String(), configs[1:], index, progress.UpdateName)
subchapters[index] = sc
progress.Increment(index)
time.Sleep(time.Duration(config.delay) * time.Millisecond)
}
} else {
// asynchronous mode
var wg sync.WaitGroup
threads := config.threads
if threads == -1 {
threads = len(links)
}
semaphore := make(chan bool, threads)
for index, l := range links {
wg.Add(1)
semaphore <- true
go func(index int, l link) {
defer wg.Done()
// and then use it to parse relative URLs
u, err := base.Parse(l.href)
if err != nil {
log.Fatal(err)
}
sc := NewChapterFromURL(u.String(), configs[1:], index, progress.UpdateName)
subchapters[index] = sc
progress.Increment(index)
<-semaphore
}(index, l)
}
wg.Wait()
}
}
content := ""
if config.include { if config.include {
// we care about the content only if we include this level
content = article.Content content = article.Content
// extract images // extract images
@@ -138,22 +228,23 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig) chapter {
} }
} }
return chapter{string(body), article.Title, article.Byline, content, subchapters, config} return chapter{string(body), name, article.Byline, content, subchapters, config}
} }
func tableOfContent(url, selector string, limit, offset, delay, threads int, include bool) ([]chapter, chapter) { func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) {
base, err := urllib.Parse(url) base, err := urllib.Parse(url)
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
links, home, err := GetLinks(base, selector, limit, offset, include) links, home, err := GetLinks(base, config.selector, config.limit, config.offset, config.include)
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
chapters := make([]chapter, len(links)) chapters := make([]chapter, len(links))
progress := NewProgress(links) progress := NewProgress(links, "", 0)
delay := config.delay
if delay >= 0 { if delay >= 0 {
// synchronous mode // synchronous mode
@@ -165,8 +256,9 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc
log.Fatal(err) log.Fatal(err)
} }
chapters[index] = NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}) sc := NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
progress.Incr(index) chapters[index] = sc
progress.Increment(index)
// short sleep for last chapter to let the progress bar update // short sleep for last chapter to let the progress bar update
if index == len(links)-1 { if index == len(links)-1 {
@@ -180,6 +272,7 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc
// asynchronous mode // asynchronous mode
var wg sync.WaitGroup var wg sync.WaitGroup
threads := config.threads
if threads == -1 { if threads == -1 {
threads = len(links) threads = len(links)
} }
@@ -199,8 +292,9 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc
log.Fatal(err) log.Fatal(err)
} }
chapters[index] = NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}) sc := NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
progress.Incr(index) chapters[index] = sc
progress.Increment(index)
<-semaphore <-semaphore
}(index, l) }(index, l)
@@ -276,7 +370,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool)
links = links[offset:end] links = links[offset:end]
home := NewChapterFromURL(url.String(), []*ScrapeConfig{NewScrapeConfig()}) home := NewChapterFromURL(url.String(), []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
if include { if include {
l := NewLink(url.String(), home.Name()) l := NewLink(url.String(), home.Name())

View File

@@ -1,10 +1,14 @@
package book package book
import "testing" import (
"testing"
"time"
)
func TestBody(t *testing.T) { func TestBody(t *testing.T) {
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) config := NewScrapeConfig()
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
got := c.Body() got := c.Body()
want := "<!doctype html>\n<html lang=\"en-us\">\n <head>\n <title>Books</title>\n <link rel=\"shortcut icon\" href=\"/favicon.ico\" />\n <meta charset=\"utf-8\" />\n <meta name=\"generator\" content=\"Hugo 0.59.1\" />\n <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />\n <meta name=\"author\" content=\"John Doe\" />\n <meta name=\"description\" content=\" \" />\n <link rel=\"stylesheet\" href=\"https://books.lapw.at/css/main.min.88e7083eff65effb7485b6e6f38d10afbec25093a6fac42d734ce9024d3defbd.css\" />\n\n \n <meta name=\"twitter:card\" content=\"summary\"/>\n<meta name=\"twitter:title\" content=\"Books\"/>\n<meta name=\"twitter:description\" content=\" \"/>\n\n <meta property=\"og:title\" content=\"Books\" />\n<meta property=\"og:description\" content=\" \" />\n<meta property=\"og:type\" content=\"website\" />\n<meta property=\"og:url\" content=\"https://books.lapw.at/\" />\n\n\n\n </head>\n <body>\n <header class=\"app-header\">\n <a href=\"https://books.lapw.at/\"><img class=\"app-header-avatar\" src=\"/book.svg\" alt=\"John Doe\" /></a>\n <h1>Books</h1>\n <p> </p>\n <div class=\"app-header-social\">\n \n </div>\n </header>\n <main class=\"app-container\">\n \n <article>\n <h1>Books</h1>\n <ul class=\"posts-list\">\n \n <li class=\"posts-list-item\">\n <a class=\"posts-list-item-title\" href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n <span class=\"posts-list-item-description\">\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\" class=\"icon icon-clock\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read -\n 1637\n </span>\n </li>\n \n <li class=\"posts-list-item\">\n <a class=\"posts-list-item-title\" 
href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n <span class=\"posts-list-item-description\">\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\" class=\"icon icon-clock\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n 2011\n </span>\n </li>\n \n </ul>\n \n\n\n\n </article>\n\n </main>\n </body>\n</html>\n" want := "<!doctype html>\n<html lang=\"en-us\">\n <head>\n <title>Books</title>\n <link rel=\"shortcut icon\" href=\"/favicon.ico\" />\n <meta charset=\"utf-8\" />\n <meta name=\"generator\" content=\"Hugo 0.59.1\" />\n <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />\n <meta name=\"author\" content=\"John Doe\" />\n <meta name=\"description\" content=\" \" />\n <link rel=\"stylesheet\" href=\"https://books.lapw.at/css/main.min.88e7083eff65effb7485b6e6f38d10afbec25093a6fac42d734ce9024d3defbd.css\" />\n\n \n <meta name=\"twitter:card\" content=\"summary\"/>\n<meta name=\"twitter:title\" content=\"Books\"/>\n<meta name=\"twitter:description\" content=\" \"/>\n\n <meta property=\"og:title\" content=\"Books\" />\n<meta property=\"og:description\" content=\" \" />\n<meta property=\"og:type\" content=\"website\" />\n<meta property=\"og:url\" content=\"https://books.lapw.at/\" />\n\n\n\n </head>\n <body>\n <header class=\"app-header\">\n <a href=\"https://books.lapw.at/\"><img class=\"app-header-avatar\" src=\"/book.svg\" alt=\"John Doe\" /></a>\n <h1>Books</h1>\n <p> </p>\n <div class=\"app-header-social\">\n \n </div>\n </header>\n <main class=\"app-container\">\n \n <article>\n <h1>Books</h1>\n <ul class=\"posts-list\">\n \n <li class=\"posts-list-item\">\n <a class=\"posts-list-item-title\" 
href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n <span class=\"posts-list-item-description\">\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\" class=\"icon icon-clock\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read -\n 1637\n </span>\n </li>\n \n <li class=\"posts-list-item\">\n <a class=\"posts-list-item-title\" href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n <span class=\"posts-list-item-description\">\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\" class=\"icon icon-clock\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n 2011\n </span>\n </li>\n \n </ul>\n \n\n\n\n </article>\n\n </main>\n </body>\n</html>\n"
@@ -12,11 +16,13 @@ func TestBody(t *testing.T) {
if got != want { if got != want {
t.Errorf("got %v, wanted %v", got, want) t.Errorf("got %v, wanted %v", got, want)
} }
} }
func TestName(t *testing.T) { func TestName(t *testing.T) {
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) config := NewScrapeConfig()
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
got := c.Name() got := c.Name()
want := "Books" want := "Books"
@@ -24,11 +30,13 @@ func TestName(t *testing.T) {
if got != want { if got != want {
t.Errorf("got %v, wanted %v", got, want) t.Errorf("got %v, wanted %v", got, want)
} }
} }
func TestAuthor(t *testing.T) { func TestAuthor(t *testing.T) {
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) config := NewScrapeConfig()
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
got := c.Author() got := c.Author()
want := "John Doe" want := "John Doe"
@@ -36,11 +44,13 @@ func TestAuthor(t *testing.T) {
if got != want { if got != want {
t.Errorf("got %v, wanted %v", got, want) t.Errorf("got %v, wanted %v", got, want)
} }
} }
func TestContent(t *testing.T) { func TestContent(t *testing.T) {
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) config := NewScrapeConfig()
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
got := c.Content() got := c.Content()
want := "<div id=\"readability-page-1\" class=\"page\">\n \n <main>\n \n <article>\n \n <ul>\n \n <li>\n <a href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read -\n 1637\n </span>\n </li>\n \n <li>\n <a href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n 2011\n </span>\n </li>\n \n </ul>\n \n\n\n\n </article>\n\n </main>\n \n\n</div>" want := "<div id=\"readability-page-1\" class=\"page\">\n \n <main>\n \n <article>\n \n <ul>\n \n <li>\n <a href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read -\n 1637\n </span>\n </li>\n \n <li>\n <a href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" 
stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n 2011\n </span>\n </li>\n \n </ul>\n \n\n\n\n </article>\n\n </main>\n \n\n</div>"
@@ -48,13 +58,35 @@ func TestContent(t *testing.T) {
if got != want { if got != want {
t.Errorf("got %v, wanted %v", got, want) t.Errorf("got %v, wanted %v", got, want)
} }
}
func TestDelay(t *testing.T) {
config0 := NewScrapeConfig()
config0.delay = 500
config1 := NewScrapeConfig()
start := time.Now()
NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
elapsed := time.Since(start)
got := elapsed
want := time.Duration(500) * time.Millisecond
if got < want {
t.Errorf("got %v, wanted min %v", got, want)
}
} }
func TestContentImagesOnly(t *testing.T) { func TestContentImagesOnly(t *testing.T) {
config := NewScrapeConfig() config := NewScrapeConfig()
config.imagesOnly = true config.imagesOnly = true
c := NewChapterFromURL("https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/", []*ScrapeConfig{config})
c := NewChapterFromURL("https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
got := c.Content() got := c.Content()
want := "<img src=\"https://books.lapw.at/images/codebase-deploys.png\" alt=\"One codebase maps to many deploys\"/><img src=\"https://books.lapw.at/images/attached-resources.png\" alt=\"A production deploy attached to four backing services.\"/><img src=\"https://books.lapw.at/images/release.png\" alt=\"Code becomes a build, which is combined with config to create a release.\"/><img src=\"https://books.lapw.at/images/process-types.png\" alt=\"Scale is expressed as running processes, workload diversity is expressed as process types.\"/>" want := "<img src=\"https://books.lapw.at/images/codebase-deploys.png\" alt=\"One codebase maps to many deploys\"/><img src=\"https://books.lapw.at/images/attached-resources.png\" alt=\"A production deploy attached to four backing services.\"/><img src=\"https://books.lapw.at/images/release.png\" alt=\"Code becomes a build, which is combined with config to create a release.\"/><img src=\"https://books.lapw.at/images/process-types.png\" alt=\"Scale is expressed as running processes, workload diversity is expressed as process types.\"/>"
@@ -62,11 +94,15 @@ func TestContentImagesOnly(t *testing.T) {
if got != want { if got != want {
t.Errorf("got %v, wanted %v", got, want) t.Errorf("got %v, wanted %v", got, want)
} }
} }
func TestSubChapters(t *testing.T) { func TestSubChapters(t *testing.T) {
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig(), NewScrapeConfig()}) config0 := NewScrapeConfig()
config1 := NewScrapeConfig()
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
got := len(c.SubChapters()) got := len(c.SubChapters())
want := 2 want := 2
@@ -74,11 +110,17 @@ func TestSubChapters(t *testing.T) {
if got != want { if got != want {
t.Errorf("got %v, wanted %v", got, want) t.Errorf("got %v, wanted %v", got, want)
} }
} }
func TestSubChaptersSelector(t *testing.T) { func TestSubChaptersSelector(t *testing.T) {
c := NewChapterFromURL("https://12factor.net/", []*ScrapeConfig{{"section.concrete > article > h2 > a", -1, true, false}, NewScrapeConfig()}) config0 := NewScrapeConfig()
config0.selector = "section.concrete > article > h2 > a"
config1 := NewScrapeConfig()
c := NewChapterFromURL("https://12factor.net/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
got := len(c.SubChapters()) got := len(c.SubChapters())
want := 12 want := 12
@@ -86,13 +128,17 @@ func TestSubChaptersSelector(t *testing.T) {
if got != want { if got != want {
t.Errorf("got %v, wanted %v", got, want) t.Errorf("got %v, wanted %v", got, want)
} }
} }
func TestSubChaptersLimit(t *testing.T) { func TestSubChaptersLimit(t *testing.T) {
config := NewScrapeConfig() config0 := NewScrapeConfig()
config.limit = 1 config0.limit = 1
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config, NewScrapeConfig()})
config1 := NewScrapeConfig()
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
got := len(c.SubChapters()) got := len(c.SubChapters())
want := 1 want := 1
@@ -100,13 +146,17 @@ func TestSubChaptersLimit(t *testing.T) {
if got != want { if got != want {
t.Errorf("got %v, wanted %v", got, want) t.Errorf("got %v, wanted %v", got, want)
} }
} }
func TestSubChaptersLimitOver(t *testing.T) { func TestSubChaptersLimitOver(t *testing.T) {
config := NewScrapeConfig() config0 := NewScrapeConfig()
config.limit = 3 config0.limit = 3
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config, NewScrapeConfig()})
config1 := NewScrapeConfig()
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
got := len(c.SubChapters()) got := len(c.SubChapters())
want := 2 want := 2
@@ -114,13 +164,15 @@ func TestSubChaptersLimitOver(t *testing.T) {
if got != want { if got != want {
t.Errorf("got %v, wanted %v", got, want) t.Errorf("got %v, wanted %v", got, want)
} }
} }
func TestNotInclude(t *testing.T) { func TestNotInclude(t *testing.T) {
config := NewScrapeConfig() config := NewScrapeConfig()
config.include = false config.include = false
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config})
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {})
got := c.Content() got := c.Content()
want := "" want := ""
@@ -128,4 +180,5 @@ func TestNotInclude(t *testing.T) {
if got != want { if got != want {
t.Errorf("got %v, wanted %v", got, want) t.Errorf("got %v, wanted %v", got, want)
} }
} }