From 29008185a829aa269aa6514e3f173d7658488f63 Mon Sep 17 00:00:00 2001 From: lapwat Date: Mon, 27 Dec 2021 13:01:45 +0100 Subject: [PATCH] test tomobi, update progress style --- Makefile | 9 ++- book/format.go | 46 +++++++++++- book/format_test.go | 64 +++++++++++----- book/progress.go | 24 ++++-- book/scraper.go | 170 +++++++++++++++++++++++++++++++++---------- book/scraper_test.go | 83 +++++++++++++++++---- 6 files changed, 316 insertions(+), 80 deletions(-) diff --git a/Makefile b/Makefile index 4abac44..4543256 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,12 @@ -install: - go install - format: gofmt -s -w . +test: + go test github.com/lapwat/papeer/book + +install: + go install + clean: find . -maxdepth 1 -not -name 'README.md' -name '*.md' -delete find . -maxdepth 1 -name '*.epub' -delete diff --git a/book/format.go b/book/format.go index 6a8d0df..7d25094 100644 --- a/book/format.go +++ b/book/format.go @@ -3,6 +3,8 @@ package book import ( "fmt" "log" + "os" + "os/exec" "strings" md "github.com/JohannesKaufmann/html-to-markdown" @@ -10,6 +12,15 @@ import ( epub "github.com/bmaupin/go-epub" ) +func Filename(name string) string { + filename := name + + filename = strings.ReplaceAll(filename, " ", "_") + filename = strings.ReplaceAll(filename, "/", "") + + return filename +} + func ToMarkdown(c chapter) string { // make title @@ -33,7 +44,7 @@ func ToMarkdown(c chapter) string { return content } -func ToEpub(c chapter, filename string) { +func ToEpub(c chapter, filename string) string { if len(filename) == 0 { filename = fmt.Sprintf("%s.epub", c.Name()) } @@ -50,6 +61,8 @@ func ToEpub(c chapter, filename string) { } fmt.Printf("Ebook saved to \"%s\"\n", filename) + + return filename } func AppendToEpub(e *epub.Epub, c chapter, imagesOnly bool) { @@ -88,3 +101,34 @@ func AppendToEpub(e *epub.Epub, c chapter, imagesOnly bool) { AppendToEpub(e, sc, false) } } + +func ToMobi(c chapter, filename string) string { + if len(filename) == 0 { + filename = fmt.Sprintf("%s.mobi", c.Name()) + } else { + + // add .mobi extension if not specified + if strings.HasSuffix(filename, ".mobi") == false { + filename = fmt.Sprintf("%s.mobi", filename) + } + + } + + filenameEPUB := strings.ReplaceAll(filename, ".mobi", ".epub") + ToEpub(c, filenameEPUB) + + exec.Command("kindlegen", filenameEPUB).Run() + // exec command always return status 1 even if it succeed + // if err != nil { + // log.Fatal(err) + // } + + fmt.Printf("Ebook saved to \"%s\"\n", filename) + + err := os.Remove(filenameEPUB) + if err != nil { + log.Fatal(err) + } + + return filename +} diff --git a/book/format_test.go b/book/format_test.go index 0f7accf..8769bd5 100644 --- a/book/format_test.go +++ b/book/format_test.go @@ -6,9 +6,20 @@ import ( "testing" ) +func TestFilename(t *testing.T) { + + got := Filename("This is a chapter / book") + want := "This_is_a_chapter__book" + + if got != want { + t.Errorf("got %q, wanted %q", got, want) + } + +} + func TestToMarkdown(t *testing.T) { - c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) got := ToMarkdown(c) want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011" @@ -16,27 +27,13 @@ func TestToMarkdown(t *testing.T) { if got != want { t.Errorf("got %q, wanted %q", got, want) } + } func TestToEpub(t *testing.T) { - filename := "ebook.epub" - c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) - ToEpub(c, filename) - - if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) { - t.Errorf("%s does not exist: %v", filename, err) - } else { - if err := os.Remove(filename); err != nil { - t.Errorf("cannot remove %v: %v", filename, err) - } - } -} - -func TestToEpubNoFilename(t *testing.T) { - filename := "Books.epub" - c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) ToEpub(c, "") if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) { @@ -46,4 +43,37 @@ func TestToEpubNoFilename(t *testing.T) { t.Errorf("cannot remove %v: %v", filename, err) } } + +} + +func TestToEpubFilename(t *testing.T) { + + filename := "ebook.epub" + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) + ToEpub(c, filename) + + if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) { + t.Errorf("%s does not exist: %v", filename, err) + } else { + if err := os.Remove(filename); err != nil { + t.Errorf("cannot remove %v: %v", filename, err) + } + } + +} + +func TestToMobi(t *testing.T) { + + filename := "ebook.mobi" + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) + ToMobi(c, filename) + + if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) { + t.Errorf("%s does not exist: %v", filename, err) + } else { + if err := os.Remove(filename); err != nil { + t.Errorf("cannot remove %v: %v", filename, err) + } + } + } diff --git a/book/progress.go b/book/progress.go index 1f30ee6..9865308 100644 --- a/book/progress.go +++ b/book/progress.go @@ -2,6 +2,7 @@ package book import ( "fmt" + "strings" "github.com/gosuri/uiprogress" ) @@ -11,20 +12,22 @@ type progress struct { individuals []*uiprogress.Bar } -func NewProgress(links []link) progress { +func NewProgress(links []link, parent string, depth int) progress { uiprogress.Start() global := uiprogress.AddBar(len(links)) + indentGlobal := strings.Repeat("> ", depth) global.AppendFunc(func(b *uiprogress.Bar) string { - return fmt.Sprintf("Chapters %d / %d", b.Current(), len(links)) + return fmt.Sprintf("%v%v (%v / %v)", indentGlobal, parent, b.Current(), len(links)) }) // hide individual bars if more than 50 chapters individuals := []*uiprogress.Bar{} + indent := strings.Repeat("- ", depth) if len(links) <= 50 { for index, link := range links { bar := uiprogress.AddBar(1) - barText := fmt.Sprintf("%d. %s", index+1, link.text) + barText := fmt.Sprintf("%v#%v %v", indent, index+1, link.Text()) bar.AppendFunc(func(b *uiprogress.Bar) string { return barText }) @@ -35,13 +38,22 @@ func NewProgress(links []link) progress { return progress{global, individuals} } -func (p *progress) IncrGlobal() { +func (p *progress) IncrementGlobal() { p.global.Incr() } -func (p *progress) Incr(index int) { - p.global.Incr() +func (p *progress) Increment(index int) { + p.IncrementGlobal() if len(p.individuals) > index { p.individuals[index].Incr() } } + +func (p *progress) UpdateName(index int, name string) { + if len(p.individuals) > index { + barText := fmt.Sprintf("%s", name) + p.individuals[index].AppendFunc(func(b *uiprogress.Bar) string { + return barText + }) + } +} diff --git a/book/scraper.go b/book/scraper.go index b0bb0dd..60fb8c7 100644 --- a/book/scraper.go +++ b/book/scraper.go @@ -18,14 +18,55 @@ import ( ) type ScrapeConfig struct { + depth int selector string limit int + offset int + delay int + threads int include bool imagesOnly bool } func NewScrapeConfig() *ScrapeConfig { - return &ScrapeConfig{"", -1, true, false} + return &ScrapeConfig{0, "", -1, 0, -1, -1, true, false} +} + +func NewScrapeConfigsAjin() []*ScrapeConfig { + config0 := NewScrapeConfig() + config0.depth = 0 + config0.selector = ".dt>a" + config0.limit = 3 + config0.offset = 0 + config0.delay = 5000 + config0.include = false + + config1 := NewScrapeConfig() + config1.depth = 1 + config1.selector = ".nav_apb>a" + config1.limit = 3 + config1.offset = 1 + config1.delay = 5000 + config1.include = false + + config2 := NewScrapeConfig() + config2.depth = 2 + config2.imagesOnly = true + + return []*ScrapeConfig{config0, config1, config2} +} + +func NewScrapeConfigsWikipedia() []*ScrapeConfig { + config0 := NewScrapeConfig() + config0.depth = 0 + config0.threads = -1 + config0.include = true + + config1 := NewScrapeConfig() + config1.depth = 1 + config1.include = true + + return []*ScrapeConfig{config0, config1} } func NewBookFromURL(url, selector, name, author string, recursive, include, imagesOnly bool, limit, offset, delay, threads int) book { @@ -39,11 +80,14 @@ func NewBookFromURL(url, selector, name, author string, recursive, include, imag config2 := NewScrapeConfig() config2.selector = selector config2.limit = limit + config2.offset = offset + config2.delay = delay + config2.threads = threads config2.include = include config2.imagesOnly = imagesOnly - chapters, home = tableOfContent(url, config1.selector, config1.limit, offset, delay, threads, config1.include) + chapters, home = tableOfContent(url, config2) } else { - chapters = []chapter{NewChapterFromURL(url, []*ScrapeConfig{config1})} + chapters = []chapter{NewChapterFromURL(url, []*ScrapeConfig{config1}, 0, func(index int, name string) {})} home = chapters[0] } @@ -63,37 +107,14 @@ func NewBookFromURL(url, selector, name, author string, recursive, include, imag return b } -func NewChapterFromURL(url string, configs []*ScrapeConfig) chapter { +func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updateProgressBarName func(index int, name string)) chapter { config := configs[0] - content := "" base, err := urllib.Parse(url) if err != nil { log.Fatal(err) } - subchapters := []chapter{} - if len(configs) > 1 { - // add subchapters - - links, _, err := GetLinks(base, config.selector, config.limit, 0, false) - if err != nil { - log.Fatal(err) - } - - for _, link := range links { - // and then use it to parse relative URLs - u, err := base.Parse(link.href) - if err != nil { - log.Fatal(err) - } - - subchapters = append(subchapters, NewChapterFromURL(u.String(), configs[1:])) - } - } - - // we want the metadata anyway - // get page body response, err := http.Get(url) if err != nil { @@ -108,15 +129,84 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig) chapter { // extract HTML body body, err := io.ReadAll(bodyReader) - // extract content + // extract article content and metadata article, err := readability.FromReader(readabilityReader, base) if err != nil { log.Fatalf("failed to parse %s, %v\n", url, err) } + name := article.Title - // we don't care about the content if we do not include this level + // notify progress bar + updateProgressBarName(index, name) + subchapters := []chapter{} + if len(configs) > 1 { + // add subchapters + + links, _, err := GetLinks(base, config.selector, config.limit, config.offset, false) + if err != nil { + log.Fatal(err) + } + + subchapters = make([]chapter, len(links)) + progress := NewProgress(links, name, config.depth) + + if config.delay >= 0 { + + // synchronous mode + for index, link := range links { + // and then use it to parse relative URLs + u, err := base.Parse(link.href) + if err != nil { + log.Fatal(err) + } + + sc := NewChapterFromURL(u.String(), configs[1:], index, progress.UpdateName) + subchapters[index] = sc + progress.Increment(index) + + time.Sleep(time.Duration(config.delay) * time.Millisecond) + } + + } else { + // asynchronous mode + var wg sync.WaitGroup + + threads := config.threads + if threads == -1 { + threads = len(links) + } + semaphore := make(chan bool, threads) + + for index, l := range links { + + wg.Add(1) + semaphore <- true + + go func(index int, l link) { + defer wg.Done() + + // and then use it to parse relative URLs + u, err := base.Parse(l.href) + if err != nil { + log.Fatal(err) + } + + sc := NewChapterFromURL(u.String(), configs[1:], index, progress.UpdateName) + subchapters[index] = sc + progress.Increment(index) + + <-semaphore + }(index, l) + } + wg.Wait() + } + } + + content := "" if config.include { + + // we care about the content only if we include this level content = article.Content // extract images @@ -138,22 +228,23 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig) chapter { } } - return chapter{string(body), article.Title, article.Byline, content, subchapters, config} + return chapter{string(body), name, article.Byline, content, subchapters, config} } -func tableOfContent(url, selector string, limit, offset, delay, threads int, include bool) ([]chapter, chapter) { +func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) { base, err := urllib.Parse(url) if err != nil { log.Fatal(err) } - links, home, err := GetLinks(base, selector, limit, offset, include) + links, home, err := GetLinks(base, config.selector, config.limit, config.offset, config.include) if err != nil { log.Fatal(err) } chapters := make([]chapter, len(links)) - progress := NewProgress(links) + progress := NewProgress(links, "", 0) + delay := config.delay if delay >= 0 { // synchronous mode @@ -165,8 +256,9 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc log.Fatal(err) } - chapters[index] = NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}) - progress.Incr(index) + sc := NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) + chapters[index] = sc + progress.Increment(index) // short sleep for last chapter to let the progress bar update if index == len(links)-1 { @@ -180,6 +272,7 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc // asynchronous mode var wg sync.WaitGroup + threads := config.threads if threads == -1 { threads = len(links) } @@ -199,8 +292,9 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc log.Fatal(err) } - chapters[index] = NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}) - progress.Incr(index) + sc := NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) + chapters[index] = sc + progress.Increment(index) <-semaphore }(index, l) @@ -276,7 +370,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) links = links[offset:end] - home := NewChapterFromURL(url.String(), []*ScrapeConfig{NewScrapeConfig()}) + home := NewChapterFromURL(url.String(), []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) if include { l := NewLink(url.String(), home.Name()) diff --git a/book/scraper_test.go b/book/scraper_test.go index 22635c0..90adf5a 100644 --- a/book/scraper_test.go +++ b/book/scraper_test.go @@ -1,10 +1,14 @@ package book -import "testing" +import ( + "testing" + "time" +) func TestBody(t *testing.T) { - c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) + config := NewScrapeConfig() + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {}) got := c.Body() want := "\n\n \n Books\n \n \n \n \n \n \n \n\n \n \n\n\n\n \n\n\n\n\n\n\n \n \n
\n \"John\n

Books

\n

\n
\n \n
\n
\n
\n \n \n\n
\n \n\n" @@ -12,11 +16,13 @@ func TestBody(t *testing.T) { if got != want { t.Errorf("got %v, wanted %v", got, want) } + } func TestName(t *testing.T) { - c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) + config := NewScrapeConfig() + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {}) got := c.Name() want := "Books" @@ -24,11 +30,13 @@ func TestName(t *testing.T) { if got != want { t.Errorf("got %v, wanted %v", got, want) } + } func TestAuthor(t *testing.T) { - c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) + config := NewScrapeConfig() + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {}) got := c.Author() want := "John Doe" @@ -36,11 +44,13 @@ func TestAuthor(t *testing.T) { if got != want { t.Errorf("got %v, wanted %v", got, want) } + } func TestContent(t *testing.T) { - c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}) + config := NewScrapeConfig() + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {}) got := c.Content() want := "
\n \n
\n \n \n\n
\n \n\n
" @@ -48,13 +58,35 @@ func TestContent(t *testing.T) { if got != want { t.Errorf("got %v, wanted %v", got, want) } + +} + +func TestDelay(t *testing.T) { + + config0 := NewScrapeConfig() + config0.delay = 500 + + config1 := NewScrapeConfig() + + start := time.Now() + NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {}) + elapsed := time.Since(start) + + got := elapsed + want := time.Duration(500) * time.Millisecond + + if got < want { + t.Errorf("got %v, wanted min %v", got, want) + } + } func TestContentImagesOnly(t *testing.T) { config := NewScrapeConfig() config.imagesOnly = true - c := NewChapterFromURL("https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/", []*ScrapeConfig{config}) + + c := NewChapterFromURL("https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/", []*ScrapeConfig{config}, 0, func(index int, name string) {}) got := c.Content() want := "\"One\"A\"Code\"Scale" @@ -62,11 +94,15 @@ func TestContentImagesOnly(t *testing.T) { if got != want { t.Errorf("got %v, wanted %v", got, want) } + } func TestSubChapters(t *testing.T) { - c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig(), NewScrapeConfig()}) + config0 := NewScrapeConfig() + config1 := NewScrapeConfig() + + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {}) got := len(c.SubChapters()) want := 2 @@ -74,11 +110,17 @@ func TestSubChapters(t *testing.T) { if got != want { t.Errorf("got %v, wanted %v", got, want) } + } func TestSubChaptersSelector(t *testing.T) { - c := NewChapterFromURL("https://12factor.net/", []*ScrapeConfig{{"section.concrete > article > h2 > a", -1, true, false}, NewScrapeConfig()}) + config0 := NewScrapeConfig() + config0.selector = "section.concrete > article > h2 > a" + + config1 := NewScrapeConfig() + + c := NewChapterFromURL("https://12factor.net/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {}) got := len(c.SubChapters()) want := 12 @@ -86,13 +128,17 @@ func TestSubChaptersSelector(t *testing.T) { if got != want { t.Errorf("got %v, wanted %v", got, want) } + } func TestSubChaptersLimit(t *testing.T) { - config := NewScrapeConfig() - config.limit = 1 - c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config, NewScrapeConfig()}) + config0 := NewScrapeConfig() + config0.limit = 1 + + config1 := NewScrapeConfig() + + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {}) got := len(c.SubChapters()) want := 1 @@ -100,13 +146,17 @@ func TestSubChaptersLimit(t *testing.T) { if got != want { t.Errorf("got %v, wanted %v", got, want) } + } func TestSubChaptersLimitOver(t *testing.T) { - config := NewScrapeConfig() - config.limit = 3 - c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config, NewScrapeConfig()}) + config0 := NewScrapeConfig() + config0.limit = 3 + + config1 := NewScrapeConfig() + + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {}) got := len(c.SubChapters()) want := 2 @@ -114,13 +164,15 @@ func TestSubChaptersLimitOver(t *testing.T) { if got != want { t.Errorf("got %v, wanted %v", got, want) } + } func TestNotInclude(t *testing.T) { config := NewScrapeConfig() config.include = false - c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}) + + c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config}, 0, func(index int, name string) {}) got := c.Content() want := "" @@ -128,4 +180,5 @@ func TestNotInclude(t *testing.T) { if got != want { t.Errorf("got %v, wanted %v", got, want) } + }