From d73ae0a73b474de6195376654fa9848cd55b94fc Mon Sep 17 00:00:00 2001 From: lapwat Date: Tue, 9 Aug 2022 18:21:18 +0200 Subject: [PATCH] add html format, handle lazy loading images --- Makefile | 2 +- README.md | 9 +++++---- book/format.go | 43 +++++++++++++++++++++++++++++++++++++++- book/format_test.go | 47 +++++++++++++++++++++++++++++++++++++++++++- book/scraper.go | 33 ++++++++++++++++++++++--------- book/scraper_test.go | 2 +- cmd/get.go | 18 ++++++++++++++++- cmd/version.go | 2 +- 8 files changed, 137 insertions(+), 19 deletions(-) diff --git a/Makefile b/Makefile index 4543256..d748813 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ format: gofmt -s -w . test: - go test github.com/lapwat/papeer/book + go test ./... install: go install diff --git a/README.md b/README.md index 024d6e6..27dc8ca 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Papeer -Papeer is a powerful **ereader internet vacuum**. It can scrape any website, removing ads and keeping only the relevant content (formatted text and images). You can export the content to Markdown, EPUB or MOBI files. +Papeer is a powerful **ereader internet vacuum**. It can scrape any website, removing ads and keeping only the relevant content (formatted text and images). You can export the content to Markdown, HTML, EPUB or MOBI files. # Table of contents @@ -39,7 +39,7 @@ Flags: -a, --author string book author --delay int time in milliseconds to wait before downloading next chapter, use with depth/selector (default -1) -d, --depth int scraping depth - -f, --format string file format [stdout, md, epub, mobi] (default "md") + -f, --format string file format [md, html, epub, mobi] (default "md") -h, --help help for get --images retrieve images only -i, --include include URL as first chapter, use with depth/selector @@ -50,6 +50,7 @@ Flags: -q, --quiet hide progress bar -r, --reverse reverse chapter order -s, --selector strings table of contents CSS selector + --stdout print to standard output -t, --threads int download concurrency, use with depth/selector (default -1) --use-link-name use link name for chapter title ``` @@ -140,7 +141,7 @@ go install github.com/lapwat/papeer@latest ```sh # use platform=darwin for MacOS platform=linux -release=0.5.5 +release=0.5.6 # download and extract curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz @@ -153,7 +154,7 @@ sudo mv papeer /usr/local/bin ### Windows -Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.5.5/papeer-v0.5.5-windows-amd64.exe.zip). +Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.5.6/papeer-v0.5.6-windows-amd64.zip). ## MOBI support diff --git a/book/format.go b/book/format.go index 6049964..dff1a43 100644 --- a/book/format.go +++ b/book/format.go @@ -24,6 +24,7 @@ func Filename(name string) string { func ToMarkdownString(c chapter) string { markdown := "" + // chapter content if c.config.Include { // title markdown += fmt.Sprintf("%s\n", c.Name()) @@ -37,8 +38,8 @@ func ToMarkdownString(c chapter) string { markdown += fmt.Sprintf("%s\n\n\n", content) } + // subchapters content for _, sc := range c.SubChapters() { - // subchapters content markdown += fmt.Sprintf("%s\n\n\n", ToMarkdownString(sc)) } @@ -66,6 +67,44 @@ func ToMarkdown(c chapter, filename string) string { return filename } +func ToHtmlString(c chapter) string { + html := "" + + // chapter content + if c.config.Include { + html += fmt.Sprintf("

%s

", c.Name()) + html += c.Content() + } + + // subchapters content + for _, sc := range c.SubChapters() { + html += ToHtmlString(sc) + } + + return html +} + +func ToHtml(c chapter, filename string) string { + if len(filename) == 0 { + filename = fmt.Sprintf("%s.html", Filename(c.Name())) + } + + html := fmt.Sprintf("%s", ToHtmlString(c)) + + // write to file + f, err := os.Create(filename) + if err != nil { + log.Fatal(err) + } + _, err2 := f.WriteString(html) + if err2 != nil { + log.Fatal(err2) + } + f.Close() + + return filename +} + func ToEpub(c chapter, filename string) string { if len(filename) == 0 { filename = fmt.Sprintf("%s.epub", Filename(c.Name())) @@ -88,6 +127,7 @@ func ToEpub(c chapter, filename string) string { func AppendToEpub(e *epub.Epub, c chapter) { content := "" + // chapter content if c.config.Include { if c.config.ImagesOnly == false { @@ -129,6 +169,7 @@ func AppendToEpub(e *epub.Epub, c chapter) { } + // subchapters content for _, sc := range c.SubChapters() { AppendToEpub(e, sc) } diff --git a/book/format_test.go b/book/format_test.go index 34cccd6..94fda74 100644 --- a/book/format_test.go +++ b/book/format_test.go @@ -22,7 +22,7 @@ func TestToMarkdownString(t *testing.T) { c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) got := ToMarkdownString(c) - want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011\n\n\n" + want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n 1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n 2011\n\n\n" if got != want { t.Errorf("got %q, wanted %q", got, want) @@ -62,6 +62,51 @@ func TestToMarkdownFilename(t *testing.T) { } +func TestToHtmlString(t *testing.T) { + + c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) + + got := ToHtmlString(c) + want := "

Books

\n \n
\n \n \n\n
\n \n\n" + + if got != want { + t.Errorf("got %q, wanted %q", got, want) + } + +} + +func TestToHtml(t *testing.T) { + + c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) + ToHtml(c, "") + + filename := "Books.html" + if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) { + t.Errorf("%s does not exist: %v", filename, err) + } else { + if err := os.Remove(filename); err != nil { + t.Errorf("cannot remove %v: %v", filename, err) + } + } + +} + +func TestToHtmlFilename(t *testing.T) { + + filename := "ebook.html" + c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) + ToHtml(c, filename) + + if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) { + t.Errorf("%s does not exist: %v", filename, err) + } else { + if err := os.Remove(filename); err != nil { + t.Errorf("cannot remove %v: %v", filename, err) + } + } + +} + func TestToEpub(t *testing.T) { c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) diff --git a/book/scraper.go b/book/scraper.go index 693ab60..e2492a3 100644 --- a/book/scraper.go +++ b/book/scraper.go @@ -250,27 +250,42 @@ func NewChapterFromURL(url, linkName string, configs []*ScrapeConfig, index int, // we care about the content only if: // - we include this level // - we use the page name - content = article.Content + + // parse HTML + doc, err := goquery.NewDocumentFromReader(strings.NewReader(article.Content)) + if err != nil { + log.Fatal(err) + } + + // handle lazy images + doc.Find("img").Each(func(i int, source *goquery.Selection) { + src, exists := source.Attr("data-lazy-src") + if exists { + source.SetAttr("src", src) + } + }) + doc.Find("source").Remove() // extract images if config.ImagesOnly { - // parse HTML - doc, err := goquery.NewDocumentFromReader(strings.NewReader(content)) - if err != nil { - log.Fatal(err) - } - // append every image to content content = "" doc.Find("img").Each(func(i int, s *goquery.Selection) { imageTag, _ := goquery.OuterHtml(s) - imageTag = strings.ReplaceAll(imageTag, "\n", "") - + // imageTag = strings.ReplaceAll(imageTag, "\n", "") content += imageTag }) + } else { + + content, err = doc.Find("[id*=readability-page]").Html() + if err != nil { + log.Fatal(err) + } + } + } return chapter{string(body), name, article.Byline, content, subchapters, config} diff --git a/book/scraper_test.go b/book/scraper_test.go index a5f305f..27a0494 100644 --- a/book/scraper_test.go +++ b/book/scraper_test.go @@ -68,7 +68,7 @@ func TestContent(t *testing.T) { c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config}, 0, func(index int, name string) {}) got := c.Content() - want := "
\n \n
\n \n \n\n
\n \n\n
" + want := "\n \n
\n \n \n\n
\n \n\n" if got != want { t.Errorf("got %v, wanted %v", got, want) diff --git a/cmd/get.go b/cmd/get.go index bd6f1fa..96fec66 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -43,7 +43,7 @@ func init() { getCmd.PersistentFlags().StringVarP(&getOpts.name, "name", "n", "", "book name (default: page title)") getCmd.PersistentFlags().StringVarP(&getOpts.author, "author", "a", "", "book author") - getCmd.PersistentFlags().StringVarP(&getOpts.Format, "format", "f", "md", "file format [md, epub, mobi]") + getCmd.PersistentFlags().StringVarP(&getOpts.Format, "format", "f", "md", "file format [md, html, epub, mobi]") getCmd.PersistentFlags().StringVarP(&getOpts.output, "output", "", "", "file name (default: book name)") getCmd.PersistentFlags().BoolVarP(&getOpts.stdout, "stdout", "", false, "print to standard output") getCmd.PersistentFlags().BoolVarP(&getOpts.images, "images", "", false, "retrieve images only") @@ -74,6 +74,7 @@ var getCmd = &cobra.Command{ formatEnum := map[string]bool{ "md": true, + "html": true, "epub": true, "mobi": true, } @@ -178,6 +179,21 @@ var getCmd = &cobra.Command{ } } + if getOpts.Format == "html" { + filename := book.ToHtml(c, getOpts.output) + + if getOpts.stdout { + bytesRead, err := ioutil.ReadFile(filename) + if err != nil { + log.Fatal(err) + } + + fmt.Println(string(bytesRead)) + } else { + fmt.Printf("Html saved to \"%s\"\n", filename) + } + } + if getOpts.Format == "epub" { filename := book.ToEpub(c, getOpts.output) diff --git a/cmd/version.go b/cmd/version.go index 638437d..bbede9f 100644 --- a/cmd/version.go +++ b/cmd/version.go @@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{ Use: "version", Short: "Print the version number of papeer", Run: func(cmd *cobra.Command, args []string) { - fmt.Println("papeer v0.5.5") + fmt.Println("papeer v0.5.6") }, }