diff --git a/Makefile b/Makefile
index 4543256..d748813 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ format:
gofmt -s -w .
test:
- go test github.com/lapwat/papeer/book
+ go test ./...
install:
go install
diff --git a/README.md b/README.md
index 024d6e6..27dc8ca 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# Papeer
-Papeer is a powerful **ereader internet vacuum**. It can scrape any website, removing ads and keeping only the relevant content (formatted text and images). You can export the content to Markdown, EPUB or MOBI files.
+Papeer is a powerful **ereader internet vacuum**. It can scrape any website, removing ads and keeping only the relevant content (formatted text and images). You can export the content to Markdown, HTML, EPUB or MOBI files.
# Table of contents
@@ -39,7 +39,7 @@ Flags:
-a, --author string book author
--delay int time in milliseconds to wait before downloading next chapter, use with depth/selector (default -1)
-d, --depth int scraping depth
- -f, --format string file format [stdout, md, epub, mobi] (default "md")
+ -f, --format string file format [md, html, epub, mobi] (default "md")
-h, --help help for get
--images retrieve images only
-i, --include include URL as first chapter, use with depth/selector
@@ -50,6 +50,7 @@ Flags:
-q, --quiet hide progress bar
-r, --reverse reverse chapter order
-s, --selector strings table of contents CSS selector
+ --stdout print to standard output
-t, --threads int download concurrency, use with depth/selector (default -1)
--use-link-name use link name for chapter title
```
@@ -140,7 +141,7 @@ go install github.com/lapwat/papeer@latest
```sh
# use platform=darwin for MacOS
platform=linux
-release=0.5.5
+release=0.5.6
# download and extract
curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz
@@ -153,7 +154,7 @@ sudo mv papeer /usr/local/bin
### Windows
-Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.5.5/papeer-v0.5.5-windows-amd64.exe.zip).
+Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.5.6/papeer-v0.5.6-windows-amd64.zip).
## MOBI support
diff --git a/book/format.go b/book/format.go
index 6049964..dff1a43 100644
--- a/book/format.go
+++ b/book/format.go
@@ -24,6 +24,7 @@ func Filename(name string) string {
func ToMarkdownString(c chapter) string {
markdown := ""
+ // chapter content
if c.config.Include {
// title
markdown += fmt.Sprintf("%s\n", c.Name())
@@ -37,8 +38,8 @@ func ToMarkdownString(c chapter) string {
markdown += fmt.Sprintf("%s\n\n\n", content)
}
+ // subchapters content
for _, sc := range c.SubChapters() {
- // subchapters content
markdown += fmt.Sprintf("%s\n\n\n", ToMarkdownString(sc))
}
@@ -66,6 +67,44 @@ func ToMarkdown(c chapter, filename string) string {
return filename
}
+// ToHtmlString renders chapter c and all of its subchapters as one HTML string.
+//
+// When c.config.Include is set, the chapter contributes a heading holding
+// c.Name() followed by c.Content(); otherwise it contributes nothing itself.
+// Subchapters are then appended recursively, in the order SubChapters returns
+// them. The name is interpolated as-is, without HTML escaping.
+func ToHtmlString(c chapter) string {
+	html := ""
+
+	// chapter content
+	if c.config.Include {
+		// The format literal must sit on a single line: an interpreted Go
+		// string cannot contain a raw newline.
+		html += fmt.Sprintf("<h1>%s</h1>", c.Name())
+		html += c.Content()
+	}
+
+	// subchapters content
+	for _, sc := range c.SubChapters() {
+		html += ToHtmlString(sc)
+	}
+
+	return html
+}
+
+// ToHtml renders c with ToHtmlString and writes the result to filename.
+//
+// When filename is empty it is derived from the chapter name via Filename,
+// with an ".html" extension appended. Any failure to create, write or close
+// the file terminates the process through log.Fatal. The name of the file
+// that was written is returned.
+func ToHtml(c chapter, filename string) string {
+	if len(filename) == 0 {
+		filename = fmt.Sprintf("%s.html", Filename(c.Name()))
+	}
+
+	// write to file
+	f, err := os.Create(filename)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	if _, err := f.WriteString(ToHtmlString(c)); err != nil {
+		log.Fatal(err)
+	}
+
+	// Close may surface a write error that was deferred by the OS, so its
+	// result is checked instead of being discarded.
+	if err := f.Close(); err != nil {
+		log.Fatal(err)
+	}
+
+	return filename
+}
+
func ToEpub(c chapter, filename string) string {
if len(filename) == 0 {
filename = fmt.Sprintf("%s.epub", Filename(c.Name()))
@@ -88,6 +127,7 @@ func ToEpub(c chapter, filename string) string {
func AppendToEpub(e *epub.Epub, c chapter) {
content := ""
+ // chapter content
if c.config.Include {
if c.config.ImagesOnly == false {
@@ -129,6 +169,7 @@ func AppendToEpub(e *epub.Epub, c chapter) {
}
+ // subchapters content
for _, sc := range c.SubChapters() {
AppendToEpub(e, sc)
}
diff --git a/book/format_test.go b/book/format_test.go
index 34cccd6..94fda74 100644
--- a/book/format_test.go
+++ b/book/format_test.go
@@ -22,7 +22,7 @@ func TestToMarkdownString(t *testing.T) {
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
got := ToMarkdownString(c)
- want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011\n\n\n"
+ want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n 1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n 2011\n\n\n"
if got != want {
t.Errorf("got %q, wanted %q", got, want)
@@ -62,6 +62,51 @@ func TestToMarkdownFilename(t *testing.T) {
}
+// TestToHtmlString builds a chapter from https://books.lapw.at/ with a default
+// scrape config and checks that ToHtmlString returns exactly the expected string.
+func TestToHtmlString(t *testing.T) {
+
+ c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
+
+ got := ToHtmlString(c)
+ // NOTE(review): the expected literal below is split across two physical lines
+ // and looks like it has lost its HTML tags; confirm the exact value against
+ // the repository before relying on it.
+ want := "Books
\n \n \n \n \n \n \n \n\n\n\n \n\n \n \n\n"
+
+ // %q keeps whitespace-only differences visible in the failure message.
+ if got != want {
+ t.Errorf("got %q, wanted %q", got, want)
+ }
+
+}
+
+// TestToHtml checks that ToHtml, called with an empty filename, produces a
+// file named "Books.html", and removes that file once it has been found.
+func TestToHtml(t *testing.T) {
+	const expected = "Books.html"
+
+	page := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
+	ToHtml(page, "")
+
+	// A missing file is the failure case; there is nothing to clean up then.
+	_, statErr := os.Stat(expected)
+	if errors.Is(statErr, os.ErrNotExist) {
+		t.Errorf("%s does not exist: %v", expected, statErr)
+		return
+	}
+
+	// Remove the generated file so the test leaves nothing behind.
+	if rmErr := os.Remove(expected); rmErr != nil {
+		t.Errorf("cannot remove %v: %v", expected, rmErr)
+	}
+}
+
+// TestToHtmlFilename checks that ToHtml writes to the explicitly supplied
+// filename ("ebook.html"), and removes that file once it has been found.
+func TestToHtmlFilename(t *testing.T) {
+	const target = "ebook.html"
+
+	page := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
+	ToHtml(page, target)
+
+	// A missing file is the failure case; there is nothing to clean up then.
+	_, statErr := os.Stat(target)
+	if errors.Is(statErr, os.ErrNotExist) {
+		t.Errorf("%s does not exist: %v", target, statErr)
+		return
+	}
+
+	// Remove the generated file so the test leaves nothing behind.
+	if rmErr := os.Remove(target); rmErr != nil {
+		t.Errorf("cannot remove %v: %v", target, rmErr)
+	}
+}
+
func TestToEpub(t *testing.T) {
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
diff --git a/book/scraper.go b/book/scraper.go
index 693ab60..e2492a3 100644
--- a/book/scraper.go
+++ b/book/scraper.go
@@ -250,27 +250,42 @@ func NewChapterFromURL(url, linkName string, configs []*ScrapeConfig, index int,
// we care about the content only if:
// - we include this level
// - we use the page name
- content = article.Content
+
+ // parse HTML
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(article.Content))
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ // handle lazy images
+ doc.Find("img").Each(func(i int, source *goquery.Selection) {
+ src, exists := source.Attr("data-lazy-src")
+ if exists {
+ source.SetAttr("src", src)
+ }
+ })
+ doc.Find("source").Remove()
// extract images
if config.ImagesOnly {
- // parse HTML
- doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
- if err != nil {
- log.Fatal(err)
- }
-
// append every image to content
content = ""
doc.Find("img").Each(func(i int, s *goquery.Selection) {
imageTag, _ := goquery.OuterHtml(s)
- imageTag = strings.ReplaceAll(imageTag, "\n", "")
-
+ // imageTag = strings.ReplaceAll(imageTag, "\n", "")
content += imageTag
})
+ } else {
+
+ content, err = doc.Find("[id*=readability-page]").Html()
+ if err != nil {
+ log.Fatal(err)
+ }
+
}
+
}
return chapter{string(body), name, article.Byline, content, subchapters, config}
diff --git a/book/scraper_test.go b/book/scraper_test.go
index a5f305f..27a0494 100644
--- a/book/scraper_test.go
+++ b/book/scraper_test.go
@@ -68,7 +68,7 @@ func TestContent(t *testing.T) {
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config}, 0, func(index int, name string) {})
got := c.Content()
- want := "\n \n
\n \n \n \n \n \n\n\n\n \n\n \n \n\n
"
+ want := "\n \n \n \n \n \n \n \n\n\n\n \n\n \n \n\n"
if got != want {
t.Errorf("got %v, wanted %v", got, want)
diff --git a/cmd/get.go b/cmd/get.go
index bd6f1fa..96fec66 100644
--- a/cmd/get.go
+++ b/cmd/get.go
@@ -43,7 +43,7 @@ func init() {
getCmd.PersistentFlags().StringVarP(&getOpts.name, "name", "n", "", "book name (default: page title)")
getCmd.PersistentFlags().StringVarP(&getOpts.author, "author", "a", "", "book author")
- getCmd.PersistentFlags().StringVarP(&getOpts.Format, "format", "f", "md", "file format [md, epub, mobi]")
+ getCmd.PersistentFlags().StringVarP(&getOpts.Format, "format", "f", "md", "file format [md, html, epub, mobi]")
getCmd.PersistentFlags().StringVarP(&getOpts.output, "output", "", "", "file name (default: book name)")
getCmd.PersistentFlags().BoolVarP(&getOpts.stdout, "stdout", "", false, "print to standard output")
getCmd.PersistentFlags().BoolVarP(&getOpts.images, "images", "", false, "retrieve images only")
@@ -74,6 +74,7 @@ var getCmd = &cobra.Command{
formatEnum := map[string]bool{
"md": true,
+ "html": true,
"epub": true,
"mobi": true,
}
@@ -178,6 +179,21 @@ var getCmd = &cobra.Command{
}
}
+ if getOpts.Format == "html" {
+ filename := book.ToHtml(c, getOpts.output)
+
+ if getOpts.stdout {
+ bytesRead, err := ioutil.ReadFile(filename)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ fmt.Println(string(bytesRead))
+ } else {
+ fmt.Printf("Html saved to \"%s\"\n", filename)
+ }
+ }
+
if getOpts.Format == "epub" {
filename := book.ToEpub(c, getOpts.output)
diff --git a/cmd/version.go b/cmd/version.go
index 638437d..bbede9f 100644
--- a/cmd/version.go
+++ b/cmd/version.go
@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
Use: "version",
Short: "Print the version number of papeer",
Run: func(cmd *cobra.Command, args []string) {
- fmt.Println("papeer v0.5.5")
+ fmt.Println("papeer v0.5.6")
},
}