mirror of
https://github.com/NohamR/papeer.git
synced 2026-05-25 20:00:47 +00:00
add html format, handle lazy loading images
This commit is contained in:
2
Makefile
2
Makefile
@@ -2,7 +2,7 @@ format:
|
|||||||
gofmt -s -w .
|
gofmt -s -w .
|
||||||
|
|
||||||
test:
|
test:
|
||||||
go test github.com/lapwat/papeer/book
|
go test ./...
|
||||||
|
|
||||||
install:
|
install:
|
||||||
go install
|
go install
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# Papeer
|
# Papeer
|
||||||
|
|
||||||
Papeer is a powerful **ereader internet vacuum**. It can scrape any website, removing ads and keeping only the relevant content (formatted text and images). You can export the content to Markdown, EPUB or MOBI files.
|
Papeer is a powerful **ereader internet vacuum**. It can scrape any website, removing ads and keeping only the relevant content (formatted text and images). You can export the content to Markdown, HTML, EPUB or MOBI files.
|
||||||
|
|
||||||
# Table of contents
|
# Table of contents
|
||||||
|
|
||||||
@@ -39,7 +39,7 @@ Flags:
|
|||||||
-a, --author string book author
|
-a, --author string book author
|
||||||
--delay int time in milliseconds to wait before downloading next chapter, use with depth/selector (default -1)
|
--delay int time in milliseconds to wait before downloading next chapter, use with depth/selector (default -1)
|
||||||
-d, --depth int scraping depth
|
-d, --depth int scraping depth
|
||||||
-f, --format string file format [stdout, md, epub, mobi] (default "md")
|
-f, --format string file format [md, html, epub, mobi] (default "md")
|
||||||
-h, --help help for get
|
-h, --help help for get
|
||||||
--images retrieve images only
|
--images retrieve images only
|
||||||
-i, --include include URL as first chapter, use with depth/selector
|
-i, --include include URL as first chapter, use with depth/selector
|
||||||
@@ -50,6 +50,7 @@ Flags:
|
|||||||
-q, --quiet hide progress bar
|
-q, --quiet hide progress bar
|
||||||
-r, --reverse reverse chapter order
|
-r, --reverse reverse chapter order
|
||||||
-s, --selector strings table of contents CSS selector
|
-s, --selector strings table of contents CSS selector
|
||||||
|
--stdout print to standard output
|
||||||
-t, --threads int download concurrency, use with depth/selector (default -1)
|
-t, --threads int download concurrency, use with depth/selector (default -1)
|
||||||
--use-link-name use link name for chapter title
|
--use-link-name use link name for chapter title
|
||||||
```
|
```
|
||||||
@@ -140,7 +141,7 @@ go install github.com/lapwat/papeer@latest
|
|||||||
```sh
|
```sh
|
||||||
# use platform=darwin for MacOS
|
# use platform=darwin for MacOS
|
||||||
platform=linux
|
platform=linux
|
||||||
release=0.5.5
|
release=0.5.6
|
||||||
|
|
||||||
# download and extract
|
# download and extract
|
||||||
curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz
|
curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz
|
||||||
@@ -153,7 +154,7 @@ sudo mv papeer /usr/local/bin
|
|||||||
|
|
||||||
### Windows
|
### Windows
|
||||||
|
|
||||||
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.5.5/papeer-v0.5.5-windows-amd64.exe.zip).
|
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.5.6/papeer-v0.5.6-windows-amd64.zip).
|
||||||
|
|
||||||
## MOBI support
|
## MOBI support
|
||||||
|
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ func Filename(name string) string {
|
|||||||
func ToMarkdownString(c chapter) string {
|
func ToMarkdownString(c chapter) string {
|
||||||
markdown := ""
|
markdown := ""
|
||||||
|
|
||||||
|
// chapter content
|
||||||
if c.config.Include {
|
if c.config.Include {
|
||||||
// title
|
// title
|
||||||
markdown += fmt.Sprintf("%s\n", c.Name())
|
markdown += fmt.Sprintf("%s\n", c.Name())
|
||||||
@@ -37,8 +38,8 @@ func ToMarkdownString(c chapter) string {
|
|||||||
markdown += fmt.Sprintf("%s\n\n\n", content)
|
markdown += fmt.Sprintf("%s\n\n\n", content)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// subchapters content
|
||||||
for _, sc := range c.SubChapters() {
|
for _, sc := range c.SubChapters() {
|
||||||
// subchapters content
|
|
||||||
markdown += fmt.Sprintf("%s\n\n\n", ToMarkdownString(sc))
|
markdown += fmt.Sprintf("%s\n\n\n", ToMarkdownString(sc))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -66,6 +67,44 @@ func ToMarkdown(c chapter, filename string) string {
|
|||||||
return filename
|
return filename
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func ToHtmlString(c chapter) string {
|
||||||
|
html := ""
|
||||||
|
|
||||||
|
// chapter content
|
||||||
|
if c.config.Include {
|
||||||
|
html += fmt.Sprintf("<h1>%s</h1>", c.Name())
|
||||||
|
html += c.Content()
|
||||||
|
}
|
||||||
|
|
||||||
|
// subchapters content
|
||||||
|
for _, sc := range c.SubChapters() {
|
||||||
|
html += ToHtmlString(sc)
|
||||||
|
}
|
||||||
|
|
||||||
|
return html
|
||||||
|
}
|
||||||
|
|
||||||
|
func ToHtml(c chapter, filename string) string {
|
||||||
|
if len(filename) == 0 {
|
||||||
|
filename = fmt.Sprintf("%s.html", Filename(c.Name()))
|
||||||
|
}
|
||||||
|
|
||||||
|
html := fmt.Sprintf("<html><head></head><body>%s</body></html>", ToHtmlString(c))
|
||||||
|
|
||||||
|
// write to file
|
||||||
|
f, err := os.Create(filename)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
_, err2 := f.WriteString(html)
|
||||||
|
if err2 != nil {
|
||||||
|
log.Fatal(err2)
|
||||||
|
}
|
||||||
|
f.Close()
|
||||||
|
|
||||||
|
return filename
|
||||||
|
}
|
||||||
|
|
||||||
func ToEpub(c chapter, filename string) string {
|
func ToEpub(c chapter, filename string) string {
|
||||||
if len(filename) == 0 {
|
if len(filename) == 0 {
|
||||||
filename = fmt.Sprintf("%s.epub", Filename(c.Name()))
|
filename = fmt.Sprintf("%s.epub", Filename(c.Name()))
|
||||||
@@ -88,6 +127,7 @@ func ToEpub(c chapter, filename string) string {
|
|||||||
func AppendToEpub(e *epub.Epub, c chapter) {
|
func AppendToEpub(e *epub.Epub, c chapter) {
|
||||||
content := ""
|
content := ""
|
||||||
|
|
||||||
|
// chapter content
|
||||||
if c.config.Include {
|
if c.config.Include {
|
||||||
|
|
||||||
if c.config.ImagesOnly == false {
|
if c.config.ImagesOnly == false {
|
||||||
@@ -129,6 +169,7 @@ func AppendToEpub(e *epub.Epub, c chapter) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// subchapters content
|
||||||
for _, sc := range c.SubChapters() {
|
for _, sc := range c.SubChapters() {
|
||||||
AppendToEpub(e, sc)
|
AppendToEpub(e, sc)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ func TestToMarkdownString(t *testing.T) {
|
|||||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||||
|
|
||||||
got := ToMarkdownString(c)
|
got := ToMarkdownString(c)
|
||||||
want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011\n\n\n"
|
want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n 1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n 2011\n\n\n"
|
||||||
|
|
||||||
if got != want {
|
if got != want {
|
||||||
t.Errorf("got %q, wanted %q", got, want)
|
t.Errorf("got %q, wanted %q", got, want)
|
||||||
@@ -62,6 +62,51 @@ func TestToMarkdownFilename(t *testing.T) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestToHtmlString(t *testing.T) {
|
||||||
|
|
||||||
|
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||||
|
|
||||||
|
got := ToHtmlString(c)
|
||||||
|
want := "<h1>Books</h1>\n \n <main>\n \n <article>\n \n <ul>\n \n <li>\n <a href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read -\n 1637\n </span>\n </li>\n \n <li>\n <a href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n 2011\n </span>\n </li>\n \n </ul>\n \n\n\n\n </article>\n\n </main>\n \n\n"
|
||||||
|
|
||||||
|
if got != want {
|
||||||
|
t.Errorf("got %q, wanted %q", got, want)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestToHtml(t *testing.T) {
|
||||||
|
|
||||||
|
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||||
|
ToHtml(c, "")
|
||||||
|
|
||||||
|
filename := "Books.html"
|
||||||
|
if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
|
||||||
|
t.Errorf("%s does not exist: %v", filename, err)
|
||||||
|
} else {
|
||||||
|
if err := os.Remove(filename); err != nil {
|
||||||
|
t.Errorf("cannot remove %v: %v", filename, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestToHtmlFilename(t *testing.T) {
|
||||||
|
|
||||||
|
filename := "ebook.html"
|
||||||
|
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||||
|
ToHtml(c, filename)
|
||||||
|
|
||||||
|
if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
|
||||||
|
t.Errorf("%s does not exist: %v", filename, err)
|
||||||
|
} else {
|
||||||
|
if err := os.Remove(filename); err != nil {
|
||||||
|
t.Errorf("cannot remove %v: %v", filename, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
func TestToEpub(t *testing.T) {
|
func TestToEpub(t *testing.T) {
|
||||||
|
|
||||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||||
|
|||||||
@@ -250,27 +250,42 @@ func NewChapterFromURL(url, linkName string, configs []*ScrapeConfig, index int,
|
|||||||
// we care about the content only if:
|
// we care about the content only if:
|
||||||
// - we include this level
|
// - we include this level
|
||||||
// - we use the page name
|
// - we use the page name
|
||||||
content = article.Content
|
|
||||||
|
// parse HTML
|
||||||
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(article.Content))
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// handle lazy images
|
||||||
|
doc.Find("img").Each(func(i int, source *goquery.Selection) {
|
||||||
|
src, exists := source.Attr("data-lazy-src")
|
||||||
|
if exists {
|
||||||
|
source.SetAttr("src", src)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
doc.Find("source").Remove()
|
||||||
|
|
||||||
// extract images
|
// extract images
|
||||||
if config.ImagesOnly {
|
if config.ImagesOnly {
|
||||||
|
|
||||||
// parse HTML
|
|
||||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
|
|
||||||
if err != nil {
|
|
||||||
log.Fatal(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// append every image to content
|
// append every image to content
|
||||||
content = ""
|
content = ""
|
||||||
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||||
imageTag, _ := goquery.OuterHtml(s)
|
imageTag, _ := goquery.OuterHtml(s)
|
||||||
imageTag = strings.ReplaceAll(imageTag, "\n", "")
|
// imageTag = strings.ReplaceAll(imageTag, "\n", "")
|
||||||
|
|
||||||
content += imageTag
|
content += imageTag
|
||||||
})
|
})
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
content, err = doc.Find("[id*=readability-page]").Html()
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return chapter{string(body), name, article.Byline, content, subchapters, config}
|
return chapter{string(body), name, article.Byline, content, subchapters, config}
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ func TestContent(t *testing.T) {
|
|||||||
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config}, 0, func(index int, name string) {})
|
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config}, 0, func(index int, name string) {})
|
||||||
|
|
||||||
got := c.Content()
|
got := c.Content()
|
||||||
want := "<div id=\"readability-page-1\" class=\"page\">\n \n <main>\n \n <article>\n \n <ul>\n \n <li>\n <a href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read -\n 1637\n </span>\n </li>\n \n <li>\n <a href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n 2011\n </span>\n </li>\n \n </ul>\n \n\n\n\n </article>\n\n </main>\n \n\n</div>"
|
want := "\n \n <main>\n \n <article>\n \n <ul>\n \n <li>\n <a href=\"https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/\">Discours de la Méthode</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 98 min read -\n 1637\n </span>\n </li>\n \n <li>\n <a href=\"https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/\">The Twelve-Factor App</a>\n <span>\n <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"24\" height=\"24\" viewBox=\"0 0 24 24\" fill=\"none\" stroke=\"currentColor\" stroke-width=\"2\" stroke-linecap=\"round\" stroke-linejoin=\"round\">\n <title>clock</title>\n <circle cx=\"12\" cy=\"12\" r=\"10\"></circle><polyline points=\"12 6 12 12 16 14\"></polyline>\n</svg> 22 min read -\n 2011\n </span>\n </li>\n \n </ul>\n \n\n\n\n </article>\n\n </main>\n \n\n"
|
||||||
|
|
||||||
if got != want {
|
if got != want {
|
||||||
t.Errorf("got %v, wanted %v", got, want)
|
t.Errorf("got %v, wanted %v", got, want)
|
||||||
|
|||||||
18
cmd/get.go
18
cmd/get.go
@@ -43,7 +43,7 @@ func init() {
|
|||||||
|
|
||||||
getCmd.PersistentFlags().StringVarP(&getOpts.name, "name", "n", "", "book name (default: page title)")
|
getCmd.PersistentFlags().StringVarP(&getOpts.name, "name", "n", "", "book name (default: page title)")
|
||||||
getCmd.PersistentFlags().StringVarP(&getOpts.author, "author", "a", "", "book author")
|
getCmd.PersistentFlags().StringVarP(&getOpts.author, "author", "a", "", "book author")
|
||||||
getCmd.PersistentFlags().StringVarP(&getOpts.Format, "format", "f", "md", "file format [md, epub, mobi]")
|
getCmd.PersistentFlags().StringVarP(&getOpts.Format, "format", "f", "md", "file format [md, html, epub, mobi]")
|
||||||
getCmd.PersistentFlags().StringVarP(&getOpts.output, "output", "", "", "file name (default: book name)")
|
getCmd.PersistentFlags().StringVarP(&getOpts.output, "output", "", "", "file name (default: book name)")
|
||||||
getCmd.PersistentFlags().BoolVarP(&getOpts.stdout, "stdout", "", false, "print to standard output")
|
getCmd.PersistentFlags().BoolVarP(&getOpts.stdout, "stdout", "", false, "print to standard output")
|
||||||
getCmd.PersistentFlags().BoolVarP(&getOpts.images, "images", "", false, "retrieve images only")
|
getCmd.PersistentFlags().BoolVarP(&getOpts.images, "images", "", false, "retrieve images only")
|
||||||
@@ -74,6 +74,7 @@ var getCmd = &cobra.Command{
|
|||||||
|
|
||||||
formatEnum := map[string]bool{
|
formatEnum := map[string]bool{
|
||||||
"md": true,
|
"md": true,
|
||||||
|
"html": true,
|
||||||
"epub": true,
|
"epub": true,
|
||||||
"mobi": true,
|
"mobi": true,
|
||||||
}
|
}
|
||||||
@@ -178,6 +179,21 @@ var getCmd = &cobra.Command{
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if getOpts.Format == "html" {
|
||||||
|
filename := book.ToHtml(c, getOpts.output)
|
||||||
|
|
||||||
|
if getOpts.stdout {
|
||||||
|
bytesRead, err := ioutil.ReadFile(filename)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Println(string(bytesRead))
|
||||||
|
} else {
|
||||||
|
fmt.Printf("Html saved to \"%s\"\n", filename)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if getOpts.Format == "epub" {
|
if getOpts.Format == "epub" {
|
||||||
filename := book.ToEpub(c, getOpts.output)
|
filename := book.ToEpub(c, getOpts.output)
|
||||||
|
|
||||||
|
|||||||
@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
|
|||||||
Use: "version",
|
Use: "version",
|
||||||
Short: "Print the version number of papeer",
|
Short: "Print the version number of papeer",
|
||||||
Run: func(cmd *cobra.Command, args []string) {
|
Run: func(cmd *cobra.Command, args []string) {
|
||||||
fmt.Println("papeer v0.5.5")
|
fmt.Println("papeer v0.5.6")
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user