Refactor get command; fix images option

This commit is contained in:
lapwat
2022-01-02 02:16:45 +01:00
parent 29008185a8
commit 5e735f9c52
7 changed files with 103 additions and 201 deletions

View File

@@ -111,14 +111,15 @@ go get -u github.com/lapwat/papeer
```sh ```sh
platform=linux # use platform=darwin for MacOS platform=linux # use platform=darwin for MacOS
curl -L https://github.com/lapwat/papeer/releases/download/v0.3.1/papeer-v0.3.1-$platform-amd64 > papeer release=0.3.2
curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64 > papeer
chmod +x papeer chmod +x papeer
sudo mv papeer /usr/local/bin sudo mv papeer /usr/local/bin
``` ```
### On Windows ### On Windows
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.3.1/papeer-v0.3.1-windows-amd64.exe). Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.3.2/papeer-v0.3.2-windows-amd64.exe).
## Install kindlegen to export websites to MOBI (optional) ## Install kindlegen to export websites to MOBI (optional)

View File

@@ -22,53 +22,54 @@ func Filename(name string) string {
} }
func ToMarkdown(c chapter) string { func ToMarkdown(c chapter) string {
markdown := ""
// make title if c.config.include {
underline := strings.Repeat("=", len(c.Name())) // title
title := fmt.Sprintf("%s\n%s", c.Name(), underline) markdown += fmt.Sprintf("%s\n", c.Name())
markdown += fmt.Sprintf("%s\n\n", strings.Repeat("=", len(c.Name())))
// convert content to markdown // convert content to markdown
content, err := md.NewConverter("", true, nil).ConvertString(c.Content()) content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
markdown += fmt.Sprintf("%s\n\n\n", content)
// merge title and content
content = fmt.Sprintf("%s\n\n%s", title, content)
for _, sc := range c.SubChapters() {
// merge subchapters
content = fmt.Sprintf("%s\n\n\n%s", content, ToMarkdown(sc))
} }
return content for _, sc := range c.SubChapters() {
// subchapters content
markdown += fmt.Sprintf("%s\n\n\n", ToMarkdown(sc))
}
return markdown
} }
func ToEpub(c chapter, filename string) string { func ToEpub(c chapter, filename string) string {
if len(filename) == 0 { if len(filename) == 0 {
filename = fmt.Sprintf("%s.epub", c.Name()) filename = fmt.Sprintf("%s.epub", Filename(c.Name()))
} }
// init ebook // init ebook
e := epub.NewEpub(c.Name()) e := epub.NewEpub(c.Name())
e.SetAuthor(c.Author()) e.SetAuthor(c.Author())
AppendToEpub(e, c, false) AppendToEpub(e, c)
err := e.Write(filename) err := e.Write(filename)
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
fmt.Printf("Ebook saved to \"%s\"\n", filename)
return filename return filename
} }
func AppendToEpub(e *epub.Epub, c chapter, imagesOnly bool) { func AppendToEpub(e *epub.Epub, c chapter) {
content := "" content := ""
if imagesOnly == false { if c.config.include {
if c.config.imagesOnly == false {
content = c.Content() content = c.Content()
} }
@@ -81,9 +82,10 @@ func AppendToEpub(e *epub.Epub, c chapter, imagesOnly bool) {
// download images and replace src in img tags of content // download images and replace src in img tags of content
doc.Find("img").Each(func(i int, s *goquery.Selection) { doc.Find("img").Each(func(i int, s *goquery.Selection) {
src, _ := s.Attr("src") src, _ := s.Attr("src")
src = strings.Split(src, "?")[0] // remove query part
imagePath, _ := e.AddImage(src, "") imagePath, _ := e.AddImage(src, "")
if imagesOnly { if c.config.imagesOnly {
imageTag, _ := goquery.OuterHtml(s) imageTag, _ := goquery.OuterHtml(s)
content += strings.Replace(imageTag, src, imagePath, 1) content += strings.Replace(imageTag, src, imagePath, 1)
} else { } else {
@@ -91,20 +93,29 @@ func AppendToEpub(e *epub.Epub, c chapter, imagesOnly bool) {
} }
}) })
html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), content) html := ""
// add title only if imagesOnly = false
if c.config.imagesOnly == false {
html += fmt.Sprintf("<h1>%s</h1>", c.Name())
}
html += content
// write to epub file
_, err = e.AddSection(html, c.Name(), "", "") _, err = e.AddSection(html, c.Name(), "", "")
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
}
for _, sc := range c.SubChapters() { for _, sc := range c.SubChapters() {
AppendToEpub(e, sc, false) AppendToEpub(e, sc)
} }
} }
func ToMobi(c chapter, filename string) string { func ToMobi(c chapter, filename string) string {
if len(filename) == 0 { if len(filename) == 0 {
filename = fmt.Sprintf("%s.mobi", c.Name()) filename = fmt.Sprintf("%s.mobi", Filename(c.Name()))
} else { } else {
// add .mobi extension if not specified // add .mobi extension if not specified
@@ -123,8 +134,6 @@ func ToMobi(c chapter, filename string) string {
// log.Fatal(err) // log.Fatal(err)
// } // }
fmt.Printf("Ebook saved to \"%s\"\n", filename)
err := os.Remove(filenameEPUB) err := os.Remove(filenameEPUB)
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)

View File

@@ -22,7 +22,7 @@ func TestToMarkdown(t *testing.T) {
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
got := ToMarkdown(c) got := ToMarkdown(c)
want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011" want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011\n\n\n"
if got != want { if got != want {
t.Errorf("got %q, wanted %q", got, want) t.Errorf("got %q, wanted %q", got, want)

View File

@@ -69,6 +69,13 @@ func NewScrapeConfigsWikipedia() []*ScrapeConfig {
return []*ScrapeConfig{config0, config1} return []*ScrapeConfig{config0, config1}
} }
// NewScrapeConfigFake returns the configuration produced by NewScrapeConfig
// with its include field set to false.
func NewScrapeConfigFake() *ScrapeConfig {
	cfg := NewScrapeConfig()
	cfg.include = false

	return cfg
}
func NewBookFromURL(url, selector, name, author string, recursive, include, imagesOnly bool, limit, offset, delay, threads int) book { func NewBookFromURL(url, selector, name, author string, recursive, include, imagesOnly bool, limit, offset, delay, threads int) book {
config1 := NewScrapeConfig() config1 := NewScrapeConfig()
config1.imagesOnly = imagesOnly config1.imagesOnly = imagesOnly
@@ -85,7 +92,7 @@ func NewBookFromURL(url, selector, name, author string, recursive, include, imag
config2.threads = threads config2.threads = threads
config2.include = include config2.include = include
config2.imagesOnly = imagesOnly config2.imagesOnly = imagesOnly
chapters, home = tableOfContent(url, config2) chapters, home = tableOfContent(url, config2, config1)
} else { } else {
chapters = []chapter{NewChapterFromURL(url, []*ScrapeConfig{config1}, 0, func(index int, name string) {})} chapters = []chapter{NewChapterFromURL(url, []*ScrapeConfig{config1}, 0, func(index int, name string) {})}
home = chapters[0] home = chapters[0]
@@ -136,7 +143,7 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updatePro
} }
name := article.Title name := article.Title
// notify progress bar // notify progress bar with new name
updateProgressBarName(index, name) updateProgressBarName(index, name)
subchapters := []chapter{} subchapters := []chapter{}
@@ -222,6 +229,8 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updatePro
content = "" content = ""
doc.Find("img").Each(func(i int, s *goquery.Selection) { doc.Find("img").Each(func(i int, s *goquery.Selection) {
imageTag, _ := goquery.OuterHtml(s) imageTag, _ := goquery.OuterHtml(s)
imageTag = strings.ReplaceAll(imageTag, "\n", "")
content += imageTag content += imageTag
}) })
@@ -231,7 +240,7 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updatePro
return chapter{string(body), name, article.Byline, content, subchapters, config} return chapter{string(body), name, article.Byline, content, subchapters, config}
} }
func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) { func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig) ([]chapter, chapter) {
base, err := urllib.Parse(url) base, err := urllib.Parse(url)
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
@@ -243,7 +252,7 @@ func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) {
} }
chapters := make([]chapter, len(links)) chapters := make([]chapter, len(links))
progress := NewProgress(links, "", 0) // progress := NewProgress(links, "", 0)
delay := config.delay delay := config.delay
if delay >= 0 { if delay >= 0 {
@@ -256,9 +265,9 @@ func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) {
log.Fatal(err) log.Fatal(err)
} }
sc := NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) sc := NewChapterFromURL(u.String(), []*ScrapeConfig{subConfig}, 0, func(index int, name string) {})
chapters[index] = sc chapters[index] = sc
progress.Increment(index) // progress.Increment(index)
// short sleep for last chapter to let the progress bar update // short sleep for last chapter to let the progress bar update
if index == len(links)-1 { if index == len(links)-1 {
@@ -292,9 +301,9 @@ func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) {
log.Fatal(err) log.Fatal(err)
} }
sc := NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) sc := NewChapterFromURL(u.String(), []*ScrapeConfig{subConfig}, 0, func(index int, name string) {})
chapters[index] = sc chapters[index] = sc
progress.Increment(index) // progress.Increment(index)
<-semaphore <-semaphore
}(index, l) }(index, l)

View File

@@ -5,18 +5,14 @@ import (
"fmt" "fmt"
"log" "log"
"os" "os"
"os/exec"
"strings" "strings"
md "github.com/JohannesKaufmann/html-to-markdown"
"github.com/PuerkitoBio/goquery"
epub "github.com/bmaupin/go-epub"
"github.com/spf13/cobra" "github.com/spf13/cobra"
"github.com/lapwat/papeer/book" "github.com/lapwat/papeer/book"
) )
var recursive, include, images bool var recursive, include, images, quiet bool
var format, output, selector, name, author string var format, output, selector, name, author string
var limit, offset, delay, threads int var limit, offset, delay, threads int
@@ -79,159 +75,46 @@ var getCmd = &cobra.Command{
url := args[0] url := args[0]
b := book.NewBookFromURL(url, selector, name, author, recursive, include, images, limit, offset, delay, threads) b := book.NewBookFromURL(url, selector, name, author, recursive, include, images, limit, offset, delay, threads)
if len(output) == 0 { fakeConfig := book.NewScrapeConfigFake()
// set default output fakeChapter := book.NewChapter("", b.Name(), b.Author(), "", b.Chapters(), fakeConfig)
output = strings.ReplaceAll(b.Name(), " ", "_")
output = strings.ReplaceAll(output, "/", "")
output = fmt.Sprintf("%s.%s", output, format)
}
if format == "stdout" { if format == "stdout" {
// TODO: ToMarkdownString
for _, c := range b.Chapters() { markdown := book.ToMarkdown(fakeChapter)
// convert to markdown fmt.Println(markdown)
content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
if err != nil {
log.Fatal(err)
}
text := fmt.Sprintf("%s\n%s\n\n%s\n\n\n", c.Name(), strings.Repeat("=", len(c.Name())), content)
// write to stdout
fmt.Println(text)
}
} }
if format == "md" { if format == "md" {
// TODO: ToMarkdownFile
markdown := book.ToMarkdown(fakeChapter)
// create markdown file if len(output) == 0 {
filename := book.Filename(fakeChapter.Name())
output = fmt.Sprintf("%s.md", filename)
}
// write to file
f, err := os.Create(output) f, err := os.Create(output)
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
defer f.Close() _, err2 := f.WriteString(markdown)
if err2 != nil {
for _, c := range b.Chapters() { log.Fatal(err2)
// convert to markdown
content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
if err != nil {
log.Fatal(err)
}
text := fmt.Sprintf("%s\n%s\n\n%s\n\n\n", c.Name(), strings.Repeat("=", len(c.Name())), content)
// write to markdown file
_, err = f.WriteString(text)
if err != nil {
log.Fatal(err)
}
} }
f.Close()
fmt.Printf("Markdown saved to \"%s\"\n", output) fmt.Printf("Markdown saved to \"%s\"\n", output)
} }
if format == "epub" { if format == "epub" {
e := epub.NewEpub(b.Name()) output = book.ToEpub(fakeChapter, output)
e.SetAuthor(b.Author())
for _, c := range b.Chapters() {
var content string
if images == false {
content = c.Content()
}
// parse content
doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
if err != nil {
log.Fatal(err)
}
// retrieve images and download it
doc.Find("img").Each(func(i int, s *goquery.Selection) {
src, _ := s.Attr("src")
imagePath, _ := e.AddImage(src, "")
if images {
imageTag, _ := goquery.OuterHtml(s)
content += imageTag
}
content = strings.ReplaceAll(content, src, imagePath)
})
html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), content)
_, err = e.AddSection(html, c.Name(), "", "")
if err != nil {
log.Fatal(err)
}
}
err := e.Write(output)
if err != nil {
log.Fatal(err)
}
fmt.Printf("Ebook saved to \"%s\"\n", output) fmt.Printf("Ebook saved to \"%s\"\n", output)
} }
if format == "mobi" { if format == "mobi" {
e := epub.NewEpub(b.Name()) output = book.ToMobi(fakeChapter, output)
e.SetAuthor(b.Author())
for _, c := range b.Chapters() {
var content string
if images == false {
content = c.Content()
}
// parse content
doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
if err != nil {
log.Fatal(err)
}
// retrieve images and download it
doc.Find("img").Each(func(i int, s *goquery.Selection) {
src, _ := s.Attr("src")
imagePath, _ := e.AddImage(src, "")
if images {
imageTag, _ := goquery.OuterHtml(s)
content += imageTag
}
content = strings.ReplaceAll(content, src, imagePath)
})
html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), content)
_, err = e.AddSection(html, c.Name(), "", "")
if err != nil {
log.Fatal(err)
}
}
outputEPUB := strings.ReplaceAll(output, ".mobi", ".epub")
err := e.Write(outputEPUB)
if err != nil {
log.Fatal(err)
}
exec.Command("kindlegen", outputEPUB).Run()
// exec command always return status 1 even if it succeed
// if err != nil {
// log.Fatal(err)
// }
fmt.Printf("Ebook saved to \"%s\"\n", output) fmt.Printf("Ebook saved to \"%s\"\n", output)
err = os.Remove(outputEPUB)
if err != nil {
log.Fatal(err)
}
} }
}, },
} }

View File

@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
Use: "version", Use: "version",
Short: "Print the version number of papeer", Short: "Print the version number of papeer",
Run: func(cmd *cobra.Command, args []string) { Run: func(cmd *cobra.Command, args []string) {
fmt.Println("papeer v0.3.1") fmt.Println("papeer v0.3.2")
}, },
} }