mirror of
https://github.com/NohamR/papeer.git
synced 2026-05-24 20:00:45 +00:00
Refactor get command; fix images option
This commit is contained in:
@@ -111,14 +111,15 @@ go get -u github.com/lapwat/papeer
|
||||
|
||||
```sh
|
||||
platform=linux # use platform=darwin for macOS
|
||||
curl -L https://github.com/lapwat/papeer/releases/download/v0.3.1/papeer-v0.3.1-$platform-amd64 > papeer
|
||||
release=0.3.2
|
||||
curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64 > papeer
|
||||
chmod +x papeer
|
||||
sudo mv papeer /usr/local/bin
|
||||
```
|
||||
|
||||
### On Windows
|
||||
|
||||
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.3.1/papeer-v0.3.1-windows-amd64.exe).
|
||||
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.3.2/papeer-v0.3.2-windows-amd64.exe).
|
||||
|
||||
## Install kindlegen to export websites to MOBI (optional)
|
||||
|
||||
|
||||
103
book/format.go
103
book/format.go
@@ -22,89 +22,100 @@ func Filename(name string) string {
|
||||
}
|
||||
|
||||
func ToMarkdown(c chapter) string {
|
||||
markdown := ""
|
||||
|
||||
// make title
|
||||
underline := strings.Repeat("=", len(c.Name()))
|
||||
title := fmt.Sprintf("%s\n%s", c.Name(), underline)
|
||||
if c.config.include {
|
||||
// title
|
||||
markdown += fmt.Sprintf("%s\n", c.Name())
|
||||
markdown += fmt.Sprintf("%s\n\n", strings.Repeat("=", len(c.Name())))
|
||||
|
||||
// convert content to markdown
|
||||
content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
// convert content to markdown
|
||||
content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
markdown += fmt.Sprintf("%s\n\n\n", content)
|
||||
}
|
||||
|
||||
// merge title and content
|
||||
content = fmt.Sprintf("%s\n\n%s", title, content)
|
||||
|
||||
for _, sc := range c.SubChapters() {
|
||||
// merge subchapters
|
||||
content = fmt.Sprintf("%s\n\n\n%s", content, ToMarkdown(sc))
|
||||
// subchapters content
|
||||
markdown += fmt.Sprintf("%s\n\n\n", ToMarkdown(sc))
|
||||
}
|
||||
|
||||
return content
|
||||
return markdown
|
||||
}
|
||||
|
||||
func ToEpub(c chapter, filename string) string {
|
||||
if len(filename) == 0 {
|
||||
filename = fmt.Sprintf("%s.epub", c.Name())
|
||||
filename = fmt.Sprintf("%s.epub", Filename(c.Name()))
|
||||
}
|
||||
|
||||
// init ebook
|
||||
e := epub.NewEpub(c.Name())
|
||||
e.SetAuthor(c.Author())
|
||||
|
||||
AppendToEpub(e, c, false)
|
||||
AppendToEpub(e, c)
|
||||
|
||||
err := e.Write(filename)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
fmt.Printf("Ebook saved to \"%s\"\n", filename)
|
||||
|
||||
return filename
|
||||
}
|
||||
|
||||
func AppendToEpub(e *epub.Epub, c chapter, imagesOnly bool) {
|
||||
func AppendToEpub(e *epub.Epub, c chapter) {
|
||||
content := ""
|
||||
|
||||
if imagesOnly == false {
|
||||
content = c.Content()
|
||||
}
|
||||
if c.config.include {
|
||||
|
||||
// parse content
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// download images and replace src in img tags of content
|
||||
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||
src, _ := s.Attr("src")
|
||||
imagePath, _ := e.AddImage(src, "")
|
||||
|
||||
if imagesOnly {
|
||||
imageTag, _ := goquery.OuterHtml(s)
|
||||
content += strings.Replace(imageTag, src, imagePath, 1)
|
||||
} else {
|
||||
content = strings.Replace(content, src, imagePath, 1)
|
||||
if c.config.imagesOnly == false {
|
||||
content = c.Content()
|
||||
}
|
||||
|
||||
// parse content
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// download images and replace src in img tags of content
|
||||
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||
src, _ := s.Attr("src")
|
||||
src = strings.Split(src, "?")[0] // remove query part
|
||||
imagePath, _ := e.AddImage(src, "")
|
||||
|
||||
if c.config.imagesOnly {
|
||||
imageTag, _ := goquery.OuterHtml(s)
|
||||
content += strings.Replace(imageTag, src, imagePath, 1)
|
||||
} else {
|
||||
content = strings.Replace(content, src, imagePath, 1)
|
||||
}
|
||||
})
|
||||
|
||||
html := ""
|
||||
// add title only if imagesOnly = false
|
||||
if c.config.imagesOnly == false {
|
||||
html += fmt.Sprintf("<h1>%s</h1>", c.Name())
|
||||
}
|
||||
html += content
|
||||
|
||||
// write to epub file
|
||||
_, err = e.AddSection(html, c.Name(), "", "")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
})
|
||||
|
||||
html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), content)
|
||||
_, err = e.AddSection(html, c.Name(), "", "")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
for _, sc := range c.SubChapters() {
|
||||
AppendToEpub(e, sc, false)
|
||||
AppendToEpub(e, sc)
|
||||
}
|
||||
}
|
||||
|
||||
func ToMobi(c chapter, filename string) string {
|
||||
if len(filename) == 0 {
|
||||
filename = fmt.Sprintf("%s.mobi", c.Name())
|
||||
filename = fmt.Sprintf("%s.mobi", Filename(c.Name()))
|
||||
} else {
|
||||
|
||||
// add .mobi extension if not specified
|
||||
@@ -123,12 +134,10 @@ func ToMobi(c chapter, filename string) string {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
|
||||
fmt.Printf("Ebook saved to \"%s\"\n", filename)
|
||||
|
||||
err := os.Remove(filenameEPUB)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
|
||||
return filename
|
||||
}
|
||||
|
||||
@@ -22,12 +22,12 @@ func TestToMarkdown(t *testing.T) {
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||
|
||||
got := ToMarkdown(c)
|
||||
want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011"
|
||||
want := "Books\n=====\n\n- [Discours de la Méthode](https://books.lapw.at/posts/ren%C3%A9-descartes-discours-de-la-m%C3%A9thode/)clock 98 min read -\n1637\n\n- [The Twelve-Factor App](https://books.lapw.at/posts/adam-wiggins-the-twelve-factor-app/)clock 22 min read -\n2011\n\n\n"
|
||||
|
||||
if got != want {
|
||||
t.Errorf("got %q, wanted %q", got, want)
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
func TestToEpub(t *testing.T) {
|
||||
@@ -43,7 +43,7 @@ func TestToEpub(t *testing.T) {
|
||||
t.Errorf("cannot remove %v: %v", filename, err)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
func TestToEpubFilename(t *testing.T) {
|
||||
@@ -59,7 +59,7 @@ func TestToEpubFilename(t *testing.T) {
|
||||
t.Errorf("cannot remove %v: %v", filename, err)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
func TestToMobi(t *testing.T) {
|
||||
@@ -75,5 +75,5 @@ func TestToMobi(t *testing.T) {
|
||||
t.Errorf("cannot remove %v: %v", filename, err)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@@ -69,6 +69,13 @@ func NewScrapeConfigsWikipedia() []*ScrapeConfig {
|
||||
return []*ScrapeConfig{config0, config1}
|
||||
}
|
||||
|
||||
func NewScrapeConfigFake() *ScrapeConfig {
|
||||
config := NewScrapeConfig()
|
||||
config.include = false
|
||||
|
||||
return config
|
||||
}
|
||||
|
||||
func NewBookFromURL(url, selector, name, author string, recursive, include, imagesOnly bool, limit, offset, delay, threads int) book {
|
||||
config1 := NewScrapeConfig()
|
||||
config1.imagesOnly = imagesOnly
|
||||
@@ -85,7 +92,7 @@ func NewBookFromURL(url, selector, name, author string, recursive, include, imag
|
||||
config2.threads = threads
|
||||
config2.include = include
|
||||
config2.imagesOnly = imagesOnly
|
||||
chapters, home = tableOfContent(url, config2)
|
||||
chapters, home = tableOfContent(url, config2, config1)
|
||||
} else {
|
||||
chapters = []chapter{NewChapterFromURL(url, []*ScrapeConfig{config1}, 0, func(index int, name string) {})}
|
||||
home = chapters[0]
|
||||
@@ -136,7 +143,7 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updatePro
|
||||
}
|
||||
name := article.Title
|
||||
|
||||
// notify progress bar
|
||||
// notify progress bar with new name
|
||||
updateProgressBarName(index, name)
|
||||
|
||||
subchapters := []chapter{}
|
||||
@@ -222,6 +229,8 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updatePro
|
||||
content = ""
|
||||
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||
imageTag, _ := goquery.OuterHtml(s)
|
||||
imageTag = strings.ReplaceAll(imageTag, "\n", "")
|
||||
|
||||
content += imageTag
|
||||
})
|
||||
|
||||
@@ -231,7 +240,7 @@ func NewChapterFromURL(url string, configs []*ScrapeConfig, index int, updatePro
|
||||
return chapter{string(body), name, article.Byline, content, subchapters, config}
|
||||
}
|
||||
|
||||
func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) {
|
||||
func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig) ([]chapter, chapter) {
|
||||
base, err := urllib.Parse(url)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
@@ -243,7 +252,7 @@ func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) {
|
||||
}
|
||||
|
||||
chapters := make([]chapter, len(links))
|
||||
progress := NewProgress(links, "", 0)
|
||||
// progress := NewProgress(links, "", 0)
|
||||
delay := config.delay
|
||||
|
||||
if delay >= 0 {
|
||||
@@ -256,9 +265,9 @@ func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
sc := NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||
sc := NewChapterFromURL(u.String(), []*ScrapeConfig{subConfig}, 0, func(index int, name string) {})
|
||||
chapters[index] = sc
|
||||
progress.Increment(index)
|
||||
// progress.Increment(index)
|
||||
|
||||
// short sleep for last chapter to let the progress bar update
|
||||
if index == len(links)-1 {
|
||||
@@ -292,9 +301,9 @@ func tableOfContent(url string, config *ScrapeConfig) ([]chapter, chapter) {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
sc := NewChapterFromURL(u.String(), []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
|
||||
sc := NewChapterFromURL(u.String(), []*ScrapeConfig{subConfig}, 0, func(index int, name string) {})
|
||||
chapters[index] = sc
|
||||
progress.Increment(index)
|
||||
// progress.Increment(index)
|
||||
|
||||
<-semaphore
|
||||
}(index, l)
|
||||
|
||||
@@ -153,7 +153,7 @@ func TestSubChaptersLimitOver(t *testing.T) {
|
||||
|
||||
config0 := NewScrapeConfig()
|
||||
config0.limit = 3
|
||||
|
||||
|
||||
config1 := NewScrapeConfig()
|
||||
|
||||
c := NewChapterFromURL("https://books.lapw.at/", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
|
||||
|
||||
157
cmd/get.go
157
cmd/get.go
@@ -5,18 +5,14 @@ import (
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
|
||||
md "github.com/JohannesKaufmann/html-to-markdown"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
epub "github.com/bmaupin/go-epub"
|
||||
"github.com/spf13/cobra"
|
||||
|
||||
"github.com/lapwat/papeer/book"
|
||||
)
|
||||
|
||||
var recursive, include, images bool
|
||||
var recursive, include, images, quiet bool
|
||||
var format, output, selector, name, author string
|
||||
var limit, offset, delay, threads int
|
||||
|
||||
@@ -79,159 +75,46 @@ var getCmd = &cobra.Command{
|
||||
url := args[0]
|
||||
b := book.NewBookFromURL(url, selector, name, author, recursive, include, images, limit, offset, delay, threads)
|
||||
|
||||
if len(output) == 0 {
|
||||
// set default output
|
||||
output = strings.ReplaceAll(b.Name(), " ", "_")
|
||||
output = strings.ReplaceAll(output, "/", "")
|
||||
output = fmt.Sprintf("%s.%s", output, format)
|
||||
}
|
||||
fakeConfig := book.NewScrapeConfigFake()
|
||||
fakeChapter := book.NewChapter("", b.Name(), b.Author(), "", b.Chapters(), fakeConfig)
|
||||
|
||||
if format == "stdout" {
|
||||
|
||||
for _, c := range b.Chapters() {
|
||||
// convert to markdown
|
||||
content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
text := fmt.Sprintf("%s\n%s\n\n%s\n\n\n", c.Name(), strings.Repeat("=", len(c.Name())), content)
|
||||
|
||||
// write to stdout
|
||||
fmt.Println(text)
|
||||
}
|
||||
|
||||
// TODO: ToMarkdownString
|
||||
markdown := book.ToMarkdown(fakeChapter)
|
||||
fmt.Println(markdown)
|
||||
}
|
||||
|
||||
if format == "md" {
|
||||
// TODO: ToMarkdownFile
|
||||
markdown := book.ToMarkdown(fakeChapter)
|
||||
|
||||
// create markdown file
|
||||
if len(output) == 0 {
|
||||
filename := book.Filename(fakeChapter.Name())
|
||||
output = fmt.Sprintf("%s.md", filename)
|
||||
}
|
||||
|
||||
// write to file
|
||||
f, err := os.Create(output)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
for _, c := range b.Chapters() {
|
||||
// convert to markdown
|
||||
content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
text := fmt.Sprintf("%s\n%s\n\n%s\n\n\n", c.Name(), strings.Repeat("=", len(c.Name())), content)
|
||||
|
||||
// write to markdown file
|
||||
_, err = f.WriteString(text)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
_, err2 := f.WriteString(markdown)
|
||||
if err2 != nil {
|
||||
log.Fatal(err2)
|
||||
}
|
||||
f.Close()
|
||||
|
||||
fmt.Printf("Markdown saved to \"%s\"\n", output)
|
||||
}
|
||||
|
||||
if format == "epub" {
|
||||
e := epub.NewEpub(b.Name())
|
||||
e.SetAuthor(b.Author())
|
||||
|
||||
for _, c := range b.Chapters() {
|
||||
var content string
|
||||
|
||||
if images == false {
|
||||
content = c.Content()
|
||||
}
|
||||
|
||||
// parse content
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// retrieve images and download it
|
||||
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||
src, _ := s.Attr("src")
|
||||
imagePath, _ := e.AddImage(src, "")
|
||||
|
||||
if images {
|
||||
imageTag, _ := goquery.OuterHtml(s)
|
||||
content += imageTag
|
||||
}
|
||||
|
||||
content = strings.ReplaceAll(content, src, imagePath)
|
||||
})
|
||||
|
||||
html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), content)
|
||||
_, err = e.AddSection(html, c.Name(), "", "")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
err := e.Write(output)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
output = book.ToEpub(fakeChapter, output)
|
||||
fmt.Printf("Ebook saved to \"%s\"\n", output)
|
||||
}
|
||||
|
||||
if format == "mobi" {
|
||||
e := epub.NewEpub(b.Name())
|
||||
e.SetAuthor(b.Author())
|
||||
|
||||
for _, c := range b.Chapters() {
|
||||
var content string
|
||||
|
||||
if images == false {
|
||||
content = c.Content()
|
||||
}
|
||||
|
||||
// parse content
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// retrieve images and download it
|
||||
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||
src, _ := s.Attr("src")
|
||||
imagePath, _ := e.AddImage(src, "")
|
||||
|
||||
if images {
|
||||
imageTag, _ := goquery.OuterHtml(s)
|
||||
content += imageTag
|
||||
}
|
||||
|
||||
content = strings.ReplaceAll(content, src, imagePath)
|
||||
})
|
||||
|
||||
html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), content)
|
||||
_, err = e.AddSection(html, c.Name(), "", "")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
outputEPUB := strings.ReplaceAll(output, ".mobi", ".epub")
|
||||
|
||||
err := e.Write(outputEPUB)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
exec.Command("kindlegen", outputEPUB).Run()
|
||||
// the exec command always returns status 1, even when it succeeds
|
||||
// if err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
|
||||
output = book.ToMobi(fakeChapter, output)
|
||||
fmt.Printf("Ebook saved to \"%s\"\n", output)
|
||||
|
||||
err = os.Remove(outputEPUB)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
|
||||
Use: "version",
|
||||
Short: "Print the version number of papeer",
|
||||
Run: func(cmd *cobra.Command, args []string) {
|
||||
fmt.Println("papeer v0.3.1")
|
||||
fmt.Println("papeer v0.3.2")
|
||||
},
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user