add book name and author options

This commit is contained in:
lapwat
2021-12-19 18:20:54 +01:00
parent e7ffd8c66c
commit 4e9b0611e8
7 changed files with 118 additions and 55 deletions

10
Makefile Normal file
View File

@@ -0,0 +1,10 @@
# None of these targets produce a file with the same name. Declaring them
# phony keeps a stray file or directory called "install", "format" or "clean"
# from making the target look up to date and silently skipping its recipe.
.PHONY: install format clean

# Compile and install the command with the Go toolchain.
install:
	go install

# Rewrite every Go source file under the current directory in place,
# applying gofmt's simplifications (-s).
format:
	gofmt -s -w .

# Delete Markdown (except README.md), EPUB and MOBI files from the top level
# of the current directory; -maxdepth 1 keeps subdirectories untouched.
clean:
	find . -maxdepth 1 -not -name 'README.md' -name '*.md' -delete
	find . -maxdepth 1 -name '*.epub' -delete
	find . -maxdepth 1 -name '*.mobi' -delete

View File

@@ -19,14 +19,16 @@ Available Commands:
version Print the version number of papeer
Flags:
-a, --author string book author
-d, --delay int time to wait before downloading next chapter, in milliseconds (default -1)
-f, --format string file format [stdout, md, epub, mobi] (default "stdout")
-h, --help help for papeer
--images retrieve images only
-i, --include include URL as first chapter, in recursive mode
-l, --limit int limit number of chapters, in recursive mode (default -1)
-n, --name string book name (default: page title)
-o, --offset int skip first chapters, in recursive mode
--output string output file
--output string file name (default: book name)
-r, --recursive create one chapter per navigation item
-s, --selector string table of contents CSS selector, in recursive mode
-t, --threads int download concurrency, in recursive mode (default -1)
@@ -134,4 +136,4 @@ You can replace `bash` by your own shell (zsh, fish or powershell).
- `html-to-markdown` convert HTML to Markdown
- `go-epub` convert HTML to EPUB
- `colly` query HTML trees
- `uiprogress` display progress bars
- `uiprogress` display progress bars

View File

@@ -14,25 +14,34 @@ import (
colly "github.com/gocolly/colly/v2"
)
func NewBookFromURL(url, selector string, recursive, include, images bool, limit, offset, delay, threads int) book {
func NewBookFromURL(url, selector, name, author string, recursive, include bool, limit, offset, delay, threads int) book {
var chapters []chapter
var home chapter
if recursive {
chapters := tableOfContent(url, selector, limit, offset, delay, threads, include, images)
b := New(chapters[0].Name(), chapters[0].Author())
for _, c := range chapters {
b.AddChapter(c)
}
return b
chapters, home = tableOfContent(url, selector, limit, offset, delay, threads, include)
} else {
c := NewChapterFromURL(url, images)
b := New(c.Name(), c.Author())
b.AddChapter(c)
return b
chapters = []chapter{NewChapterFromURL(url)}
home = chapters[0]
}
if len(name) == 0 {
name = home.Name()
}
if len(author) == 0 {
author = home.Author()
}
b := New(name, author)
for _, c := range chapters {
b.AddChapter(c)
}
return b
}
func NewChapterFromURL(url string, images bool) chapter {
func NewChapterFromURL(url string) chapter {
article, err := readability.FromURL(url, 30*time.Second)
if err != nil {
log.Fatalf("failed to parse %s, %v\n", url, err)
@@ -40,31 +49,31 @@ func NewChapterFromURL(url string, images bool) chapter {
content := strings.ReplaceAll(article.Content, "\n", "")
if images {
// parse html content
doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
if err != nil {
log.Fatal(err)
}
// if images {
// // parse html content
// doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
// if err != nil {
// log.Fatal(err)
// }
// extract images only
content = ""
doc.Find("img").Each(func(i int, s *goquery.Selection) {
newContent, _ := goquery.OuterHtml(s)
content += newContent
})
}
// // extract images only
// content = ""
// doc.Find("img").Each(func(i int, s *goquery.Selection) {
// newContent, _ := goquery.OuterHtml(s)
// content += newContent
// })
// }
return chapter{article.Title, article.Byline, content}
}
func tableOfContent(url, selector string, limit, offset, delay, threads int, include, images bool) []chapter {
func tableOfContent(url, selector string, limit, offset, delay, threads int, include bool) ([]chapter, chapter) {
base, err := urllib.Parse(url)
if err != nil {
log.Fatal(err)
}
links, err := GetLinks(base, selector, limit, offset, include)
links, home, err := GetLinks(base, selector, limit, offset, include)
if err != nil {
log.Fatal(err)
}
@@ -82,7 +91,7 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc
log.Fatal(err)
}
chapters[index] = NewChapterFromURL(u.String(), images)
chapters[index] = NewChapterFromURL(u.String())
progress.Incr(index)
// short sleep for last chapter to let the progress bar update
@@ -116,7 +125,7 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc
log.Fatal(err)
}
chapters[index] = NewChapterFromURL(u.String(), images)
chapters[index] = NewChapterFromURL(u.String())
progress.Incr(index)
<-semaphore
@@ -124,7 +133,8 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc
}
wg.Wait()
}
return chapters
return chapters, home
}
func GetPath(elm *goquery.Selection) string {
@@ -144,7 +154,7 @@ func GetPath(elm *goquery.Selection) string {
return join
}
func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) ([]link, error) {
func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) ([]link, chapter, error) {
selectorSet := true
if selector == "" {
selector = "a"
@@ -182,7 +192,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool)
links := pathLinks[pathMax]
if len(links) == 0 {
return []link{}, fmt.Errorf("no link found for selector: %s", selector)
return []link{}, chapter{}, fmt.Errorf("no link found for selector: %s", selector)
}
end := len(links)
@@ -192,11 +202,12 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool)
links = links[offset:end]
home := NewChapterFromURL(url.String())
if include {
c := NewChapterFromURL(url.String(), false)
l := NewLink(url.String(), c.Name())
l := NewLink(url.String(), home.Name())
links = append([]link{l}, links...)
}
return links, nil
return links, home, nil
}

View File

@@ -17,7 +17,7 @@ import (
)
var recursive, include, images bool
var format, output, selector string
var format, output, selector, name, author string
var limit, offset, delay, threads int
var getCmd = &cobra.Command{
@@ -77,7 +77,7 @@ var getCmd = &cobra.Command{
},
Run: func(cmd *cobra.Command, args []string) {
url := args[0]
b := book.NewBookFromURL(url, selector, recursive, include, images, limit, offset, delay, threads)
b := book.NewBookFromURL(url, selector, name, author, recursive, include, limit, offset, delay, threads)
if len(output) == 0 {
// set default output
@@ -136,27 +136,36 @@ var getCmd = &cobra.Command{
e.SetAuthor(b.Author())
for _, c := range b.Chapters() {
// parse content
var content string
if images == false {
content = c.Content()
}
// parse content
doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
if err != nil {
log.Fatal(err)
}
// retrieve images and download it
contentWithLocalImages := c.Content()
doc.Find("img").Each(func(i int, s *goquery.Selection) {
src, _ := s.Attr("src")
imagePath, _ := e.AddImage(src, "")
contentWithLocalImages = strings.ReplaceAll(contentWithLocalImages, src, imagePath)
if images {
imageTag, _ := goquery.OuterHtml(s)
content += imageTag
}
content = strings.ReplaceAll(content, src, imagePath)
})
html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), contentWithLocalImages)
html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), content)
_, err = e.AddSection(html, c.Name(), "", "")
if err != nil {
log.Fatal(err)
}
}
err := e.Write(output)
@@ -171,8 +180,37 @@ var getCmd = &cobra.Command{
e := epub.NewEpub(b.Name())
e.SetAuthor(b.Author())
for _, chapter := range b.Chapters() {
e.AddSection(chapter.Content(), chapter.Name(), "", "")
for _, c := range b.Chapters() {
var content string
if images == false {
content = c.Content()
}
// parse content
doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
if err != nil {
log.Fatal(err)
}
// retrieve images and download it
doc.Find("img").Each(func(i int, s *goquery.Selection) {
src, _ := s.Attr("src")
imagePath, _ := e.AddImage(src, "")
if images {
imageTag, _ := goquery.OuterHtml(s)
content += imageTag
}
content = strings.ReplaceAll(content, src, imagePath)
})
html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), content)
_, err = e.AddSection(html, c.Name(), "", "")
if err != nil {
log.Fatal(err)
}
}
outputEPUB := strings.ReplaceAll(output, ".mobi", ".epub")
@@ -183,16 +221,16 @@ var getCmd = &cobra.Command{
}
exec.Command("kindlegen", outputEPUB).Run()
// exec command always return status 1 even if it fails
// exec command always returns status 1 even if it succeeds
// if err != nil {
// log.Fatal(err)
// }
fmt.Printf("Ebook saved to \"%s\"\n", output)
err2 := os.Remove(outputEPUB)
if err2 != nil {
log.Fatal(err2)
err = os.Remove(outputEPUB)
if err != nil {
log.Fatal(err)
}
}
},

View File

@@ -27,7 +27,7 @@ var listCmd = &cobra.Command{
log.Fatal(err)
}
links, err := book.GetLinks(base, selector, limit, offset, include)
links, _, err := book.GetLinks(base, selector, limit, offset, include)
if err != nil {
log.Fatal(err)
}

View File

@@ -23,8 +23,10 @@ func Execute() {
}
func init() {
rootCmd.PersistentFlags().StringVarP(&name, "name", "n", "", "book name (default: page title)")
rootCmd.PersistentFlags().StringVarP(&author, "author", "a", "", "book author")
rootCmd.PersistentFlags().StringVarP(&format, "format", "f", "stdout", "file format [stdout, md, epub, mobi]")
rootCmd.PersistentFlags().StringVarP(&output, "output", "", "", "output file")
rootCmd.PersistentFlags().StringVarP(&output, "output", "", "", "file name (default: book name)")
rootCmd.PersistentFlags().StringVarP(&selector, "selector", "s", "", "table of content CSS selector, in resursive mode")
rootCmd.PersistentFlags().BoolVarP(&recursive, "recursive", "r", false, "create one chapter per natigation item")
rootCmd.PersistentFlags().BoolVarP(&include, "include", "i", false, "include URL as first chapter, in resursive mode")

View File

@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
Use: "version",
Short: "Print the version number of papeer",
Run: func(cmd *cobra.Command, args []string) {
fmt.Println("papeer v0.3.0")
fmt.Println("papeer v0.3.1")
},
}