mirror of
https://github.com/NohamR/papeer.git
synced 2026-05-25 20:00:47 +00:00
Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0009435769 | ||
|
|
4e9b0611e8 | ||
|
|
e7ffd8c66c | ||
|
|
84e6ad8585 | ||
|
|
d593a74e6e |
10
Makefile
Normal file
10
Makefile
Normal file
@@ -0,0 +1,10 @@
|
||||
install:
|
||||
go install
|
||||
|
||||
format:
|
||||
gofmt -s -w .
|
||||
|
||||
clean:
|
||||
find . -maxdepth 1 -not -name 'README.md' -name '*.md' -delete
|
||||
find . -maxdepth 1 -name '*.epub' -delete
|
||||
find . -maxdepth 1 -name '*.mobi' -delete
|
||||
@@ -19,14 +19,16 @@ Available Commands:
|
||||
version Print the version number of papeer
|
||||
|
||||
Flags:
|
||||
-a, --author string book author
|
||||
-d, --delay int time to wait before downloading next chapter, in milliseconds (default -1)
|
||||
-f, --format string file format [stdout, md, epub, mobi] (default "stdout")
|
||||
-h, --help help for papeer
|
||||
--images retrieve images only
|
||||
-i, --include include URL as first chapter, in recursive mode
|
||||
-l, --limit int limit number of chapters, in recursive mode (default -1)
|
||||
-n, --name string book name (default: page title)
|
||||
-o, --offset int skip first chapters, in recursive mode
|
||||
--output string output file
|
||||
--output string file name (default: book name)
|
||||
-r, --recursive create one chapter per navigation item
|
||||
-s, --selector string table of content CSS selector, in recursive mode
|
||||
-t, --threads int download concurrency, in recursive mode (default -1)
|
||||
@@ -95,14 +97,14 @@ go get -u github.com/lapwat/papeer
|
||||
|
||||
```sh
|
||||
platform=linux # use platform=darwin for MacOS
|
||||
curl -L https://github.com/lapwat/papeer/releases/download/v0.3.0/papeer-v0.3.0-$platform-amd64 > papeer
|
||||
curl -L https://github.com/lapwat/papeer/releases/download/v0.3.1/papeer-v0.3.1-$platform-amd64 > papeer
|
||||
chmod +x papeer
|
||||
sudo mv papeer /usr/local/bin
|
||||
```
|
||||
|
||||
### On Windows
|
||||
|
||||
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.3.0/papeer-v0.3.0-windows-amd64.exe).
|
||||
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.3.1/papeer-v0.3.1-windows-amd64.exe).
|
||||
|
||||
## Install kindlegen to export websites to MOBI (optional)
|
||||
|
||||
|
||||
@@ -14,25 +14,34 @@ import (
|
||||
colly "github.com/gocolly/colly/v2"
|
||||
)
|
||||
|
||||
func NewBookFromURL(url, selector string, recursive, include, images bool, limit, offset, delay, threads int) book {
|
||||
func NewBookFromURL(url, selector, name, author string, recursive, include bool, limit, offset, delay, threads int) book {
|
||||
var chapters []chapter
|
||||
var home chapter
|
||||
|
||||
if recursive {
|
||||
chapters := tableOfContent(url, selector, limit, offset, delay, threads, include, images)
|
||||
|
||||
b := New(chapters[0].Name(), chapters[0].Author())
|
||||
for _, c := range chapters {
|
||||
b.AddChapter(c)
|
||||
}
|
||||
|
||||
return b
|
||||
chapters, home = tableOfContent(url, selector, limit, offset, delay, threads, include)
|
||||
} else {
|
||||
c := NewChapterFromURL(url, images)
|
||||
b := New(c.Name(), c.Author())
|
||||
b.AddChapter(c)
|
||||
return b
|
||||
chapters = []chapter{NewChapterFromURL(url)}
|
||||
home = chapters[0]
|
||||
}
|
||||
|
||||
if len(name) == 0 {
|
||||
name = home.Name()
|
||||
}
|
||||
|
||||
if len(author) == 0 {
|
||||
author = home.Author()
|
||||
}
|
||||
|
||||
b := New(name, author)
|
||||
for _, c := range chapters {
|
||||
b.AddChapter(c)
|
||||
}
|
||||
|
||||
return b
|
||||
}
|
||||
|
||||
func NewChapterFromURL(url string, images bool) chapter {
|
||||
func NewChapterFromURL(url string) chapter {
|
||||
article, err := readability.FromURL(url, 30*time.Second)
|
||||
if err != nil {
|
||||
log.Fatalf("failed to parse %s, %v\n", url, err)
|
||||
@@ -40,31 +49,31 @@ func NewChapterFromURL(url string, images bool) chapter {
|
||||
|
||||
content := strings.ReplaceAll(article.Content, "\n", "")
|
||||
|
||||
if images {
|
||||
// parse html content
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
// if images {
|
||||
// // parse html content
|
||||
// doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
|
||||
// if err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
|
||||
// extract images only
|
||||
content = ""
|
||||
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||
newContent, _ := goquery.OuterHtml(s)
|
||||
content += newContent
|
||||
})
|
||||
}
|
||||
// // extract images only
|
||||
// content = ""
|
||||
// doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||
// newContent, _ := goquery.OuterHtml(s)
|
||||
// content += newContent
|
||||
// })
|
||||
// }
|
||||
|
||||
return chapter{article.Title, article.Byline, content}
|
||||
}
|
||||
|
||||
func tableOfContent(url, selector string, limit, offset, delay, threads int, include, images bool) []chapter {
|
||||
func tableOfContent(url, selector string, limit, offset, delay, threads int, include bool) ([]chapter, chapter) {
|
||||
base, err := urllib.Parse(url)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
links, err := GetLinks(base, selector, limit, offset, include)
|
||||
links, home, err := GetLinks(base, selector, limit, offset, include)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
@@ -82,7 +91,7 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
chapters[index] = NewChapterFromURL(u.String(), images)
|
||||
chapters[index] = NewChapterFromURL(u.String())
|
||||
progress.Incr(index)
|
||||
|
||||
// short sleep for last chapter to let the progress bar update
|
||||
@@ -116,7 +125,7 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
chapters[index] = NewChapterFromURL(u.String(), images)
|
||||
chapters[index] = NewChapterFromURL(u.String())
|
||||
progress.Incr(index)
|
||||
|
||||
<-semaphore
|
||||
@@ -124,7 +133,8 @@ func tableOfContent(url, selector string, limit, offset, delay, threads int, inc
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
return chapters
|
||||
|
||||
return chapters, home
|
||||
}
|
||||
|
||||
func GetPath(elm *goquery.Selection) string {
|
||||
@@ -144,7 +154,7 @@ func GetPath(elm *goquery.Selection) string {
|
||||
return join
|
||||
}
|
||||
|
||||
func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) ([]link, error) {
|
||||
func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) ([]link, chapter, error) {
|
||||
selectorSet := true
|
||||
if selector == "" {
|
||||
selector = "a"
|
||||
@@ -182,7 +192,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool)
|
||||
|
||||
links := pathLinks[pathMax]
|
||||
if len(links) == 0 {
|
||||
return []link{}, fmt.Errorf("no link found for selector: %s", selector)
|
||||
return []link{}, chapter{}, fmt.Errorf("no link found for selector: %s", selector)
|
||||
}
|
||||
|
||||
end := len(links)
|
||||
@@ -192,11 +202,12 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool)
|
||||
|
||||
links = links[offset:end]
|
||||
|
||||
home := NewChapterFromURL(url.String())
|
||||
|
||||
if include {
|
||||
c := NewChapterFromURL(url.String(), false)
|
||||
l := NewLink(url.String(), c.Name())
|
||||
l := NewLink(url.String(), home.Name())
|
||||
links = append([]link{l}, links...)
|
||||
}
|
||||
|
||||
return links, nil
|
||||
return links, home, nil
|
||||
}
|
||||
|
||||
62
cmd/get.go
62
cmd/get.go
@@ -17,7 +17,7 @@ import (
|
||||
)
|
||||
|
||||
var recursive, include, images bool
|
||||
var format, output, selector string
|
||||
var format, output, selector, name, author string
|
||||
var limit, offset, delay, threads int
|
||||
|
||||
var getCmd = &cobra.Command{
|
||||
@@ -77,7 +77,7 @@ var getCmd = &cobra.Command{
|
||||
},
|
||||
Run: func(cmd *cobra.Command, args []string) {
|
||||
url := args[0]
|
||||
b := book.NewBookFromURL(url, selector, recursive, include, images, limit, offset, delay, threads)
|
||||
b := book.NewBookFromURL(url, selector, name, author, recursive, include, limit, offset, delay, threads)
|
||||
|
||||
if len(output) == 0 {
|
||||
// set default output
|
||||
@@ -136,6 +136,12 @@ var getCmd = &cobra.Command{
|
||||
e.SetAuthor(b.Author())
|
||||
|
||||
for _, c := range b.Chapters() {
|
||||
var content string
|
||||
|
||||
if images == false {
|
||||
content = c.Content()
|
||||
}
|
||||
|
||||
// parse content
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
|
||||
if err != nil {
|
||||
@@ -143,20 +149,23 @@ var getCmd = &cobra.Command{
|
||||
}
|
||||
|
||||
// retrieve images and download it
|
||||
contentWithLocalImages := c.Content()
|
||||
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||
src, _ := s.Attr("src")
|
||||
imagePath, _ := e.AddImage(src, "")
|
||||
|
||||
contentWithLocalImages = strings.ReplaceAll(contentWithLocalImages, src, imagePath)
|
||||
if images {
|
||||
imageTag, _ := goquery.OuterHtml(s)
|
||||
content += imageTag
|
||||
}
|
||||
|
||||
content = strings.ReplaceAll(content, src, imagePath)
|
||||
})
|
||||
|
||||
html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), contentWithLocalImages)
|
||||
html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), content)
|
||||
_, err = e.AddSection(html, c.Name(), "", "")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
err := e.Write(output)
|
||||
@@ -171,8 +180,37 @@ var getCmd = &cobra.Command{
|
||||
e := epub.NewEpub(b.Name())
|
||||
e.SetAuthor(b.Author())
|
||||
|
||||
for _, chapter := range b.Chapters() {
|
||||
e.AddSection(chapter.Content(), chapter.Name(), "", "")
|
||||
for _, c := range b.Chapters() {
|
||||
var content string
|
||||
|
||||
if images == false {
|
||||
content = c.Content()
|
||||
}
|
||||
|
||||
// parse content
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(c.Content()))
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// retrieve images and download it
|
||||
doc.Find("img").Each(func(i int, s *goquery.Selection) {
|
||||
src, _ := s.Attr("src")
|
||||
imagePath, _ := e.AddImage(src, "")
|
||||
|
||||
if images {
|
||||
imageTag, _ := goquery.OuterHtml(s)
|
||||
content += imageTag
|
||||
}
|
||||
|
||||
content = strings.ReplaceAll(content, src, imagePath)
|
||||
})
|
||||
|
||||
html := fmt.Sprintf("<h1>%s</h1>%s", c.Name(), content)
|
||||
_, err = e.AddSection(html, c.Name(), "", "")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
outputEPUB := strings.ReplaceAll(output, ".mobi", ".epub")
|
||||
@@ -183,16 +221,16 @@ var getCmd = &cobra.Command{
|
||||
}
|
||||
|
||||
exec.Command("kindlegen", outputEPUB).Run()
|
||||
// exec command always return status 1 even if it fails
|
||||
// exec command always returns status 1 even if it succeeds
|
||||
// if err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
|
||||
fmt.Printf("Ebook saved to \"%s\"\n", output)
|
||||
|
||||
err2 := os.Remove(outputEPUB)
|
||||
if err2 != nil {
|
||||
log.Fatal(err2)
|
||||
err = os.Remove(outputEPUB)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
@@ -27,7 +27,7 @@ var listCmd = &cobra.Command{
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
links, err := book.GetLinks(base, selector, limit, offset, include)
|
||||
links, _, err := book.GetLinks(base, selector, limit, offset, include)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
@@ -23,8 +23,10 @@ func Execute() {
|
||||
}
|
||||
|
||||
func init() {
|
||||
rootCmd.PersistentFlags().StringVarP(&name, "name", "n", "", "book name (default: page title)")
|
||||
rootCmd.PersistentFlags().StringVarP(&author, "author", "a", "", "book author")
|
||||
rootCmd.PersistentFlags().StringVarP(&format, "format", "f", "stdout", "file format [stdout, md, epub, mobi]")
|
||||
rootCmd.PersistentFlags().StringVarP(&output, "output", "", "", "output file")
|
||||
rootCmd.PersistentFlags().StringVarP(&output, "output", "", "", "file name (default: book name)")
|
||||
rootCmd.PersistentFlags().StringVarP(&selector, "selector", "s", "", "table of content CSS selector, in resursive mode")
|
||||
rootCmd.PersistentFlags().BoolVarP(&recursive, "recursive", "r", false, "create one chapter per natigation item")
|
||||
rootCmd.PersistentFlags().BoolVarP(&include, "include", "i", false, "include URL as first chapter, in resursive mode")
|
||||
|
||||
@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
|
||||
Use: "version",
|
||||
Short: "Print the version number of papeer",
|
||||
Run: func(cmd *cobra.Command, args []string) {
|
||||
fmt.Println("papeer v0.3.0")
|
||||
fmt.Println("papeer v0.3.1")
|
||||
},
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user