5 Commits

Author SHA1 Message Date
q
32168718c9 fix tests, fix hidden images, remove source label under title 2025-01-12 18:57:33 +01:00
lapwat
403fdcc0f0 [get] print url option 2024-08-14 23:32:33 +02:00
lapwat
1b2be1c390 update tests 2024-08-14 14:32:00 +02:00
lapwat
4521497d12 add pub date to chapter skeleton 2023-10-01 22:07:38 +02:00
lapwat
2cbcf17cc2 [get] json format 2023-10-01 16:00:27 +02:00
10 changed files with 181 additions and 94 deletions

View File

@@ -15,7 +15,7 @@ jobs:
goos: windows goos: windows
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v3
- uses: wangyoucao577/go-release-action@v1.30 - uses: wangyoucao577/go-release-action@v1.40
with: with:
github_token: ${{ secrets.GITHUB_TOKEN }} github_token: ${{ secrets.GITHUB_TOKEN }}
goos: ${{ matrix.goos }} goos: ${{ matrix.goos }}

View File

@@ -40,7 +40,9 @@ Download [latest release](https://github.com/lapwat/papeer/releases/latest) for
## MOBI support ## MOBI support
Install kindlegen to convert websites, Linux only. > Kindle e-readers now support EPUB format
Install kindlegen to export websites to Kindle compatible ebooks, Linux only.
```sh ```sh
TMPDIR=$(mktemp -d -t papeer-XXXXX) TMPDIR=$(mktemp -d -t papeer-XXXXX)

View File

@@ -1,6 +1,7 @@
package book package book
type chapter struct { type chapter struct {
url string
body string body string
name string name string
author string author string
@@ -10,11 +11,11 @@ type chapter struct {
} }
func NewEmptyChapter() chapter { func NewEmptyChapter() chapter {
return chapter{"", "", "", "", []chapter{}, NewScrapeConfigNoInclude()} return chapter{"", "", "", "", "", []chapter{}, NewScrapeConfigNoInclude()}
} }
func NewChapter(body, name, author, content string, subChapters []chapter, config *ScrapeConfig) chapter { func (c chapter) URL() string {
return chapter{body, name, author, content, subChapters, config} return c.url
} }
func (c chapter) Body() string { func (c chapter) Body() string {

View File

@@ -30,6 +30,11 @@ func ToMarkdownString(c chapter) string {
markdown += fmt.Sprintf("%s\n", c.Name()) markdown += fmt.Sprintf("%s\n", c.Name())
markdown += fmt.Sprintf("%s\n\n", strings.Repeat("=", len(c.Name()))) markdown += fmt.Sprintf("%s\n\n", strings.Repeat("=", len(c.Name())))
// url
if c.config.PrintURL {
markdown += fmt.Sprintf("_%s_\n\n", c.URL())
}
// convert content to markdown // convert content to markdown
content, err := md.NewConverter("", true, nil).ConvertString(c.Content()) content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
if err != nil { if err != nil {
@@ -72,7 +77,15 @@ func ToHtmlString(c chapter) string {
// chapter content // chapter content
if c.config.Include { if c.config.Include {
html += fmt.Sprintf("<h1>%s</h1>", c.Name()) // title
html += fmt.Sprintf("<h1>%s</h1>\n", c.Name())
// url
if c.config.PrintURL {
html += fmt.Sprintf("<p><i>%s</i></p>\n", c.URL())
}
// content
html += c.Content() html += c.Content()
} }
@@ -114,19 +127,6 @@ func ToEpub(c chapter, filename string) string {
e := epub.NewEpub(c.Name()) e := epub.NewEpub(c.Name())
e.SetAuthor(c.Author()) e.SetAuthor(c.Author())
AppendToEpub(e, c)
err := e.Write(filename)
if err != nil {
log.Fatal(err)
}
return filename
}
func AppendToEpub(e *epub.Epub, c chapter) {
content := ""
// append table of contents // append table of contents
if len(c.SubChapters()) > 1 { if len(c.SubChapters()) > 1 {
html := "<h1>Table of Contents</h1>" html := "<h1>Table of Contents</h1>"
@@ -143,6 +143,19 @@ func AppendToEpub(e *epub.Epub, c chapter) {
} }
} }
AppendToEpub(e, c)
err := e.Write(filename)
if err != nil {
log.Fatal(err)
}
return filename
}
func AppendToEpub(e *epub.Epub, c chapter) {
content := ""
// chapter content // chapter content
if c.config.Include { if c.config.Include {
@@ -164,17 +177,24 @@ func AppendToEpub(e *epub.Epub, c chapter) {
if c.config.ImagesOnly { if c.config.ImagesOnly {
imageTag, _ := goquery.OuterHtml(s) imageTag, _ := goquery.OuterHtml(s)
content += strings.Replace(imageTag, src, imagePath, 1) content += strings.ReplaceAll(imageTag, src, imagePath)
} else { } else {
content = strings.Replace(content, src, imagePath, 1) content = strings.ReplaceAll(content, src, imagePath)
} }
}) })
html := "" html := ""
// add title only if ImagesOnly = false // add title only if ImagesOnly = false
if c.config.ImagesOnly == false { if c.config.ImagesOnly == false {
html += fmt.Sprintf("<h1>%s</h1>", c.Name()) html += fmt.Sprintf("<h1>%s</h1>\n", c.Name())
} }
// url
if c.config.PrintURL {
html += fmt.Sprintf("<p><i>%s</i></p>\n", c.URL())
}
// content
html += content html += content
// write to epub file // write to epub file

File diff suppressed because one or more lines are too long

View File

@@ -1,10 +1,13 @@
package book package book
import "time"
type link struct { type link struct {
Href string `json:"url"` Href string `json:"url"`
Text string `json:"name"` Text string `json:"name"`
Date *time.Time `json:"date"`
} }
func NewLink(href, text string) link { func NewLink(href, text string, date *time.Time) link {
return link{href, text} return link{href, text, date}
} }

View File

@@ -30,14 +30,19 @@ type ScrapeConfig struct {
Include bool Include bool
ImagesOnly bool ImagesOnly bool
UseLinkName bool UseLinkName bool
PrintURL bool
} }
func NewScrapeConfig() *ScrapeConfig { func NewScrapeConfig() *ScrapeConfig {
return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, true, false, false} return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, true, false, false, false}
}
func NewScrapeConfigQuiet() *ScrapeConfig {
return &ScrapeConfig{0, "", true, -1, 0, false, -1, -1, true, false, false, false}
} }
func NewScrapeConfigNoInclude() *ScrapeConfig { func NewScrapeConfigNoInclude() *ScrapeConfig {
return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, false, false, false} return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, false, false, false, false}
} }
func NewScrapeConfigs(selectors []string) []*ScrapeConfig { func NewScrapeConfigs(selectors []string) []*ScrapeConfig {
@@ -237,7 +242,6 @@ func NewChapterFromURL(url, linkName string, configs []*ScrapeConfig, index int,
content = "" content = ""
doc.Find("img").Each(func(i int, s *goquery.Selection) { doc.Find("img").Each(func(i int, s *goquery.Selection) {
imageTag, _ := goquery.OuterHtml(s) imageTag, _ := goquery.OuterHtml(s)
// imageTag = strings.ReplaceAll(imageTag, "\n", "")
content += imageTag content += imageTag
}) })
@@ -252,7 +256,7 @@ func NewChapterFromURL(url, linkName string, configs []*ScrapeConfig, index int,
} }
return chapter{string(body), name, article.Byline, content, subchapters, config} return chapter{url, string(body), name, article.Byline, content, subchapters, config}
} }
func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, quiet bool) ([]chapter, chapter) { func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, quiet bool) ([]chapter, chapter) {
@@ -370,7 +374,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
log.Fatal(err) log.Fatal(err)
} }
links = append(links, NewLink(u.String(), item.Title)) links = append(links, NewLink(u.String(), item.Title, item.PublishedParsed))
} }
pathMax = "RSS" pathMax = "RSS"
@@ -405,7 +409,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
// if selector is set, we use the selector specified by the user // if selector is set, we use the selector specified by the user
key = selector key = selector
pathLinks[key] = append(pathLinks[key], NewLink(href, text)) pathLinks[key] = append(pathLinks[key], NewLink(href, text, &time.Time{}))
pathCount[key] += 1 pathCount[key] += 1
pathMax = key pathMax = key
@@ -419,7 +423,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
// we count this key if the link text is not empty // we count this key if the link text is not empty
if text != "" { if text != "" {
pathLinks[key] = append(pathLinks[key], NewLink(href, text)) pathLinks[key] = append(pathLinks[key], NewLink(href, text, &time.Time{}))
pathCount[key] += len(text) pathCount[key] += len(text)
if pathCount[key] > pathCount[pathMax] { if pathCount[key] > pathCount[pathMax] {
@@ -449,7 +453,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
// include home page // include home page
if include { if include {
l := NewLink(url.String(), home.Name()) l := NewLink(url.String(), home.Name(), &time.Time{})
links = append([]link{l}, links...) links = append([]link{l}, links...)
} }

File diff suppressed because one or more lines are too long

View File

@@ -1,6 +1,7 @@
package cmd package cmd
import ( import (
"encoding/json"
"errors" "errors"
"fmt" "fmt"
"io/ioutil" "io/ioutil"
@@ -32,6 +33,7 @@ type GetOptions struct {
threads int threads int
include bool include bool
useLinkName bool useLinkName bool
printURL bool
} }
var getOpts *GetOptions var getOpts *GetOptions
@@ -41,10 +43,11 @@ func init() {
getCmd.Flags().StringVarP(&getOpts.name, "name", "n", "", "book name (default: page title)") getCmd.Flags().StringVarP(&getOpts.name, "name", "n", "", "book name (default: page title)")
getCmd.Flags().StringVarP(&getOpts.author, "author", "a", "", "book author") getCmd.Flags().StringVarP(&getOpts.author, "author", "a", "", "book author")
getCmd.Flags().StringVarP(&getOpts.Format, "format", "f", "md", "file format [md, html, epub, mobi]") getCmd.Flags().StringVarP(&getOpts.Format, "format", "f", "md", "file format [md, json, html, epub, mobi]")
getCmd.Flags().StringVarP(&getOpts.output, "output", "", "", "file name (default: book name)") getCmd.Flags().StringVarP(&getOpts.output, "output", "", "", "file name (default: book name)")
getCmd.Flags().BoolVarP(&getOpts.stdout, "stdout", "", false, "print to standard output") getCmd.Flags().BoolVarP(&getOpts.stdout, "stdout", "", false, "print to standard output")
getCmd.Flags().BoolVarP(&getOpts.images, "images", "", false, "retrieve images only") getCmd.Flags().BoolVarP(&getOpts.images, "images", "", false, "retrieve images only")
getCmd.Flags().BoolVarP(&getOpts.printURL, "print-url", "", false, "print url after chapter title")
getCmd.Flags().BoolVarP(&getOpts.quiet, "quiet", "q", false, "hide progress bar") getCmd.Flags().BoolVarP(&getOpts.quiet, "quiet", "q", false, "hide progress bar")
// common with list command // common with list command
@@ -73,6 +76,7 @@ var getCmd = &cobra.Command{
// check provided format is in list // check provided format is in list
formatEnum := map[string]bool{ formatEnum := map[string]bool{
"md": true, "md": true,
"json": true,
"html": true, "html": true,
"epub": true, "epub": true,
"mobi": true, "mobi": true,
@@ -145,6 +149,7 @@ var getCmd = &cobra.Command{
config.ImagesOnly = getOpts.images config.ImagesOnly = getOpts.images
config.Include = getOpts.include config.Include = getOpts.include
config.UseLinkName = getOpts.useLinkName config.UseLinkName = getOpts.useLinkName
config.PrintURL = getOpts.printURL
// do not use link name for root level as there is no parent link // do not use link name for root level as there is no parent link
if index == 0 { if index == 0 {
@@ -182,6 +187,26 @@ var getCmd = &cobra.Command{
} }
} }
if getOpts.Format == "json" {
filename := book.ToMarkdown(c, getOpts.output)
bytesRead, err := ioutil.ReadFile(filename)
if err != nil {
log.Fatal(err)
}
book := make(map[string]interface{})
book["name"] = c.Name()
book["content"] = string(bytesRead)
bookJson, err := json.Marshal(book)
if err != nil {
log.Fatal(err)
}
fmt.Println(string(bookJson))
}
if getOpts.Format == "html" { if getOpts.Format == "html" {
filename := book.ToHtml(c, getOpts.output) filename := book.ToHtml(c, getOpts.output)

View File

@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
Use: "version", Use: "version",
Short: "Print the version number of papeer", Short: "Print the version number of papeer",
Run: func(cmd *cobra.Command, args []string) { Run: func(cmd *cobra.Command, args []string) {
fmt.Println("papeer v0.7.1") fmt.Println("papeer v0.8.3")
}, },
} }