5 Commits

Author SHA1 Message Date
q
99b7d16de7 stdout: remove file after use 2025-03-06 18:26:14 +01:00
q
32168718c9 fix tests, fix hidden images, remove source label under title 2025-01-12 18:57:33 +01:00
lapwat
403fdcc0f0 [get] print url option 2024-08-14 23:32:33 +02:00
lapwat
1b2be1c390 update tests 2024-08-14 14:32:00 +02:00
lapwat
4521497d12 add pub date to chapter skeleton 2023-10-01 22:07:38 +02:00
8 changed files with 160 additions and 91 deletions

View File

@@ -1,6 +1,7 @@
package book package book
type chapter struct { type chapter struct {
url string
body string body string
name string name string
author string author string
@@ -10,11 +11,11 @@ type chapter struct {
} }
func NewEmptyChapter() chapter { func NewEmptyChapter() chapter {
return chapter{"", "", "", "", []chapter{}, NewScrapeConfigNoInclude()} return chapter{"", "", "", "", "", []chapter{}, NewScrapeConfigNoInclude()}
} }
func NewChapter(body, name, author, content string, subChapters []chapter, config *ScrapeConfig) chapter { func (c chapter) URL() string {
return chapter{body, name, author, content, subChapters, config} return c.url
} }
func (c chapter) Body() string { func (c chapter) Body() string {

View File

@@ -30,6 +30,11 @@ func ToMarkdownString(c chapter) string {
markdown += fmt.Sprintf("%s\n", c.Name()) markdown += fmt.Sprintf("%s\n", c.Name())
markdown += fmt.Sprintf("%s\n\n", strings.Repeat("=", len(c.Name()))) markdown += fmt.Sprintf("%s\n\n", strings.Repeat("=", len(c.Name())))
// url
if c.config.PrintURL {
markdown += fmt.Sprintf("_%s_\n\n", c.URL())
}
// convert content to markdown // convert content to markdown
content, err := md.NewConverter("", true, nil).ConvertString(c.Content()) content, err := md.NewConverter("", true, nil).ConvertString(c.Content())
if err != nil { if err != nil {
@@ -72,7 +77,15 @@ func ToHtmlString(c chapter) string {
// chapter content // chapter content
if c.config.Include { if c.config.Include {
html += fmt.Sprintf("<h1>%s</h1>", c.Name()) // title
html += fmt.Sprintf("<h1>%s</h1>\n", c.Name())
// url
if c.config.PrintURL {
html += fmt.Sprintf("<p><i>%s</i></p>\n", c.URL())
}
// content
html += c.Content() html += c.Content()
} }
@@ -114,19 +127,6 @@ func ToEpub(c chapter, filename string) string {
e := epub.NewEpub(c.Name()) e := epub.NewEpub(c.Name())
e.SetAuthor(c.Author()) e.SetAuthor(c.Author())
AppendToEpub(e, c)
err := e.Write(filename)
if err != nil {
log.Fatal(err)
}
return filename
}
func AppendToEpub(e *epub.Epub, c chapter) {
content := ""
// append table of content // append table of content
if len(c.SubChapters()) > 1 { if len(c.SubChapters()) > 1 {
html := "<h1>Table of Contents</h1>" html := "<h1>Table of Contents</h1>"
@@ -143,6 +143,19 @@ func AppendToEpub(e *epub.Epub, c chapter) {
} }
} }
AppendToEpub(e, c)
err := e.Write(filename)
if err != nil {
log.Fatal(err)
}
return filename
}
func AppendToEpub(e *epub.Epub, c chapter) {
content := ""
// chapter content // chapter content
if c.config.Include { if c.config.Include {
@@ -164,17 +177,24 @@ func AppendToEpub(e *epub.Epub, c chapter) {
if c.config.ImagesOnly { if c.config.ImagesOnly {
imageTag, _ := goquery.OuterHtml(s) imageTag, _ := goquery.OuterHtml(s)
content += strings.Replace(imageTag, src, imagePath, 1) content += strings.ReplaceAll(imageTag, src, imagePath)
} else { } else {
content = strings.Replace(content, src, imagePath, 1) content = strings.ReplaceAll(content, src, imagePath)
} }
}) })
html := "" html := ""
// add title only if ImagesOnly = false // add title only if ImagesOnly = false
if c.config.ImagesOnly == false { if c.config.ImagesOnly == false {
html += fmt.Sprintf("<h1>%s</h1>", c.Name()) html += fmt.Sprintf("<h1>%s</h1>\n", c.Name())
} }
// url
if c.config.PrintURL {
html += fmt.Sprintf("<p><i>%s</i></p>\n", c.URL())
}
// content
html += content html += content
// write to epub file // write to epub file

File diff suppressed because one or more lines are too long

View File

@@ -1,10 +1,13 @@
package book package book
import "time"
type link struct { type link struct {
Href string `json:"url"` Href string `json:"url"`
Text string `json:"name"` Text string `json:"name"`
Date *time.Time `json:"date"`
} }
func NewLink(href, text string) link { func NewLink(href, text string, date *time.Time) link {
return link{href, text} return link{href, text, date}
} }

View File

@@ -30,14 +30,19 @@ type ScrapeConfig struct {
Include bool Include bool
ImagesOnly bool ImagesOnly bool
UseLinkName bool UseLinkName bool
PrintURL bool
} }
func NewScrapeConfig() *ScrapeConfig { func NewScrapeConfig() *ScrapeConfig {
return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, true, false, false} return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, true, false, false, false}
}
func NewScrapeConfigQuiet() *ScrapeConfig {
return &ScrapeConfig{0, "", true, -1, 0, false, -1, -1, true, false, false, false}
} }
func NewScrapeConfigNoInclude() *ScrapeConfig { func NewScrapeConfigNoInclude() *ScrapeConfig {
return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, false, false, false} return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, false, false, false, false}
} }
func NewScrapeConfigs(selectors []string) []*ScrapeConfig { func NewScrapeConfigs(selectors []string) []*ScrapeConfig {
@@ -237,7 +242,6 @@ func NewChapterFromURL(url, linkName string, configs []*ScrapeConfig, index int,
content = "" content = ""
doc.Find("img").Each(func(i int, s *goquery.Selection) { doc.Find("img").Each(func(i int, s *goquery.Selection) {
imageTag, _ := goquery.OuterHtml(s) imageTag, _ := goquery.OuterHtml(s)
// imageTag = strings.ReplaceAll(imageTag, "\n", "")
content += imageTag content += imageTag
}) })
@@ -252,7 +256,7 @@ func NewChapterFromURL(url, linkName string, configs []*ScrapeConfig, index int,
} }
return chapter{string(body), name, article.Byline, content, subchapters, config} return chapter{url, string(body), name, article.Byline, content, subchapters, config}
} }
func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, quiet bool) ([]chapter, chapter) { func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, quiet bool) ([]chapter, chapter) {
@@ -370,7 +374,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
log.Fatal(err) log.Fatal(err)
} }
links = append(links, NewLink(u.String(), item.Title)) links = append(links, NewLink(u.String(), item.Title, item.PublishedParsed))
} }
pathMax = "RSS" pathMax = "RSS"
@@ -405,7 +409,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
// if selector is set, we use the selector specified by the user // if selector is set, we use the selector specified by the user
key = selector key = selector
pathLinks[key] = append(pathLinks[key], NewLink(href, text)) pathLinks[key] = append(pathLinks[key], NewLink(href, text, &time.Time{}))
pathCount[key] += 1 pathCount[key] += 1
pathMax = key pathMax = key
@@ -419,7 +423,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
// we count this key if the link text is not empty // we count this key if the link text is not empty
if text != "" { if text != "" {
pathLinks[key] = append(pathLinks[key], NewLink(href, text)) pathLinks[key] = append(pathLinks[key], NewLink(href, text, &time.Time{}))
pathCount[key] += len(text) pathCount[key] += len(text)
if pathCount[key] > pathCount[pathMax] { if pathCount[key] > pathCount[pathMax] {
@@ -449,7 +453,7 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, incl
// include home page // include home page
if include { if include {
l := NewLink(url.String(), home.Name()) l := NewLink(url.String(), home.Name(), &time.Time{})
links = append([]link{l}, links...) links = append([]link{l}, links...)
} }

File diff suppressed because one or more lines are too long

View File

@@ -6,6 +6,7 @@ import (
"fmt" "fmt"
"io/ioutil" "io/ioutil"
"log" "log"
"os"
"strings" "strings"
"github.com/spf13/cobra" "github.com/spf13/cobra"
@@ -33,6 +34,7 @@ type GetOptions struct {
threads int threads int
include bool include bool
useLinkName bool useLinkName bool
printURL bool
} }
var getOpts *GetOptions var getOpts *GetOptions
@@ -46,6 +48,7 @@ func init() {
getCmd.Flags().StringVarP(&getOpts.output, "output", "", "", "file name (default: book name)") getCmd.Flags().StringVarP(&getOpts.output, "output", "", "", "file name (default: book name)")
getCmd.Flags().BoolVarP(&getOpts.stdout, "stdout", "", false, "print to standard output") getCmd.Flags().BoolVarP(&getOpts.stdout, "stdout", "", false, "print to standard output")
getCmd.Flags().BoolVarP(&getOpts.images, "images", "", false, "retrieve images only") getCmd.Flags().BoolVarP(&getOpts.images, "images", "", false, "retrieve images only")
getCmd.Flags().BoolVarP(&getOpts.printURL, "print-url", "", false, "print url after chapter title")
getCmd.Flags().BoolVarP(&getOpts.quiet, "quiet", "q", false, "hide progress bar") getCmd.Flags().BoolVarP(&getOpts.quiet, "quiet", "q", false, "hide progress bar")
// common with list command // common with list command
@@ -147,6 +150,7 @@ var getCmd = &cobra.Command{
config.ImagesOnly = getOpts.images config.ImagesOnly = getOpts.images
config.Include = getOpts.include config.Include = getOpts.include
config.UseLinkName = getOpts.useLinkName config.UseLinkName = getOpts.useLinkName
config.PrintURL = getOpts.printURL
// do not use link name for root level as there is not parent link // do not use link name for root level as there is not parent link
if index == 0 { if index == 0 {
@@ -177,6 +181,7 @@ var getCmd = &cobra.Command{
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
os.Remove(filename)
fmt.Println(string(bytesRead)) fmt.Println(string(bytesRead))
} else { } else {
@@ -191,6 +196,7 @@ var getCmd = &cobra.Command{
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
os.Remove(filename)
book := make(map[string]interface{}) book := make(map[string]interface{})
book["name"] = c.Name() book["name"] = c.Name()
@@ -212,6 +218,7 @@ var getCmd = &cobra.Command{
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
os.Remove(filename)
fmt.Println(string(bytesRead)) fmt.Println(string(bytesRead))
} else { } else {
@@ -227,6 +234,7 @@ var getCmd = &cobra.Command{
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
os.Remove(filename)
fmt.Println(string(bytesRead)) fmt.Println(string(bytesRead))
} else { } else {
@@ -242,6 +250,7 @@ var getCmd = &cobra.Command{
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
os.Remove(filename)
fmt.Println(string(bytesRead)) fmt.Println(string(bytesRead))
} else { } else {

View File

@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
Use: "version", Use: "version",
Short: "Print the version number of papeer", Short: "Print the version number of papeer",
Run: func(cmd *cobra.Command, args []string) { Run: func(cmd *cobra.Command, args []string) {
fmt.Println("papeer v0.8.0") fmt.Println("papeer v0.8.4")
}, },
} }