3 Commits

Author SHA1 Message Date
lapwat
be69854b17 add reverse option 2022-02-21 00:32:39 +01:00
lapwat
d8a3cc027f fix: selector depth 2022-02-06 23:35:35 +01:00
lapwat
be45a8f744 update installation instructions 2022-02-05 11:58:02 +01:00
7 changed files with 79 additions and 41 deletions

View File

@@ -1,23 +1,22 @@
# Papeer # Papeer
Papeer is a powerful an **ereader internet vacuum**. It can scrape any website, removing ads and keeping only the relevant content (formatted text and images). You can export the content to Markdown, EPUB or MOBI files. Papeer is a powerful **ereader internet vacuum**. It can scrape any website, removing ads and keeping only the relevant content (formatted text and images). You can export the content to Markdown, EPUB or MOBI files.
# Table of contents # Table of contents
- [Usage](#usage) - [Usage](#usage)
- [Examples](#examples) * [Scrape a web page](#scrape-a-web-page)
* [Grab a single page](#grab-a-single-page) * [Scrape a whole website](#scrape-a-whole-website)
* [Grab several pages](#grab-several-pages)
+ [`selector` option](#-selector--option)
+ [`depth` option](#-depth--option) + [`depth` option](#-depth--option)
+ [Display table of contents](#display-table-of-contents) + [`selector` option](#-selector--option)
+ [Display the table of contents](#display-the-table-of-contents)
+ [Scrape time](#scrape-time) + [Scrape time](#scrape-time)
- [Installation](#installation) - [Installation](#installation)
* [From source](#from-source) * [From source](#from-source)
* [From binary](#from-binary) * [From binary](#from-binary)
+ [Linux / MacOS](#linux---macos) + [Linux / MacOS](#linux---macos)
+ [Windows](#windows) + [Windows](#windows)
* [MOBI support (optional)](#mobi-support--optional-) * [MOBI support](#mobi-support)
- [Autocompletion](#autocompletion) - [Autocompletion](#autocompletion)
- [Dependencies](#dependencies) - [Dependencies](#dependencies)
@@ -49,6 +48,7 @@ Flags:
-o, --offset int skip first chapters, use with depth/selector -o, --offset int skip first chapters, use with depth/selector
--output string file name (default: book name) --output string file name (default: book name)
-q, --quiet hide progress bar -q, --quiet hide progress bar
-r, --reverse reverse chapter order
-s, --selector strings table of contents CSS selector -s, --selector strings table of contents CSS selector
-t, --threads int download concurrency, use with depth/selector (default -1) -t, --threads int download concurrency, use with depth/selector (default -1)
--use-link-name use link name for chapter title --use-link-name use link name for chapter title
@@ -64,9 +64,9 @@ You can activate this mode by using the `depth` or `selector` options.
This option defaults to 0, `papeer` will grab only the main page. This option defaults to 0, `papeer` will grab only the main page.
If you specify a value greater than 0, `papeer` will grab only the pages as deep as the value you specify. If you specify a value greater than 0, `papeer` will grab pages as deep as the value you specify.
> Using `include` option will include all intermediary levels. > Using `include` option will include all intermediary levels into the book.
### `selector` option ### `selector` option
@@ -76,9 +76,9 @@ If this option is specified, `papeer` will select the links (a HTML tag) present
You can chain this option to grab several levels of pages with different selectors for each level. You can chain this option to grab several levels of pages with different selectors for each level.
## Display the table of contents ### Display the table of contents
Before actually scraping a whole website, it is a good idea to use the `list` command. This command is like a **dry run**, which lets you visualize the content before actually retrieving it. You can use several options to customize the table of contents extraction, such as `selector`, `limit`, `offset` and `include`. Type `papeer list --help` for more information about those options. Before actually scraping a whole website, it is a good idea to use the `list` command. This command is like a **dry run**, which lets you visualize the content before actually retrieving it. You can use several options to customize the table of contents extraction, such as `selector`, `limit`, `offset`, `reverse` and `include`. Type `papeer list --help` for more information about those options.
```sh ```sh
papeer list https://12factor.net/ -s 'section.concrete>article>h2>a' papeer list https://12factor.net/ -s 'section.concrete>article>h2>a'
@@ -138,7 +138,7 @@ go get -u github.com/lapwat/papeer
```sh ```sh
# use platform=darwin for MacOS # use platform=darwin for MacOS
platform=linux platform=linux
release=0.4.0 release=0.4.2
# download and extract # download and extract
curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz
@@ -146,15 +146,14 @@ tar xzvf papeer.tar.gz
rm papeer.tar.gz rm papeer.tar.gz
# move to user binaries # move to user binaries
chmod +x papeer
sudo mv papeer /usr/local/bin sudo mv papeer /usr/local/bin
``` ```
### Windows ### Windows
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.4.0/papeer-v0.4.0-windows-amd64.exe). Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.4.2/papeer-v0.4.2-windows-amd64.exe.zip).
## (optional) MOBI support ## MOBI support
Install kindlegen to convert websites, Linux only Install kindlegen to convert websites, Linux only

View File

@@ -23,6 +23,7 @@ type ScrapeConfig struct {
Quiet bool Quiet bool
Limit int Limit int
Offset int Offset int
Reverse bool
Delay int Delay int
Threads int Threads int
Include bool Include bool
@@ -31,7 +32,7 @@ type ScrapeConfig struct {
} }
func NewScrapeConfig() *ScrapeConfig { func NewScrapeConfig() *ScrapeConfig {
return &ScrapeConfig{0, "", false, -1, 0, -1, -1, true, false, false} return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, true, false, false}
} }
func NewScrapeConfigs(selectors []string) []*ScrapeConfig { func NewScrapeConfigs(selectors []string) []*ScrapeConfig {
@@ -167,21 +168,24 @@ func NewChapterFromURL(url, linkName string, configs []*ScrapeConfig, index int,
updateProgressBarName(index, name) updateProgressBarName(index, name)
} }
subchapters := []chapter{} var subchapters []chapter
if len(configs) > 1 { if len(configs) > 1 {
// add subchapters
links, _, _, err := GetLinks(base, config.Selector, config.Limit, config.Offset, false) // retrieve links on page
links, _, _, err := GetLinks(base, config.Selector, config.Limit, config.Offset, config.Reverse, false)
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
subchapters = make([]chapter, len(links)) // init progess bar
var p progress var p progress
if config.Quiet == false { if config.Quiet == false {
p = NewProgress(links, name, config.Depth) p = NewProgress(links, name, config.Depth)
} }
// init chapters list
subchapters = make([]chapter, len(links))
if config.Delay >= 0 { if config.Delay >= 0 {
// synchronous mode // synchronous mode
@@ -277,7 +281,7 @@ func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, q
log.Fatal(err) log.Fatal(err)
} }
links, _, home, err := GetLinks(base, config.Selector, config.Limit, config.Offset, config.Include) links, _, home, err := GetLinks(base, config.Selector, config.Limit, config.Offset, config.Reverse, config.Include)
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }
@@ -370,7 +374,7 @@ func GetPath(elm *goquery.Selection) string {
return join return join
} }
func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) ([]link, string, chapter, error) { func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, include bool) ([]link, string, chapter, error) {
selectorSet := true selectorSet := true
if len(selector) == 0 { if len(selector) == 0 {
selector = "a" selector = "a"
@@ -434,10 +438,18 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool)
home := NewChapterFromURL(url.String(), "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {}) home := NewChapterFromURL(url.String(), "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
// include home page
if include { if include {
l := NewLink(url.String(), home.Name()) l := NewLink(url.String(), home.Name())
links = append([]link{l}, links...) links = append([]link{l}, links...)
} }
// reverse links
if reverse {
for i, j := 0, len(links)-1; i < j; i, j = i+1, j-1 {
links[i], links[j] = links[j], links[i]
}
}
return links, pathMax, home, nil return links, pathMax, home, nil
} }

View File

@@ -182,6 +182,24 @@ func TestSubChaptersLimitOver(t *testing.T) {
} }
// TestReverse builds a chapter from https://books.lapw.at/ using two scrape
// configs, with Reverse enabled on the first one only, and expects the first
// subchapter to be named "The Twelve-Factor App".
// NOTE(review): the expected name appears to depend on the current content of
// the remote site — confirm the test is stable if that page changes.
func TestReverse(t *testing.T) {
// first-level config: the links found on the start page are reversed
config0 := NewScrapeConfig()
config0.Reverse = true
// second-level config: left at its defaults
config1 := NewScrapeConfig()
// index 0 and a no-op callback: progress-name updates are ignored here
c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})
got := c.SubChapters()[0].Name()
want := "The Twelve-Factor App"
if got != want {
t.Errorf("got %v, wanted %v", got, want)
}
}
func TestNotInclude(t *testing.T) { func TestNotInclude(t *testing.T) {
config := NewScrapeConfig() config := NewScrapeConfig()

View File

@@ -25,6 +25,7 @@ type GetOptions struct {
depth int depth int
limit int limit int
offset int offset int
reverse bool
delay int delay int
threads int threads int
// includeUrl bool // includeUrl bool
@@ -49,6 +50,7 @@ func init() {
getCmd.Flags().IntVarP(&getOpts.depth, "depth", "d", 0, "scraping depth") getCmd.Flags().IntVarP(&getOpts.depth, "depth", "d", 0, "scraping depth")
getCmd.Flags().IntVarP(&getOpts.limit, "limit", "l", -1, "limit number of chapters, use with depth/selector") getCmd.Flags().IntVarP(&getOpts.limit, "limit", "l", -1, "limit number of chapters, use with depth/selector")
getCmd.Flags().IntVarP(&getOpts.offset, "offset", "o", 0, "skip first chapters, use with depth/selector") getCmd.Flags().IntVarP(&getOpts.offset, "offset", "o", 0, "skip first chapters, use with depth/selector")
getCmd.Flags().BoolVarP(&getOpts.reverse, "reverse", "r", false, "reverse chapter order")
getCmd.Flags().IntVarP(&getOpts.delay, "delay", "", -1, "time in milliseconds to wait before downloading next chapter, use with depth/selector") getCmd.Flags().IntVarP(&getOpts.delay, "delay", "", -1, "time in milliseconds to wait before downloading next chapter, use with depth/selector")
getCmd.Flags().IntVarP(&getOpts.threads, "threads", "t", -1, "download concurrency, use with depth/selector") getCmd.Flags().IntVarP(&getOpts.threads, "threads", "t", -1, "download concurrency, use with depth/selector")
getCmd.Flags().BoolVarP(&getOpts.include, "include", "i", false, "include URL as first chapter, use with depth/selector") getCmd.Flags().BoolVarP(&getOpts.include, "include", "i", false, "include URL as first chapter, use with depth/selector")
@@ -96,6 +98,10 @@ var getCmd = &cobra.Command{
return errors.New("cannot use offset option if depth/selector is not specified") return errors.New("cannot use offset option if depth/selector is not specified")
} }
if cmd.Flags().Changed("reverse") && getOpts.depth == 0 && len(getOpts.Selector) == 0 {
return errors.New("cannot use reverse option if depth/selector is not specified")
}
if cmd.Flags().Changed("delay") && getOpts.depth == 0 && len(getOpts.Selector) == 0 { if cmd.Flags().Changed("delay") && getOpts.depth == 0 && len(getOpts.Selector) == 0 {
return errors.New("cannot use delay option if depth/selector is not specified") return errors.New("cannot use delay option if depth/selector is not specified")
} }
@@ -104,6 +110,10 @@ var getCmd = &cobra.Command{
return errors.New("cannot use threads option if depth/selector is not specified") return errors.New("cannot use threads option if depth/selector is not specified")
} }
if cmd.Flags().Changed("use-link-name") && getOpts.depth == 0 && len(getOpts.Selector) == 0 {
return errors.New("cannot use use-link-name option if depth/selector is not specified")
}
if cmd.Flags().Changed("delay") && cmd.Flags().Changed("threads") { if cmd.Flags().Changed("delay") && cmd.Flags().Changed("threads") {
return errors.New("cannot use delay and threads options at the same time") return errors.New("cannot use delay and threads options at the same time")
} }
@@ -114,7 +124,8 @@ var getCmd = &cobra.Command{
url := args[0] url := args[0]
// fill selector array with empty selectors to match depth // fill selector array with empty selectors to match depth
for len(getOpts.Selector) < getOpts.depth+2 { getOpts.Selector = append(getOpts.Selector, "")
for len(getOpts.Selector) < getOpts.depth+1 {
getOpts.Selector = append(getOpts.Selector, "") getOpts.Selector = append(getOpts.Selector, "")
} }
@@ -126,6 +137,7 @@ var getCmd = &cobra.Command{
config.Quiet = getOpts.quiet config.Quiet = getOpts.quiet
config.Limit = getOpts.limit config.Limit = getOpts.limit
config.Offset = getOpts.offset config.Offset = getOpts.offset
config.Reverse = getOpts.reverse
config.Delay = getOpts.delay config.Delay = getOpts.delay
config.Threads = getOpts.threads config.Threads = getOpts.threads
config.ImagesOnly = getOpts.images config.ImagesOnly = getOpts.images

View File

@@ -21,6 +21,7 @@ type ListOptions struct {
depth int depth int
limit int limit int
offset int offset int
reverse bool
delay int delay int
threads int threads int
// includeUrl bool // includeUrl bool
@@ -33,10 +34,12 @@ var listOpts *ListOptions
func init() { func init() {
listOpts = &ListOptions{} listOpts = &ListOptions{}
// common with get command
listCmd.Flags().StringSliceVarP(&listOpts.Selector, "selector", "s", []string{}, "table of contents CSS selector") listCmd.Flags().StringSliceVarP(&listOpts.Selector, "selector", "s", []string{}, "table of contents CSS selector")
listCmd.Flags().IntVarP(&listOpts.depth, "depth", "d", 0, "scraping depth") listCmd.Flags().IntVarP(&listOpts.depth, "depth", "d", 0, "scraping depth")
listCmd.Flags().IntVarP(&listOpts.limit, "limit", "l", -1, "limit number of chapters, use with depth/selector") listCmd.Flags().IntVarP(&listOpts.limit, "limit", "l", -1, "limit number of chapters, use with depth/selector")
listCmd.Flags().IntVarP(&listOpts.offset, "offset", "o", 0, "skip first chapters, use with depth/selector") listCmd.Flags().IntVarP(&listOpts.offset, "offset", "o", 0, "skip first chapters, use with depth/selector")
listCmd.Flags().BoolVarP(&listOpts.reverse, "reverse", "r", false, "reverse chapter order")
listCmd.Flags().IntVarP(&listOpts.delay, "delay", "", -1, "time in milliseconds to wait before downloading next chapter, use with depth/selector") listCmd.Flags().IntVarP(&listOpts.delay, "delay", "", -1, "time in milliseconds to wait before downloading next chapter, use with depth/selector")
listCmd.Flags().IntVarP(&listOpts.threads, "threads", "t", -1, "download concurrency, use with depth/selector") listCmd.Flags().IntVarP(&listOpts.threads, "threads", "t", -1, "download concurrency, use with depth/selector")
listCmd.Flags().BoolVarP(&listOpts.include, "include", "i", false, "include URL as first chapter, use with depth/selector") listCmd.Flags().BoolVarP(&listOpts.include, "include", "i", false, "include URL as first chapter, use with depth/selector")
@@ -66,7 +69,7 @@ var listCmd = &cobra.Command{
log.Fatal(err) log.Fatal(err)
} }
links, path, _, err := book.GetLinks(base, listOpts.Selector[0], listOpts.limit, listOpts.offset, listOpts.include) links, path, _, err := book.GetLinks(base, listOpts.Selector[0], listOpts.limit, listOpts.offset, listOpts.reverse, listOpts.include)
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
} }

View File

@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
Use: "version", Use: "version",
Short: "Print the version number of papeer", Short: "Print the version number of papeer",
Run: func(cmd *cobra.Command, args []string) { Run: func(cmd *cobra.Command, args []string) {
fmt.Println("papeer v0.4.0") fmt.Println("papeer v0.4.2")
}, },
} }

View File

@@ -1,34 +1,28 @@
#!/usr/bin/env bash #!/usr/bin/env bash
version=$1
platforms=("linux/amd64" "darwin/amd64" "windows/amd64")
if [ "$#" -ne 1 ]; then if [ "$#" -ne 1 ]; then
echo "Illegal number of parameters" echo "Illegal number of parameters"
echo "Usage: ./release.sh X.X.X" echo "Usage: ./release.sh X.X.X"
exit 1 exit 1
fi fi
version=$1
platforms=("linux/amd64" "darwin/amd64" "windows/amd64")
for platform in "${platforms[@]}" for platform in "${platforms[@]}"
do do
platform_split=(${platform//\// }) platform_split=(${platform//\// })
GOOS=${platform_split[0]} GOOS=${platform_split[0]}
GOARCH=${platform_split[1]} GOARCH=${platform_split[1]}
output_name='papeer-v'$version'-'$GOOS'-'$GOARCH output_name=papeer
if [ $GOOS = "windows" ]; then
output_name+='.exe'
fi
env GOOS=$GOOS GOARCH=$GOARCH go build -o $output_name
if [ $? -ne 0 ]; then
echo 'An error has occurred! Aborting the script execution...'
exit 1
fi
if [ $GOOS = "windows" ]; then if [ $GOOS = "windows" ]; then
zip "$output_name.exe.zip" "$output_name" env GOOS=$GOOS GOARCH=$GOARCH go build -o "$output_name.exe"
zip "$output_name-v$version-$GOOS-$GOARCH.exe.zip" "$output_name.exe"
rm "$output_name.exe"
else else
tar czvf "$output_name.tar.gz" "$output_name" env GOOS=$GOOS GOARCH=$GOARCH go build -o "$output_name"
tar czvf "$output_name-v$version-$GOOS-$GOARCH.tar.gz" "$output_name"
rm "$output_name"
fi fi
rm "$output_name"
done done